fix parser

This commit is contained in:
2026-02-02 00:05:24 +03:00
parent 82f2c9082f
commit 2d781ecfb2
3 changed files with 138 additions and 14 deletions

View File

@@ -74,6 +74,14 @@ services:
restart: unless-stopped
ports:
- "5001:5000"
environment:
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_USER=n8n
- POSTGRES_PASSWORD=n8n_password
- POSTGRES_DB=n8n
depends_on:
- postgres
networks:
- app-network
healthcheck:

View File

@@ -12,13 +12,17 @@ import io
import os
import tempfile
import base64
import json as json_module
import re
import logging
import uuid as uuid_module
import chardet
from datetime import datetime
from typing import List, Dict, Optional, Tuple, Union, Any
from pydantic import BaseModel
import psycopg2
# Logging setup: configure the root logger at INFO level and create the
# module-level logger (named after this module) used throughout the file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -918,6 +922,100 @@ def parse_epub_content(book: epub.EpubBook) -> Tuple[str, str, Dict[str, Any], L
return title, author, all_metadata, chapters
def _get_postgres_connection() -> Any:
    """Open a new Postgres connection configured from environment variables.

    Reads POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USER, POSTGRES_PASSWORD and
    POSTGRES_DB. Each value is stripped of surrounding whitespace; a variable
    that is not set falls back to a built-in default.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        owns it and is responsible for closing it.
    """

    def _setting(variable: str, fallback: str) -> str:
        # Environment value (or the fallback) without surrounding whitespace.
        return os.environ.get(variable, fallback).strip()

    # NOTE(review): the fallback credentials are hard-coded here; presumably
    # intended for local/dev use only -- confirm before relying on them.
    return psycopg2.connect(
        host=_setting("POSTGRES_HOST", "localhost"),
        port=_setting("POSTGRES_PORT", "5432"),
        user=_setting("POSTGRES_USER", "n8n"),
        password=_setting("POSTGRES_PASSWORD", "n8n_password"),
        dbname=_setting("POSTGRES_DB", "n8n"),
    )
def save_parse_result_to_postgres(
    title: str,
    author: str,
    metadata: Dict[str, Any],
    chapters: List[Dict[str, Any]],
) -> Tuple[str, List[str]]:
    """Persist a parse result to Postgres (``books`` + ``chapters`` tables).

    The schema is expected to match 8_сохранение_postgres/schema.sql.
    A fresh UUID4 is generated for the book and for every chapter. All rows
    are written in one transaction that is committed only after every insert
    has succeeded; on any failure the transaction is rolled back.

    Args:
        title: Book title; a falsy value is stored as an empty string.
        author: Book author; a falsy value is stored as an empty string.
        metadata: EPUB metadata, stored JSON-encoded in ``books.metadata``
            (a falsy value is stored as ``{}``).
        chapters: Chapter dicts, read via the keys ``chapterNumber``,
            ``chapterTitle`` and ``text``. Missing or falsy values are stored
            as ``0`` / ``""`` / ``""`` respectively.

    Returns:
        Tuple ``(book_id_uuid, [chapter_id_uuid, ...])``; chapter ids are in
        the same order as ``chapters``.

    Raises:
        RuntimeError: If the connection cannot be opened or any write fails.
            The underlying exception is chained as ``__cause__``.
    """
    book_id_uuid = str(uuid_module.uuid4())
    chapter_ids: List[str] = []
    conn: Any = None
    try:
        # Connect inside the try so connection failures are reported through
        # the same documented RuntimeError as write failures.
        conn = _get_postgres_connection()
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO books (id, title, author, metadata, updated_at)
                VALUES (%s, %s, %s, %s, NOW())
                ON CONFLICT (id) DO UPDATE SET
                    title = COALESCE(EXCLUDED.title, books.title),
                    author = COALESCE(EXCLUDED.author, books.author),
                    metadata = COALESCE(EXCLUDED.metadata, books.metadata),
                    updated_at = NOW()
                """,
                (
                    book_id_uuid,
                    title or "",
                    author or "",
                    json_module.dumps(metadata or {}, ensure_ascii=False),
                ),
            )
            for ch in chapters:
                chapter_id_uuid = str(uuid_module.uuid4())
                chapter_ids.append(chapter_id_uuid)
                cur.execute(
                    """
                    INSERT INTO chapters (id, book_id, chapter_number, chapter_title, content)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (id) DO UPDATE SET
                        book_id = EXCLUDED.book_id,
                        chapter_number = EXCLUDED.chapter_number,
                        chapter_title = COALESCE(EXCLUDED.chapter_title, chapters.chapter_title),
                        content = COALESCE(EXCLUDED.content, chapters.content)
                    """,
                    (
                        chapter_id_uuid,
                        book_id_uuid,
                        # `or 0` also covers an explicit None, mirroring the
                        # `or ""` handling of the title and text below.
                        int(ch.get("chapterNumber") or 0),
                        (ch.get("chapterTitle") or ""),
                        (ch.get("text") or ""),
                    ),
                )
        conn.commit()
    except Exception as e:
        logger.exception("Ошибка сохранения в Postgres")
        if conn is not None:
            try:
                conn.rollback()
            except Exception:
                # A failed rollback (e.g. the connection is already gone)
                # must not replace the original error reported below.
                logger.warning(
                    "Не удалось откатить транзакцию Postgres", exc_info=True
                )
        raise RuntimeError(f"Ошибка сохранения в Postgres: {e}") from e
    finally:
        # conn stays None when the connection attempt itself failed.
        if conn is not None:
            conn.close()
    return book_id_uuid, chapter_ids
class FileDataRequest(BaseModel):
file_data: str # base64 encoded
@@ -1008,23 +1106,40 @@ async def parse_epub(
# Формируем результат - title и author ВСЕГДА присутствуют
book_id: str = f"{title}_{int(datetime.now().timestamp() * 1000)}"
# Подсчитываем общее количество токенов для всей книги
total_tokens = sum(chapter.get('tokenCount', 0) for chapter in chapters)
result_chapters: List[Dict[str, Any]] = list(chapters)
# По умолчанию сохраняем в Postgres (books + chapters)
saved_to_postgres = False
if not error_info and chapters:
try:
pg_book_id, pg_chapter_ids = save_parse_result_to_postgres(
title, author, metadata, chapters
)
book_id = pg_book_id
result_chapters = [
{**ch, "chapter_id": pg_chapter_ids[i] if i < len(pg_chapter_ids) else ""}
for i, ch in enumerate(chapters)
]
saved_to_postgres = True
except Exception as e:
logger.warning("Сохранение в Postgres не удалось: %s", e)
total_tokens = sum(c.get("tokenCount", 0) for c in result_chapters)
result: Dict[str, Any] = {
'title': title,
'author': author,
'bookId': book_id,
'totalChapters': len(chapters),
'totalTokens': total_tokens,
'metadata': metadata,
'chapters': chapters
"title": title,
"author": author,
"bookId": book_id,
"totalChapters": len(result_chapters),
"totalTokens": total_tokens,
"metadata": metadata,
"chapters": result_chapters,
}
if saved_to_postgres:
result["book_id"] = book_id
if error_info:
result['error'] = error_info
result["error"] = error_info
return result
except HTTPException:

View File

@@ -6,3 +6,4 @@ lxml==5.1.0
chardet==5.2.0
python-multipart==0.0.6
tiktoken==0.5.2
psycopg2-binary==2.9.9