fix parser

2026-02-02 00:05:24 +03:00
parent 82f2c9082f
commit 2d781ecfb2
3 changed files with 138 additions and 14 deletions
--- a/epub-parser/app.py
+++ b/epub-parser/app.py
@@ -12,13 +12,17 @@ import io
 import os
 import tempfile
 import base64
+import json as json_module
 import re
 import logging
+import uuid as uuid_module
 import chardet
 from datetime import datetime
 from typing import List, Dict, Optional, Tuple, Union, Any
 from pydantic import BaseModel

+import psycopg2
+
 # Настройка логирования
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -918,6 +922,100 @@ def parse_epub_content(book: epub.EpubBook) -> Tuple[str, str, Dict[str, Any], L
    return title, author, all_metadata, chapters


+def _get_postgres_connection() -> Any:
+    """Open a new Postgres connection configured from environment variables.
+
+    Each value is whitespace-stripped; unset variables use the fallbacks below.
+    """
+    specs = {  # connect() keyword -> (environment variable, fallback value)
+        "host": ("POSTGRES_HOST", "localhost"),
+        "port": ("POSTGRES_PORT", "5432"),
+        "user": ("POSTGRES_USER", "n8n"),
+        "password": ("POSTGRES_PASSWORD", "n8n_password"),
+        "dbname": ("POSTGRES_DB", "n8n"),
+    }
+    settings = {kw: os.environ.get(var, dflt).strip() for kw, (var, dflt) in specs.items()}
+    return psycopg2.connect(**settings)
+
+
+def save_parse_result_to_postgres(
+    title: str,
+    author: str,
+    metadata: Dict[str, Any],
+    chapters: List[Dict[str, Any]],
+) -> Tuple[str, List[str]]:
+    """Persist a parsed book and its chapters to Postgres in one transaction.
+
+    Generates a fresh UUID for the book and for every chapter. The ``books``
+    and ``chapters`` tables are expected to match 8_сохранение_postgres/schema.sql.
+
+    Args:
+        title: Book title; a falsy value is stored as "".
+        author: Book author; a falsy value is stored as "".
+        metadata: EPUB metadata, stored as JSON text in ``books.metadata``.
+        chapters: Chapter dicts with ``chapterNumber``, ``chapterTitle`` and
+            ``text`` keys; a missing or ``None`` number is stored as 0.
+
+    Returns:
+        Tuple ``(book_id_uuid, [chapter_id_uuid, ...])``, ids in input order.
+
+    Raises:
+        RuntimeError: If connecting or writing fails (original error chained).
+    """
+    book_id_uuid = str(uuid_module.uuid4())
+    chapter_ids: List[str] = []
+
+    # Connect inside the try so connection errors also become RuntimeError.
+    conn = None
+    try:
+        conn = _get_postgres_connection()
+        metadata_json = json_module.dumps(metadata or {}, ensure_ascii=False)
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                INSERT INTO books (id, title, author, metadata, updated_at)
+                VALUES (%s, %s, %s, %s, NOW())
+                ON CONFLICT (id) DO UPDATE SET
+                    title = COALESCE(EXCLUDED.title, books.title),
+                    author = COALESCE(EXCLUDED.author, books.author),
+                    metadata = COALESCE(EXCLUDED.metadata, books.metadata),
+                    updated_at = NOW()
+                """,
+                (book_id_uuid, title or "", author or "", metadata_json),
+            )
+            for ch in chapters:
+                chapter_id_uuid = str(uuid_module.uuid4())
+                chapter_ids.append(chapter_id_uuid)
+                chapter_number = int(ch.get("chapterNumber") or 0)  # None-safe, unlike int(None)
+                cur.execute(
+                    """
+                    INSERT INTO chapters (id, book_id, chapter_number, chapter_title, content)
+                    VALUES (%s, %s, %s, %s, %s)
+                    ON CONFLICT (id) DO UPDATE SET
+                        book_id = EXCLUDED.book_id,
+                        chapter_number = EXCLUDED.chapter_number,
+                        chapter_title = COALESCE(EXCLUDED.chapter_title, chapters.chapter_title),
+                        content = COALESCE(EXCLUDED.content, chapters.content)
+                    """,
+                    (chapter_id_uuid, book_id_uuid, chapter_number,
+                     ch.get("chapterTitle") or "", ch.get("text") or ""),
+                )
+        conn.commit()
+    except Exception as e:
+        logger.exception("Ошибка сохранения в Postgres")
+        if conn is not None:
+            try:
+                conn.rollback()
+            except Exception:  # a failed rollback must not mask the original error
+                logger.warning("Не удалось откатить транзакцию Postgres", exc_info=True)
+        raise RuntimeError(f"Ошибка сохранения в Postgres: {e}") from e
+    finally:
+        if conn is not None:
+            conn.close()
+
+    return book_id_uuid, chapter_ids
+
+
 class FileDataRequest(BaseModel):
    file_data: str  # base64 encoded

@@ -1008,23 +1106,40 @@ async def parse_epub(
        
        # Формируем результат - title и author ВСЕГДА присутствуют
        book_id: str = f"{title}_{int(datetime.now().timestamp() * 1000)}"
-        
-        # Подсчитываем общее количество токенов для всей книги
-        total_tokens = sum(chapter.get('tokenCount', 0) for chapter in chapters)
-        
+        result_chapters: List[Dict[str, Any]] = list(chapters)
+
+        # По умолчанию сохраняем в Postgres (books + chapters)
+        saved_to_postgres = False
+        if not error_info and chapters:
+            try:
+                pg_book_id, pg_chapter_ids = save_parse_result_to_postgres(
+                    title, author, metadata, chapters
+                )
+                book_id = pg_book_id
+                result_chapters = [
+                    {**ch, "chapter_id": pg_chapter_ids[i] if i < len(pg_chapter_ids) else ""}
+                    for i, ch in enumerate(chapters)
+                ]
+                saved_to_postgres = True
+            except Exception as e:
+                logger.warning("Сохранение в Postgres не удалось: %s", e)
+
+        total_tokens = sum(c.get("tokenCount", 0) for c in result_chapters)
        result: Dict[str, Any] = {
-            'title': title,
-            'author': author,
-            'bookId': book_id,
-            'totalChapters': len(chapters),
-            'totalTokens': total_tokens,
-            'metadata': metadata,
-            'chapters': chapters
+            "title": title,
+            "author": author,
+            "bookId": book_id,
+            "totalChapters": len(result_chapters),
+            "totalTokens": total_tokens,
+            "metadata": metadata,
+            "chapters": result_chapters,
        }
-        
+        if saved_to_postgres:
+            result["book_id"] = book_id
+
        if error_info:
-            result['error'] = error_info
-        
+            result["error"] = error_info
+
        return result
        
    except HTTPException:
--- a/epub-parser/requirements.txt
+++ b/epub-parser/requirements.txt
@@ -6,3 +6,4 @@ lxml==5.1.0
 chardet==5.2.0
 python-multipart==0.0.6
 tiktoken==0.5.2
+psycopg2-binary==2.9.9