From 2d781ecfb23355f833046a2193e0841a586acd14 Mon Sep 17 00:00:00 2001
From: Shuvalov Evgeny
Date: Mon, 2 Feb 2026 00:05:24 +0300
Subject: [PATCH] fix parser

---
 docker-compose.yml           |   8 ++
 epub-parser/app.py           | 160 +++++++++++++++++++++++++++---
 epub-parser/requirements.txt |   1 +
 3 files changed, 155 insertions(+), 14 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 7cc1030..70ce0ea 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -74,6 +74,14 @@ services:
     restart: unless-stopped
     ports:
       - "5001:5000"
+    environment:
+      - POSTGRES_HOST=postgres
+      - POSTGRES_PORT=5432
+      - POSTGRES_USER=n8n
+      - POSTGRES_PASSWORD=n8n_password
+      - POSTGRES_DB=n8n
+    depends_on:
+      - postgres
     networks:
       - app-network
     healthcheck:
diff --git a/epub-parser/app.py b/epub-parser/app.py
index 4d860d5..1fcd428 100644
--- a/epub-parser/app.py
+++ b/epub-parser/app.py
@@ -12,13 +12,17 @@ import io
 import os
 import tempfile
 import base64
+import json as json_module
 import re
 import logging
+import uuid as uuid_module
 import chardet
 from datetime import datetime
 from typing import List, Dict, Optional, Tuple, Union, Any
 from pydantic import BaseModel
 
+import psycopg2
+
 # Настройка логирования
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -918,6 +922,115 @@ def parse_epub_content(book: epub.EpubBook) -> Tuple[str, str, Dict[str, Any], L
     return title, author, all_metadata, chapters
 
 
+def _get_postgres_connection() -> Any:
+    """Open a new Postgres connection configured from the environment.
+
+    Reads POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USER, POSTGRES_PASSWORD and
+    POSTGRES_DB. Each value is stripped of surrounding whitespace and falls
+    back to a hard-coded default when the variable is unset.
+
+    Returns:
+        The connection returned by ``psycopg2.connect``; the caller must close it.
+    """
+    host = os.environ.get("POSTGRES_HOST", "localhost").strip()
+    port = os.environ.get("POSTGRES_PORT", "5432").strip()
+    user = os.environ.get("POSTGRES_USER", "n8n").strip()
+    password = os.environ.get("POSTGRES_PASSWORD", "n8n_password").strip()
+    dbname = os.environ.get("POSTGRES_DB", "n8n").strip()
+    return psycopg2.connect(
+        host=host,
+        port=port,
+        user=user,
+        password=password,
+        dbname=dbname,
+    )
+
+
+def save_parse_result_to_postgres(
+    title: str,
+    author: str,
+    metadata: Dict[str, Any],
+    chapters: List[Dict[str, Any]],
+) -> Tuple[str, List[str]]:
+    """Persist a parse result to Postgres (``books`` and ``chapters`` tables).
+
+    The schema is expected to match 8_сохранение_postgres/schema.sql.
+    A fresh UUID is generated for the book and for every chapter.
+
+    Args:
+        title: Book title.
+        author: Book author.
+        metadata: EPUB metadata, serialized to JSON for ``books.metadata``.
+        chapters: Chapter dicts with chapterNumber, chapterTitle and text keys.
+
+    Returns:
+        Tuple ``(book_id_uuid, [chapter_id_uuid, ...])``.
+
+    Raises:
+        RuntimeError: If connecting to Postgres or writing the rows fails.
+    """
+    book_id_uuid = str(uuid_module.uuid4())
+    chapter_ids: List[str] = []
+
+    # Connect inside the try block so that a failed connection is also
+    # reported as RuntimeError, as documented above.
+    conn = None
+    try:
+        conn = _get_postgres_connection()
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                INSERT INTO books (id, title, author, metadata, updated_at)
+                VALUES (%s, %s, %s, %s, NOW())
+                ON CONFLICT (id) DO UPDATE SET
+                    title = COALESCE(EXCLUDED.title, books.title),
+                    author = COALESCE(EXCLUDED.author, books.author),
+                    metadata = COALESCE(EXCLUDED.metadata, books.metadata),
+                    updated_at = NOW()
+                """,
+                (
+                    book_id_uuid,
+                    title or "",
+                    author or "",
+                    json_module.dumps(metadata or {}, ensure_ascii=False),
+                ),
+            )
+            for ch in chapters:
+                chapter_id_uuid = str(uuid_module.uuid4())
+                chapter_ids.append(chapter_id_uuid)
+                cur.execute(
+                    """
+                    INSERT INTO chapters (id, book_id, chapter_number, chapter_title, content)
+                    VALUES (%s, %s, %s, %s, %s)
+                    ON CONFLICT (id) DO UPDATE SET
+                        book_id = EXCLUDED.book_id,
+                        chapter_number = EXCLUDED.chapter_number,
+                        chapter_title = COALESCE(EXCLUDED.chapter_title, chapters.chapter_title),
+                        content = COALESCE(EXCLUDED.content, chapters.content)
+                    """,
+                    (
+                        chapter_id_uuid,
+                        book_id_uuid,
+                        # "or 0" also covers an explicit None chapter number.
+                        int(ch.get("chapterNumber") or 0),
+                        (ch.get("chapterTitle") or ""),
+                        (ch.get("text") or ""),
+                    ),
+                )
+        conn.commit()
+    except Exception as e:
+        # No explicit rollback: closing the connection in ``finally`` discards
+        # the uncommitted transaction, and a rollback on an already broken
+        # connection would raise and hide the RuntimeError below.
+        logger.exception("Ошибка сохранения в Postgres")
+        raise RuntimeError(f"Ошибка сохранения в Postgres: {e}") from e
+    finally:
+        if conn is not None:
+            conn.close()
+
+    return book_id_uuid, chapter_ids
+
+
 class FileDataRequest(BaseModel):
     file_data: str  # base64 encoded
 
@@ -1008,23 +1121,42 @@ async def parse_epub(
 
         # Формируем результат - title и author ВСЕГДА присутствуют
         book_id: str = f"{title}_{int(datetime.now().timestamp() * 1000)}"
-        
-        # Подсчитываем общее количество токенов для всей книги
-        total_tokens = sum(chapter.get('tokenCount', 0) for chapter in chapters)
-        
+        result_chapters: List[Dict[str, Any]] = list(chapters)
+
+        # Save to Postgres (books + chapters) by default
+        saved_to_postgres = False
+        if not error_info and chapters:
+            try:
+                pg_book_id, pg_chapter_ids = save_parse_result_to_postgres(
+                    title, author, metadata, chapters
+                )
+                book_id = pg_book_id
+                # One id is returned per chapter, in order, so pair them up directly.
+                result_chapters = [
+                    {**ch, "chapter_id": chapter_id}
+                    for ch, chapter_id in zip(chapters, pg_chapter_ids)
+                ]
+                saved_to_postgres = True
+            except Exception as e:
+                # Persistence is best-effort: the parsed result is returned regardless.
+                logger.warning("Сохранение в Postgres не удалось: %s", e)
+
+        total_tokens = sum(c.get("tokenCount", 0) for c in result_chapters)
         result: Dict[str, Any] = {
-            'title': title,
-            'author': author,
-            'bookId': book_id,
-            'totalChapters': len(chapters),
-            'totalTokens': total_tokens,
-            'metadata': metadata,
-            'chapters': chapters
+            "title": title,
+            "author": author,
+            "bookId": book_id,
+            "totalChapters": len(result_chapters),
+            "totalTokens": total_tokens,
+            "metadata": metadata,
+            "chapters": result_chapters,
         }
-        
+        if saved_to_postgres:
+            result["book_id"] = book_id
+
         if error_info:
-            result['error'] = error_info
-        
+            result["error"] = error_info
+
         return result
 
     except HTTPException:
diff --git a/epub-parser/requirements.txt b/epub-parser/requirements.txt
index bca3fe2..9269322 100644
--- a/epub-parser/requirements.txt
+++ b/epub-parser/requirements.txt
@@ -6,3 +6,4 @@ lxml==5.1.0
 chardet==5.2.0
 python-multipart==0.0.6
 tiktoken==0.5.2
+psycopg2-binary==2.9.9