fix parser

This commit is contained in:
2026-02-02 00:05:24 +03:00
parent 82f2c9082f
commit 2d781ecfb2
3 changed files with 138 additions and 14 deletions

View File

@@ -74,6 +74,14 @@ services:
restart: unless-stopped restart: unless-stopped
ports: ports:
- "5001:5000" - "5001:5000"
environment:
- POSTGRES_HOST=postgres
- POSTGRES_PORT=5432
- POSTGRES_USER=n8n
- POSTGRES_PASSWORD=n8n_password
- POSTGRES_DB=n8n
depends_on:
- postgres
networks: networks:
- app-network - app-network
healthcheck: healthcheck:

View File

@@ -12,13 +12,17 @@ import io
import os import os
import tempfile import tempfile
import base64 import base64
import json as json_module
import re import re
import logging import logging
import uuid as uuid_module
import chardet import chardet
from datetime import datetime from datetime import datetime
from typing import List, Dict, Optional, Tuple, Union, Any from typing import List, Dict, Optional, Tuple, Union, Any
from pydantic import BaseModel from pydantic import BaseModel
import psycopg2
# Настройка логирования # Настройка логирования
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -918,6 +922,100 @@ def parse_epub_content(book: epub.EpubBook) -> Tuple[str, str, Dict[str, Any], L
return title, author, all_metadata, chapters return title, author, all_metadata, chapters
def _get_postgres_connection() -> Any:
    """Open a new Postgres connection configured from environment variables.

    Each setting is read from its ``POSTGRES_*`` variable, falls back to a
    built-in default when the variable is unset, and has surrounding
    whitespace stripped before being handed to ``psycopg2.connect``.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # (connect keyword, environment variable, fallback value)
    settings = (
        ("host", "POSTGRES_HOST", "localhost"),
        ("port", "POSTGRES_PORT", "5432"),
        ("user", "POSTGRES_USER", "n8n"),
        ("password", "POSTGRES_PASSWORD", "n8n_password"),
        ("dbname", "POSTGRES_DB", "n8n"),
    )
    connect_kwargs = {
        keyword: os.environ.get(env_name, fallback).strip()
        for keyword, env_name, fallback in settings
    }
    return psycopg2.connect(**connect_kwargs)
def save_parse_result_to_postgres(
    title: str,
    author: str,
    metadata: Dict[str, Any],
    chapters: List[Dict[str, Any]],
) -> Tuple[str, List[str]]:
    """Persist a parse result to Postgres (``books`` + ``chapters`` tables).

    The target schema is the one described in 8_сохранение_postgres/schema.sql.
    A fresh UUID is generated for the book and for every chapter. The book
    row and all chapter rows are written in a single transaction: either
    everything is committed or nothing is.

    Args:
        title: Book title; a falsy value is stored as an empty string.
        author: Book author; a falsy value is stored as an empty string.
        metadata: EPUB metadata, serialized to JSON for ``books.metadata``;
            a falsy value is stored as ``{}``.
        chapters: Chapter dicts providing ``chapterNumber``, ``chapterTitle``
            and ``text``. A missing ``chapterNumber`` is stored as 0; a
            missing or falsy title/text is stored as an empty string.

    Returns:
        Tuple ``(book_id_uuid, [chapter_id_uuid, ...])`` with the chapter ids
        in the same order as ``chapters``.

    Raises:
        RuntimeError: If the connection cannot be opened or any step of the
            write fails. The underlying exception is chained as ``__cause__``.
    """
    book_id_uuid = str(uuid_module.uuid4())
    chapter_ids: List[str] = []
    conn = None
    try:
        # Connect inside the try so that connection failures are reported
        # through the same RuntimeError contract as write failures.
        conn = _get_postgres_connection()
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO books (id, title, author, metadata, updated_at)
                VALUES (%s, %s, %s, %s, NOW())
                ON CONFLICT (id) DO UPDATE SET
                title = COALESCE(EXCLUDED.title, books.title),
                author = COALESCE(EXCLUDED.author, books.author),
                metadata = COALESCE(EXCLUDED.metadata, books.metadata),
                updated_at = NOW()
                """,
                (
                    book_id_uuid,
                    title or "",
                    author or "",
                    json_module.dumps(metadata or {}, ensure_ascii=False),
                ),
            )
            for ch in chapters:
                chapter_id_uuid = str(uuid_module.uuid4())
                chapter_ids.append(chapter_id_uuid)
                cur.execute(
                    """
                    INSERT INTO chapters (id, book_id, chapter_number, chapter_title, content)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (id) DO UPDATE SET
                    book_id = EXCLUDED.book_id,
                    chapter_number = EXCLUDED.chapter_number,
                    chapter_title = COALESCE(EXCLUDED.chapter_title, chapters.chapter_title),
                    content = COALESCE(EXCLUDED.content, chapters.content)
                    """,
                    (
                        chapter_id_uuid,
                        book_id_uuid,
                        int(ch.get("chapterNumber", 0)),
                        (ch.get("chapterTitle") or ""),
                        (ch.get("text") or ""),
                    ),
                )
        conn.commit()
    except Exception as e:
        if conn is not None:
            # A rollback on a broken connection can itself raise; it must not
            # replace the original error or skip the logging below.
            try:
                conn.rollback()
            except Exception:
                logger.warning(
                    "Откат транзакции Postgres не удался", exc_info=True
                )
        logger.exception("Ошибка сохранения в Postgres")
        raise RuntimeError(f"Ошибка сохранения в Postgres: {e}") from e
    finally:
        # conn stays None when the connection attempt itself failed.
        if conn is not None:
            conn.close()
    return book_id_uuid, chapter_ids
class FileDataRequest(BaseModel): class FileDataRequest(BaseModel):
file_data: str # base64 encoded file_data: str # base64 encoded
@@ -1008,23 +1106,40 @@ async def parse_epub(
# Формируем результат - title и author ВСЕГДА присутствуют # Формируем результат - title и author ВСЕГДА присутствуют
book_id: str = f"{title}_{int(datetime.now().timestamp() * 1000)}" book_id: str = f"{title}_{int(datetime.now().timestamp() * 1000)}"
result_chapters: List[Dict[str, Any]] = list(chapters)
# Подсчитываем общее количество токенов для всей книги
total_tokens = sum(chapter.get('tokenCount', 0) for chapter in chapters) # По умолчанию сохраняем в Postgres (books + chapters)
saved_to_postgres = False
if not error_info and chapters:
try:
pg_book_id, pg_chapter_ids = save_parse_result_to_postgres(
title, author, metadata, chapters
)
book_id = pg_book_id
result_chapters = [
{**ch, "chapter_id": pg_chapter_ids[i] if i < len(pg_chapter_ids) else ""}
for i, ch in enumerate(chapters)
]
saved_to_postgres = True
except Exception as e:
logger.warning("Сохранение в Postgres не удалось: %s", e)
total_tokens = sum(c.get("tokenCount", 0) for c in result_chapters)
result: Dict[str, Any] = { result: Dict[str, Any] = {
'title': title, "title": title,
'author': author, "author": author,
'bookId': book_id, "bookId": book_id,
'totalChapters': len(chapters), "totalChapters": len(result_chapters),
'totalTokens': total_tokens, "totalTokens": total_tokens,
'metadata': metadata, "metadata": metadata,
'chapters': chapters "chapters": result_chapters,
} }
if saved_to_postgres:
result["book_id"] = book_id
if error_info: if error_info:
result['error'] = error_info result["error"] = error_info
return result return result
except HTTPException: except HTTPException:

View File

@@ -6,3 +6,4 @@ lxml==5.1.0
chardet==5.2.0 chardet==5.2.0
python-multipart==0.0.6 python-multipart==0.0.6
tiktoken==0.5.2 tiktoken==0.5.2
psycopg2-binary==2.9.9