fix parser
This commit is contained in:
@@ -12,13 +12,17 @@ import io
|
||||
import os
|
||||
import tempfile
|
||||
import base64
|
||||
import json as json_module
|
||||
import re
|
||||
import logging
|
||||
import uuid as uuid_module
|
||||
import chardet
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional, Tuple, Union, Any
|
||||
from pydantic import BaseModel
|
||||
|
||||
import psycopg2
|
||||
|
||||
# Настройка логирования
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -918,6 +922,100 @@ def parse_epub_content(book: epub.EpubBook) -> Tuple[str, str, Dict[str, Any], L
|
||||
return title, author, all_metadata, chapters
|
||||
|
||||
|
||||
def _get_postgres_connection() -> Any:
    """Open a new Postgres connection configured from environment variables.

    Reads POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USER, POSTGRES_PASSWORD and
    POSTGRES_DB, using the fallbacks listed below when a variable is unset.
    Every value is stripped of surrounding whitespace before use.

    POSTGRES_CONNECT_TIMEOUT (seconds, default "10") bounds the connection
    attempt so that an unreachable database fails promptly instead of
    blocking the caller for the operating system's TCP timeout.

    Returns:
        The connection object returned by ``psycopg2.connect``. This function
        does not close it; that is left to the caller.

    Raises:
        Whatever ``psycopg2.connect`` raises when the connection cannot be
        established.
    """

    def _env(name: str, default: str) -> str:
        # Unset variables fall back to ``default``; set-but-padded values are
        # trimmed (stray whitespace is common in .env / compose files).
        return os.environ.get(name, default).strip()

    # NOTE(review): the fallback credentials look like development defaults —
    # confirm that real deployments always provide POSTGRES_* explicitly.
    return psycopg2.connect(
        host=_env("POSTGRES_HOST", "localhost"),
        port=_env("POSTGRES_PORT", "5432"),
        user=_env("POSTGRES_USER", "n8n"),
        password=_env("POSTGRES_PASSWORD", "n8n_password"),
        dbname=_env("POSTGRES_DB", "n8n"),
        connect_timeout=_env("POSTGRES_CONNECT_TIMEOUT", "10"),
    )
|
||||
|
||||
|
||||
def save_parse_result_to_postgres(
    title: str,
    author: str,
    metadata: Dict[str, Any],
    chapters: List[Dict[str, Any]],
) -> Tuple[str, List[str]]:
    """Persist a parsed book and its chapters to Postgres in one transaction.

    A fresh UUID is generated for the book and for every chapter. The book row
    goes to ``books`` and one row per chapter goes to ``chapters``; both are
    committed together, and everything is rolled back if any statement fails.
    The table layout is expected to match 8_сохранение_postgres/schema.sql.

    Args:
        title: Book title; a falsy value is stored as an empty string.
        author: Book author; a falsy value is stored as an empty string.
        metadata: EPUB metadata, serialized to JSON for ``books.metadata``;
            a falsy value is stored as ``{}``.
        chapters: Chapter dicts. The keys ``chapterNumber``, ``chapterTitle``
            and ``text`` are read; missing or ``None`` values are stored as
            ``0`` / ``""`` / ``""`` respectively.

    Returns:
        A tuple ``(book_id, chapter_ids)`` of the generated UUID strings, with
        ``chapter_ids`` in the same order as ``chapters``.

    Raises:
        RuntimeError: If the connection cannot be opened, the input cannot be
            prepared (e.g. non-serializable metadata), or any write fails.
            The original exception is chained as ``__cause__``.
    """
    book_id_uuid = str(uuid_module.uuid4())
    chapter_ids: List[str] = []

    # ``conn`` stays None until the connection succeeds so that the handlers
    # below know whether there is anything to roll back or close.
    conn = None
    try:
        conn = _get_postgres_connection()

        chapter_rows = []
        for ch in chapters or []:
            chapter_id_uuid = str(uuid_module.uuid4())
            chapter_ids.append(chapter_id_uuid)
            chapter_rows.append(
                (
                    chapter_id_uuid,
                    book_id_uuid,
                    # ``or 0`` also covers an explicit None, which int() rejects.
                    int(ch.get("chapterNumber") or 0),
                    ch.get("chapterTitle") or "",
                    ch.get("text") or "",
                )
            )

        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO books (id, title, author, metadata, updated_at)
                VALUES (%s, %s, %s, %s, NOW())
                ON CONFLICT (id) DO UPDATE SET
                    title = COALESCE(EXCLUDED.title, books.title),
                    author = COALESCE(EXCLUDED.author, books.author),
                    metadata = COALESCE(EXCLUDED.metadata, books.metadata),
                    updated_at = NOW()
                """,
                (
                    book_id_uuid,
                    title or "",
                    author or "",
                    json_module.dumps(metadata or {}, ensure_ascii=False),
                ),
            )
            cur.executemany(
                """
                INSERT INTO chapters (id, book_id, chapter_number, chapter_title, content)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (id) DO UPDATE SET
                    book_id = EXCLUDED.book_id,
                    chapter_number = EXCLUDED.chapter_number,
                    chapter_title = COALESCE(EXCLUDED.chapter_title, chapters.chapter_title),
                    content = COALESCE(EXCLUDED.content, chapters.content)
                """,
                chapter_rows,
            )
        conn.commit()
    except Exception as e:
        logger.exception("Ошибка сохранения в Postgres")
        if conn is not None:
            try:
                conn.rollback()
            except Exception as rollback_error:
                # A failed rollback (e.g. dropped connection) must not replace
                # the original error reported to the caller.
                logger.warning(
                    "Откат транзакции Postgres не удался: %s", rollback_error
                )
        raise RuntimeError(f"Ошибка сохранения в Postgres: {e}") from e
    finally:
        if conn is not None:
            conn.close()

    return book_id_uuid, chapter_ids
|
||||
|
||||
|
||||
class FileDataRequest(BaseModel):
|
||||
file_data: str # base64 encoded
|
||||
|
||||
@@ -1008,23 +1106,40 @@ async def parse_epub(
|
||||
|
||||
# Формируем результат - title и author ВСЕГДА присутствуют
|
||||
book_id: str = f"{title}_{int(datetime.now().timestamp() * 1000)}"
|
||||
|
||||
# Подсчитываем общее количество токенов для всей книги
|
||||
total_tokens = sum(chapter.get('tokenCount', 0) for chapter in chapters)
|
||||
|
||||
result_chapters: List[Dict[str, Any]] = list(chapters)
|
||||
|
||||
# По умолчанию сохраняем в Postgres (books + chapters)
|
||||
saved_to_postgres = False
|
||||
if not error_info and chapters:
|
||||
try:
|
||||
pg_book_id, pg_chapter_ids = save_parse_result_to_postgres(
|
||||
title, author, metadata, chapters
|
||||
)
|
||||
book_id = pg_book_id
|
||||
result_chapters = [
|
||||
{**ch, "chapter_id": pg_chapter_ids[i] if i < len(pg_chapter_ids) else ""}
|
||||
for i, ch in enumerate(chapters)
|
||||
]
|
||||
saved_to_postgres = True
|
||||
except Exception as e:
|
||||
logger.warning("Сохранение в Postgres не удалось: %s", e)
|
||||
|
||||
total_tokens = sum(c.get("tokenCount", 0) for c in result_chapters)
|
||||
result: Dict[str, Any] = {
|
||||
'title': title,
|
||||
'author': author,
|
||||
'bookId': book_id,
|
||||
'totalChapters': len(chapters),
|
||||
'totalTokens': total_tokens,
|
||||
'metadata': metadata,
|
||||
'chapters': chapters
|
||||
"title": title,
|
||||
"author": author,
|
||||
"bookId": book_id,
|
||||
"totalChapters": len(result_chapters),
|
||||
"totalTokens": total_tokens,
|
||||
"metadata": metadata,
|
||||
"chapters": result_chapters,
|
||||
}
|
||||
|
||||
if saved_to_postgres:
|
||||
result["book_id"] = book_id
|
||||
|
||||
if error_info:
|
||||
result['error'] = error_info
|
||||
|
||||
result["error"] = error_info
|
||||
|
||||
return result
|
||||
|
||||
except HTTPException:
|
||||
|
||||
@@ -6,3 +6,4 @@ lxml==5.1.0
|
||||
chardet==5.2.0
|
||||
python-multipart==0.0.6
|
||||
tiktoken==0.5.2
|
||||
psycopg2-binary==2.9.9
|
||||
|
||||
Reference in New Issue
Block a user