fix parser
This commit is contained in:
@@ -74,6 +74,14 @@ services:
|
|||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
ports:
|
||||||
- "5001:5000"
|
- "5001:5000"
|
||||||
|
environment:
|
||||||
|
- POSTGRES_HOST=postgres
|
||||||
|
- POSTGRES_PORT=5432
|
||||||
|
- POSTGRES_USER=n8n
|
||||||
|
- POSTGRES_PASSWORD=n8n_password
|
||||||
|
- POSTGRES_DB=n8n
|
||||||
|
depends_on:
|
||||||
|
- postgres
|
||||||
networks:
|
networks:
|
||||||
- app-network
|
- app-network
|
||||||
healthcheck:
|
healthcheck:
|
||||||
|
|||||||
@@ -12,13 +12,17 @@ import io
|
|||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
import base64
|
import base64
|
||||||
|
import json as json_module
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
import uuid as uuid_module
|
||||||
import chardet
|
import chardet
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List, Dict, Optional, Tuple, Union, Any
|
from typing import List, Dict, Optional, Tuple, Union, Any
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
|
||||||
# Настройка логирования
|
# Настройка логирования
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -918,6 +922,100 @@ def parse_epub_content(book: epub.EpubBook) -> Tuple[str, str, Dict[str, Any], L
|
|||||||
return title, author, all_metadata, chapters
|
return title, author, all_metadata, chapters
|
||||||
|
|
||||||
|
|
||||||
|
def _get_postgres_connection() -> Any:
    """Open a Postgres connection configured from environment variables.

    Reads POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USER, POSTGRES_PASSWORD and
    POSTGRES_DB. Each value falls back to the default listed below when the
    variable is unset, and surrounding whitespace is stripped from every value.

    Returns:
        The object returned by ``psycopg2.connect`` for those settings.
    """
    # (connect() keyword, environment variable, fallback value)
    env_settings = (
        ("host", "POSTGRES_HOST", "localhost"),
        ("port", "POSTGRES_PORT", "5432"),
        ("user", "POSTGRES_USER", "n8n"),
        ("password", "POSTGRES_PASSWORD", "n8n_password"),
        ("dbname", "POSTGRES_DB", "n8n"),
    )
    connect_kwargs = {
        keyword: os.environ.get(variable, fallback).strip()
        for keyword, variable, fallback in env_settings
    }
    return psycopg2.connect(**connect_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def save_parse_result_to_postgres(
    title: str,
    author: str,
    metadata: Dict[str, Any],
    chapters: List[Dict[str, Any]],
) -> Tuple[str, List[str]]:
    """Save a parse result into the Postgres ``books`` and ``chapters`` tables.

    A fresh UUID is generated for the book and for every chapter. The book row
    and all chapter rows are written on one connection and committed together;
    on any failure the transaction is rolled back.

    Per the original author's note, the table layout is expected to match
    ``8_сохранение_postgres/schema.sql`` (not visible from this function).

    Args:
        title: Book title; a falsy value is stored as an empty string.
        author: Book author; a falsy value is stored as an empty string.
        metadata: EPUB metadata, serialized to JSON for ``books.metadata``.
        chapters: Chapter dicts read via the keys ``chapterNumber``,
            ``chapterTitle`` and ``text``. Missing or ``None`` values are
            stored as ``0`` / ``""`` respectively.

    Returns:
        Tuple ``(book_id_uuid, [chapter_id_uuid, ...])`` with the chapter ids
        in the same order as ``chapters``.

    Raises:
        RuntimeError: If the chapter data is malformed, the connection cannot
            be opened, or any statement/commit fails. The original exception
            is chained as ``__cause__``.
    """
    book_id_uuid = str(uuid_module.uuid4())
    chapter_rows: List[Tuple[str, str, int, str, str]] = []

    conn = None
    try:
        # Build every row before touching the database so malformed chapter
        # data is rejected without opening a connection or a transaction.
        # ``or 0`` also covers an explicit None, which int() would reject.
        chapter_rows = [
            (
                str(uuid_module.uuid4()),
                book_id_uuid,
                int(ch.get("chapterNumber") or 0),
                ch.get("chapterTitle") or "",
                ch.get("text") or "",
            )
            for ch in (chapters or [])
        ]
        book_row = (
            book_id_uuid,
            title or "",
            author or "",
            json_module.dumps(metadata or {}, ensure_ascii=False),
        )

        # Opened inside the try so connection failures are also reported as
        # RuntimeError, as the docstring promises.
        conn = _get_postgres_connection()
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO books (id, title, author, metadata, updated_at)
                VALUES (%s, %s, %s, %s, NOW())
                ON CONFLICT (id) DO UPDATE SET
                title = COALESCE(EXCLUDED.title, books.title),
                author = COALESCE(EXCLUDED.author, books.author),
                metadata = COALESCE(EXCLUDED.metadata, books.metadata),
                updated_at = NOW()
                """,
                book_row,
            )
            for row in chapter_rows:
                cur.execute(
                    """
                    INSERT INTO chapters (id, book_id, chapter_number, chapter_title, content)
                    VALUES (%s, %s, %s, %s, %s)
                    ON CONFLICT (id) DO UPDATE SET
                    book_id = EXCLUDED.book_id,
                    chapter_number = EXCLUDED.chapter_number,
                    chapter_title = COALESCE(EXCLUDED.chapter_title, chapters.chapter_title),
                    content = COALESCE(EXCLUDED.content, chapters.content)
                    """,
                    row,
                )
        conn.commit()
    except Exception as e:
        if conn is not None:
            try:
                conn.rollback()
            except Exception:
                # A failed rollback (e.g. the connection is already gone)
                # must not hide the error that brought us here.
                logger.warning("Не удалось откатить транзакцию Postgres")
        logger.exception("Ошибка сохранения в Postgres")
        raise RuntimeError(f"Ошибка сохранения в Postgres: {e}") from e
    finally:
        if conn is not None:
            conn.close()

    return book_id_uuid, [row[0] for row in chapter_rows]
|
||||||
|
|
||||||
|
|
||||||
class FileDataRequest(BaseModel):
|
class FileDataRequest(BaseModel):
|
||||||
file_data: str # base64 encoded
|
file_data: str # base64 encoded
|
||||||
|
|
||||||
@@ -1008,22 +1106,39 @@ async def parse_epub(
|
|||||||
|
|
||||||
# Формируем результат - title и author ВСЕГДА присутствуют
|
# Формируем результат - title и author ВСЕГДА присутствуют
|
||||||
book_id: str = f"{title}_{int(datetime.now().timestamp() * 1000)}"
|
book_id: str = f"{title}_{int(datetime.now().timestamp() * 1000)}"
|
||||||
|
result_chapters: List[Dict[str, Any]] = list(chapters)
|
||||||
|
|
||||||
# Подсчитываем общее количество токенов для всей книги
|
# По умолчанию сохраняем в Postgres (books + chapters)
|
||||||
total_tokens = sum(chapter.get('tokenCount', 0) for chapter in chapters)
|
saved_to_postgres = False
|
||||||
|
if not error_info and chapters:
|
||||||
|
try:
|
||||||
|
pg_book_id, pg_chapter_ids = save_parse_result_to_postgres(
|
||||||
|
title, author, metadata, chapters
|
||||||
|
)
|
||||||
|
book_id = pg_book_id
|
||||||
|
result_chapters = [
|
||||||
|
{**ch, "chapter_id": pg_chapter_ids[i] if i < len(pg_chapter_ids) else ""}
|
||||||
|
for i, ch in enumerate(chapters)
|
||||||
|
]
|
||||||
|
saved_to_postgres = True
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Сохранение в Postgres не удалось: %s", e)
|
||||||
|
|
||||||
|
total_tokens = sum(c.get("tokenCount", 0) for c in result_chapters)
|
||||||
result: Dict[str, Any] = {
|
result: Dict[str, Any] = {
|
||||||
'title': title,
|
"title": title,
|
||||||
'author': author,
|
"author": author,
|
||||||
'bookId': book_id,
|
"bookId": book_id,
|
||||||
'totalChapters': len(chapters),
|
"totalChapters": len(result_chapters),
|
||||||
'totalTokens': total_tokens,
|
"totalTokens": total_tokens,
|
||||||
'metadata': metadata,
|
"metadata": metadata,
|
||||||
'chapters': chapters
|
"chapters": result_chapters,
|
||||||
}
|
}
|
||||||
|
if saved_to_postgres:
|
||||||
|
result["book_id"] = book_id
|
||||||
|
|
||||||
if error_info:
|
if error_info:
|
||||||
result['error'] = error_info
|
result["error"] = error_info
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -6,3 +6,4 @@ lxml==5.1.0
|
|||||||
chardet==5.2.0
|
chardet==5.2.0
|
||||||
python-multipart==0.0.6
|
python-multipart==0.0.6
|
||||||
tiktoken==0.5.2
|
tiktoken==0.5.2
|
||||||
|
psycopg2-binary==2.9.9
|
||||||
|
|||||||
Reference in New Issue
Block a user