init

2026-02-01 17:01:21 +03:00
commit 9575eaf8ee
144 changed files with 24025 additions and 0 deletions
--- a/epub-parser/statistics.py
+++ b/epub-parser/statistics.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""
+Script that collects per-chapter statistics for an EPUB book.
+
+Usage:
+    python statistics.py <path_to_epub_file>
+    python statistics.py book.epub
+"""
+import sys
+import json
+import argparse
+import re
+from pathlib import Path
+from typing import Dict, Any, List
+from ebooklib import epub
+
+# Import helper functions from app.py
+from app import parse_epub_content, calculate_chapter_tokens
+
+
+def sanitize_filename(title: str, max_length: int = 100) -> str:
+    """Clean a book title so it can be used in a file name.
+    
+    Removes characters that are invalid in file names and caps the length.
+    
+    Args:
+        title: Book title to clean.
+        max_length: Maximum length of the returned name, in characters.
+        
+    Returns:
+        The cleaned name, or 'book_statistics' when nothing usable is left.
+    """
+    if not title:
+        return 'book_statistics'
+    
+    # Remove characters that are invalid in file names: < > : " / \ | ? *
+    # Everything else (letters, digits, spaces, hyphens, underscores, ...) is kept
+    sanitized = re.sub(r'[<>:"/\\|?*]', '', title)
+    
+    # Collapse every run of whitespace into a single space
+    sanitized = re.sub(r'\s+', ' ', sanitized)
+    
+    # Trim leading and trailing whitespace
+    sanitized = sanitized.strip()
+    
+    # Cap the length; rstrip() drops a trailing space exposed by the cut
+    if len(sanitized) > max_length:
+        sanitized = sanitized[:max_length].rstrip()
+    
+    # If nothing is left after cleaning, fall back to the default name
+    if not sanitized:
+        sanitized = 'book_statistics'
+    
+    return sanitized
+
+
+def format_statistics(title: str, author: str, chapters: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Build per-chapter and whole-book statistics.
+    
+    Args:
+        title: Book title.
+        author: Book author.
+        chapters: Chapter dicts; reads 'text', 'chapterTitle', 'chapterNumber', 'tokenCount', etc.
+        
+    Returns:
+        Dict with a 'book' summary and a 'chapters' list of per-chapter entries.
+    """
+    # Per-chapter statistics
+    chapters_stats: List[Dict[str, Any]] = []
+    total_chars = 0
+    total_tokens = 0
+    
+    for chapter in chapters:
+        chapter_text = chapter.get('text', '')
+        chapter_title = chapter.get('chapterTitle', 'Глава без названия')
+        chapter_number = chapter.get('chapterNumber', 0)
+        footnotes_count = chapter.get('footnotesCount', 0)
+        token_count = chapter.get('tokenCount', 0)
+        
+        # Count characters (whitespace included)
+        char_count = len(chapter_text)
+        
+        # If tokenCount is missing or zero and the chapter has text, compute it now
+        if token_count == 0 and chapter_text:
+            chapter_data = {
+                'text': chapter_text,
+                'chapterTitle': chapter_title,
+                'chapterId': chapter.get('chapterId', ''),
+                'chapterNumber': chapter_number,
+                'filePath': chapter.get('filePath', ''),
+                'footnotes': chapter.get('footnotes', [])
+            }
+            token_count = calculate_chapter_tokens(chapter_data)
+        
+        total_chars += char_count
+        total_tokens += token_count
+        
+        chapters_stats.append({
+            'chapterNumber': chapter_number,
+            'chapterTitle': chapter_title,
+            'characters': char_count,
+            'tokens': token_count,
+            'footnotesCount': footnotes_count,
+            'filePath': chapter.get('filePath', '')
+        })
+    
+    # Whole-book totals
+    result: Dict[str, Any] = {
+        'book': {
+            'title': title,
+            'author': author,
+            'totalChapters': len(chapters),
+            'totalCharacters': total_chars,
+            'totalTokens': total_tokens
+        },
+        'chapters': chapters_stats
+    }
+    
+    return result
+
+
+def main():
+    """Script entry point: parse CLI arguments, read the EPUB, emit statistics as JSON."""
+    parser = argparse.ArgumentParser(
+        description='Получение статистики по главам EPUB книги',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Примеры использования:
+  python statistics.py book.epub
+  python statistics.py /path/to/book.epub --output stats.json
+  python statistics.py book.epub --stdout
+        """
+    )
+    parser.add_argument(
+        'epub_file',
+        type=str,
+        help='Путь к EPUB файлу'
+    )
+    parser.add_argument(
+        '-o', '--output',
+        type=str,
+        default=None,
+        help='Путь к выходному JSON файлу (если не указан, создается файл с названием книги)'
+    )
+    parser.add_argument(
+        '--stdout',
+        action='store_true',
+        help='Вывести результат в stdout вместо сохранения в файл'
+    )
+    parser.add_argument(
+        '--compact',
+        action='store_true',
+        help='Сохранить JSON в компактном формате (без отступов)'
+    )
+    
+    args = parser.parse_args()
+    
+    # Make sure the input path exists and is a regular file; exit with code 1 otherwise
+    epub_path = Path(args.epub_file)
+    if not epub_path.exists():
+        print(f"Ошибка: Файл '{epub_path}' не найден.", file=sys.stderr)
+        sys.exit(1)
+    
+    if not epub_path.is_file():
+        print(f"Ошибка: '{epub_path}' не является файлом.", file=sys.stderr)
+        sys.exit(1)
+    
+    try:
+        # Read and parse the EPUB
+        book = epub.read_epub(str(epub_path))
+        title, author, metadata, chapters = parse_epub_content(book)
+        
+        # Build the statistics
+        statistics = format_statistics(title, author, chapters)
+        
+        # Serialize to JSON (compact, or indented by 2 spaces)
+        if args.compact:
+            json_output = json.dumps(statistics, ensure_ascii=False)
+        else:
+            json_output = json.dumps(statistics, ensure_ascii=False, indent=2)
+        
+        # Decide where the output goes
+        if args.stdout:
+            # Print to stdout
+            print(json_output)
+        else:
+            if args.output:
+                # Use the path given on the command line
+                output_path = Path(args.output)
+            else:
+                # Derive the file name from the book title
+                safe_title = sanitize_filename(title)
+                output_path = Path(f"{safe_title}_statistics.json")
+            
+            # Save to the file, creating parent directories if needed
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(json_output)
+            print(f"Статистика сохранена в файл: {output_path}")
+    
+    except Exception as e:  # top-level boundary: report, print traceback, exit 1
+        print(f"Ошибка при обработке EPUB файла: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == '__main__':  # run main() only when executed as a script
+    main()