# tech/epub-parser/statistics.py

#!/usr/bin/env python3
"""
Скрипт для получения статистики по главам EPUB книги.

Использование:
    python statistics.py <путь_к_epub_файлу>
    python statistics.py book.epub
"""
import sys
import json
import argparse
import re
from pathlib import Path
from typing import Dict, Any, List
from ebooklib import epub

# Импортируем функции из app.py
from app import parse_epub_content, calculate_chapter_tokens


def sanitize_filename(title: str, max_length: int = 100) -> str:
    """Turn a book title into a string that is safe to use as a file name.

    Reserved file-name punctuation and non-printable ASCII control characters
    (NUL included) are dropped, runs of whitespace collapse into a single
    space, and the result is trimmed and cut to ``max_length`` characters.

    Args:
        title: Book title to clean up.
        max_length: Maximum number of characters kept in the result.

    Returns:
        The cleaned title, or ``'book_statistics'`` when the title is empty or
        nothing is left after cleaning.
    """
    fallback = 'book_statistics'
    if not title:
        return fallback

    # Drop the reserved characters < > : " / \ | ? * together with the ASCII
    # control characters that are NOT whitespace (0x00-0x08, 0x0e-0x1b, 0x7f).
    # A NUL byte makes open() raise ValueError and Windows rejects every
    # character below 0x20, so they must not reach the file name.
    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x08\x0e-\x1b\x7f]', '', title)

    # Whitespace control characters (tab, newline, ...) are deliberately left
    # to this step so that they still become a single separating space.
    sanitized = re.sub(r'\s+', ' ', sanitized).strip()

    # Truncation can expose a trailing space, hence the extra rstrip().
    if len(sanitized) > max_length:
        sanitized = sanitized[:max_length].rstrip()

    # Titles made only of stripped characters fall back to the default name.
    return sanitized or fallback


def format_statistics(title: str, author: str, chapters: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build a JSON-serialisable summary of a book and its chapters.

    Args:
        title: Book title, copied into the ``book`` section unchanged.
        author: Book author, copied into the ``book`` section unchanged.
        chapters: Chapter dictionaries. The keys ``text``, ``chapterTitle``,
            ``chapterNumber``, ``footnotesCount``, ``tokenCount``,
            ``chapterId``, ``filePath`` and ``footnotes`` are read, each with
            a fallback value when the key is missing.

    Returns:
        A dictionary with a ``book`` section (title, author and totals) and a
        ``chapters`` list holding one statistics entry per input chapter.
    """
    per_chapter: List[Dict[str, Any]] = []
    chars_sum = 0
    tokens_sum = 0

    for entry in chapters:
        body = entry.get('text', '')
        heading = entry.get('chapterTitle', 'Глава без названия')
        number = entry.get('chapterNumber', 0)
        footnotes_total = entry.get('footnotesCount', 0)
        tokens = entry.get('tokenCount', 0)

        # Character count covers the whole text, whitespace included.
        length = len(body)

        # A zero token count on a non-empty chapter is treated as
        # "not counted yet", so the count is computed here.
        if tokens == 0 and body:
            tokens = calculate_chapter_tokens({
                'text': body,
                'chapterTitle': heading,
                'chapterId': entry.get('chapterId', ''),
                'chapterNumber': number,
                'filePath': entry.get('filePath', ''),
                'footnotes': entry.get('footnotes', []),
            })

        chars_sum += length
        tokens_sum += tokens

        per_chapter.append({
            'chapterNumber': number,
            'chapterTitle': heading,
            'characters': length,
            'tokens': tokens,
            'footnotesCount': footnotes_total,
            'filePath': entry.get('filePath', ''),
        })

    # Book-level totals first, then the per-chapter breakdown.
    return {
        'book': {
            'title': title,
            'author': author,
            'totalChapters': len(chapters),
            'totalCharacters': chars_sum,
            'totalTokens': tokens_sum,
        },
        'chapters': per_chapter,
    }


def main():
    """Command-line entry point: parse an EPUB and emit chapter statistics.

    Reads the EPUB named on the command line, builds the statistics with
    ``format_statistics`` and either prints the JSON to stdout (``--stdout``)
    or writes it to a file (``--output``, or a name derived from the book
    title). Exits with status 1 when the input path does not exist, is not a
    file, or processing raises an exception.
    """
    cli = argparse.ArgumentParser(
        description='Получение статистики по главам EPUB книги',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Примеры использования:
  python statistics.py book.epub
  python statistics.py /path/to/book.epub --output stats.json
  python statistics.py book.epub --stdout
        """
    )
    cli.add_argument('epub_file', type=str, help='Путь к EPUB файлу')
    cli.add_argument(
        '-o', '--output',
        type=str,
        default=None,
        help='Путь к выходному JSON файлу (если не указан, создается файл с названием книги)',
    )
    cli.add_argument(
        '--stdout',
        action='store_true',
        help='Вывести результат в stdout вместо сохранения в файл',
    )
    cli.add_argument(
        '--compact',
        action='store_true',
        help='Сохранить JSON в компактном формате (без отступов)',
    )
    options = cli.parse_args()

    # Validate the input path up front so the user gets a precise message
    # instead of a parser traceback.
    source = Path(options.epub_file)
    problem = None
    if not source.exists():
        problem = f"Ошибка: Файл '{source}' не найден."
    elif not source.is_file():
        problem = f"Ошибка: '{source}' не является файлом."
    if problem is not None:
        print(problem, file=sys.stderr)
        sys.exit(1)

    try:
        # Load the book and extract its chapters.
        book = epub.read_epub(str(source))
        title, author, _metadata, chapters = parse_epub_content(book)

        report = format_statistics(title, author, chapters)

        # --compact drops the indentation; otherwise pretty-print with 2 spaces.
        payload = json.dumps(
            report,
            ensure_ascii=False,
            indent=None if options.compact else 2,
        )

        # --stdout takes precedence over any file destination.
        if options.stdout:
            print(payload)
            return

        # An explicit --output wins; otherwise derive the name from the title.
        if options.output:
            destination = Path(options.output)
        else:
            destination = Path(f"{sanitize_filename(title)}_statistics.json")

        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(payload, encoding='utf-8')
        print(f"Статистика сохранена в файл: {destination}")

    except Exception as e:
        # Top-level boundary: report the failure with a traceback and exit
        # with a non-zero status.
        print(f"Ошибка при обработке EPUB файла: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()