tech/2_валидация_анализа_по_блокам/run_validation_ollama.py

#!/usr/bin/env python3
"""
Валидация анализа главы по блокам через Ollama: framework → insights → application → limitations.
Вход: merge.json (анализ из этапа 1), вход_главы.json (текст главы и метаданные).
Выход: один JSON-файл со статусами по каждому блоку (verdict, score, hallucinations, missing_key_points).
"""

import argparse
import json
import re
import sys
import time
import urllib.request
from pathlib import Path

# Address of the local Ollama server and the model every validation request uses.
OLLAMA_URL = "http://localhost:11434"
MODEL = "qwen3:14b"

# Directory that holds this script; prompt templates are resolved relative to it.
DIR = Path(__file__).resolve().parent
# Default inputs are taken from the sibling stage-1 directory.
DEFAULT_MERGE = DIR.parent.joinpath("1_анализ_главы", "merge.json")
DEFAULT_CHAPTER = DIR.parent.joinpath("1_анализ_главы", "вход_главы.json")

# Generation settings passed verbatim as the "options" field of each request.
OLLAMA_OPTIONS = dict(
    temperature=0.2,
    num_ctx=8500,
    num_predict=2048,
    repeat_penalty=1.1,
)

# One (merge.json key, prompt template file, placeholder name) triple per block,
# listed in the order the blocks are validated.
BLOCKS = [
    (block, f"validate_{block}.txt", f"{block}_json")
    for block in ("framework", "insights", "application", "limitations")
]


def load_json(path: Path) -> dict:
    """Read the UTF-8 encoded file at ``path`` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as source:
        parsed = json.loads(source.read())
    return parsed


def load_prompt(filename: str) -> str:
    """Return the full UTF-8 text of the prompt template ``filename`` found in ``DIR``."""
    template_path = DIR / filename
    return template_path.read_text(encoding="utf-8")


def substitute_prompt(
    prompt: str,
    book_title: str,
    chapter_title: str,
    chapter_text: str,
    block_json: str,
    block_placeholder: str,
) -> str:
    """Fill the chapter fields and the block JSON into a prompt template.

    All placeholders are replaced in a single pass over ``prompt``, so text
    that has just been inserted is never scanned again. A chapter text (or
    title) that happens to contain a token such as ``{framework_json}`` is
    therefore kept verbatim instead of being overwritten by a later
    replacement.

    Args:
        prompt: Template containing ``{book_title}``, ``{chapter_title}``,
            ``{chapter_text}`` and the block placeholder.
        book_title: Replacement for ``{book_title}``.
        chapter_title: Replacement for ``{chapter_title}``.
        chapter_text: Replacement for ``{chapter_text}``.
        block_json: Replacement for ``block_placeholder``.
        block_placeholder: Literal token for the block, e.g. ``{framework_json}``.
            An empty token is ignored.

    Returns:
        The template with every placeholder occurrence substituted.
    """
    replacements = {}
    for token, value in (
        ("{book_title}", book_title),
        ("{chapter_title}", chapter_title),
        ("{chapter_text}", chapter_text),
        (block_placeholder, block_json),
    ):
        # An empty token would match between every character, so it is skipped.
        # setdefault keeps the first mapping if block_placeholder repeats a fixed token.
        if token:
            replacements.setdefault(token, value)

    pattern = re.compile("|".join(re.escape(token) for token in replacements))
    # A callable replacement inserts the value literally (no backslash/group expansion).
    return pattern.sub(lambda found: replacements[found.group(0)], prompt)


def extract_json_from_response(text: str) -> dict:
    """Parse the JSON payload contained in a model reply.

    The whole reply is parsed first. Only when that fails is the first
    Markdown code fence (```json ... ``` or ``` ... ```) extracted and parsed.
    Trying the full text first keeps a valid JSON reply intact even if one of
    its string values itself contains triple backticks.

    Args:
        text: Raw reply text from the model.

    Returns:
        The decoded JSON value. Callers expect a dict, but any JSON value that
        the reply encodes is returned as-is.

    Raises:
        json.JSONDecodeError: If neither the full reply nor a fenced section
            is valid JSON.
    """
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
        if fenced is None:
            # Nothing else to try: surface the original decode error.
            raise
        return json.loads(fenced.group(1).strip())


def call_ollama(prompt: str) -> str:
    """Send ``prompt`` to Ollama's /api/chat endpoint and return the reply text.

    The request is non-streaming, asks for JSON-formatted output and uses the
    module-level ``MODEL`` and ``OLLAMA_OPTIONS``.

    Args:
        prompt: Full user message to send.

    Returns:
        The ``message.content`` string of the response, or ``""`` when the
        response carries no such field.

    Raises:
        RuntimeError: If the server answers with an HTTP error status; the
            message includes the status, the reason and up to 1000 bytes of
            the response body.
        Exception: Any other error raised by ``urlopen`` or by decoding the
            response propagates unchanged.
    """
    # Imported explicitly: the handler below names urllib.error.HTTPError, while the
    # file-level imports only bring in urllib.request.
    import urllib.error

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "format": "json",
        "options": OLLAMA_OPTIONS,
        # NOTE(review): per the Ollama API docs, 0 unloads the model right after
        # the reply, so each block reloads it - confirm this is intended.
        "keep_alive": 0,
    }
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    req = urllib.request.Request(
        f"{OLLAMA_URL}/api/chat",
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        # timeout=None: no socket timeout, so a long generation is not cut off.
        with urllib.request.urlopen(req, timeout=None) as resp:
            data = json.load(resp)
        return data.get("message", {}).get("content", "")
    except urllib.error.HTTPError as e:
        body_b = b""
        if e.fp:
            try:
                body_b = e.fp.read()[:1000]
            except Exception:
                # Best effort only: the body is extra context for the error message.
                pass
        raise RuntimeError(
            f"Ollama HTTP {e.code}: {e.reason}. Body: {body_b.decode('utf-8', errors='replace')}"
        ) from e


def _failed_block(verdict: str, error: str) -> dict:
    """Build the status record for a block that yielded no usable validation result."""
    return {
        "verdict": verdict,
        "score": None,
        "hallucinations": [],
        "missing_key_points": [],
        "error": error,
    }


def main() -> int:
    """Validate every block listed in ``BLOCKS`` in order and write one JSON status file.

    For each block the matching prompt template is filled with the chapter
    fields and the block's JSON, sent to Ollama, and the reply is reduced to
    ``verdict``, ``score``, ``hallucinations`` and ``missing_key_points``.
    A failure on one block (missing block, unreadable prompt, Ollama error,
    unparsable reply) is recorded in that block's entry and the run continues,
    so results of the other blocks are still written.

    Returns:
        0 once the status file has been written (per-block failures are stored
        in the file and do not change the exit code); 1 if an input file is
        missing, unreadable, or does not contain a JSON object.
    """
    parser = argparse.ArgumentParser(
        description="Валидация анализа главы по блокам через Ollama. Выход — JSON со статусами."
    )
    parser.add_argument(
        "--merge",
        type=Path,
        default=DEFAULT_MERGE,
        help=f"Путь к merge.json (по умолчанию: {DEFAULT_MERGE})",
    )
    parser.add_argument(
        "--chapter",
        type=Path,
        default=DEFAULT_CHAPTER,
        help=f"Путь к вход_главы.json (по умолчанию: {DEFAULT_CHAPTER})",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=DIR / "validation_status.json",
        help="Путь к выходному JSON со статусами (по умолчанию: validation_status.json)",
    )
    args = parser.parse_args()

    if not args.merge.is_file():
        print(f"Файл не найден: {args.merge}", file=sys.stderr)
        return 1
    if not args.chapter.is_file():
        print(f"Файл не найден: {args.chapter}", file=sys.stderr)
        return 1

    print("Загрузка merge.json и вход_главы.json...")
    try:
        merge = load_json(args.merge)
        chapter = load_json(args.chapter)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Не удалось прочитать входной JSON: {e}", file=sys.stderr)
        return 1
    for source_path, loaded in ((args.merge, merge), (args.chapter, chapter)):
        if not isinstance(loaded, dict):
            print(f"Ожидался JSON-объект в файле: {source_path}", file=sys.stderr)
            return 1

    # A null or non-string field would make str.replace fail, so coerce to text.
    book_title = str(chapter.get("book_title") or "")
    chapter_title = str(chapter.get("chapter_title") or "")
    chapter_text = str(chapter.get("chapter_text") or "")

    results: dict = {}

    for block_name, prompt_file, json_placeholder in BLOCKS:
        block_data = merge.get(block_name)
        if block_data is None:
            print(f"Блок «{block_name}» отсутствует в merge.json, пропуск.", file=sys.stderr)
            results[block_name] = _failed_block("skipped", "block not found in merge")
            continue

        try:
            prompt_tpl = load_prompt(prompt_file)
        except OSError as e:
            # Record the failure instead of aborting, so finished blocks are not lost.
            print(
                f"Не удалось загрузить промпт «{prompt_file}» для блока «{block_name}»: {e}",
                file=sys.stderr,
            )
            results[block_name] = _failed_block("error", f"prompt load: {e}")
            continue

        block_json_str = json.dumps(block_data, ensure_ascii=False, indent=2)
        placeholder = "{" + json_placeholder + "}"  # {framework_json}, {insights_json}, ...
        prompt = substitute_prompt(
            prompt_tpl,
            book_title,
            chapter_title,
            chapter_text,
            block_json_str,
            placeholder,
        )

        print(f"Валидация блока «{block_name}»...")
        t0 = time.monotonic()
        try:
            raw = call_ollama(prompt)
        except Exception as e:
            # Top-level boundary: any request failure is logged and stored for this block.
            print(f"Ошибка вызова Ollama на блоке «{block_name}»: {e}", file=sys.stderr)
            results[block_name] = _failed_block("error", str(e))
            continue
        elapsed = time.monotonic() - t0
        print(f"  Ответ за {elapsed:.1f} сек ({elapsed / 60:.1f} мин)")

        try:
            block_result = extract_json_from_response(raw)
        except json.JSONDecodeError as e:
            print(
                f"Не удалось распарсить JSON в блоке «{block_name}»: {e}",
                file=sys.stderr,
            )
            print("Первые 500 символов ответа:", raw[:500], file=sys.stderr)
            results[block_name] = _failed_block("error", f"JSON decode: {e}")
            continue

        if not isinstance(block_result, dict):
            results[block_name] = _failed_block(
                "error", f"expected dict, got {type(block_result).__name__}"
            )
            continue

        results[block_name] = {
            "verdict": block_result.get("verdict", "unknown"),
            "score": block_result.get("score"),
            "hallucinations": block_result.get("hallucinations", []),
            "missing_key_points": block_result.get("missing_key_points", []),
        }

    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Записано: {args.output}")
    return 0


if __name__ == "__main__":
    sys.exit(main())