Files
tech/2_валидация_анализа_по_блокам/run_validation_ollama.py
2026-02-01 22:02:49 +03:00

237 lines
8.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Валидация анализа главы по блокам через Ollama: framework → insights → application → limitations.
Вход: merge.json (анализ из этапа 1), вход_главы.json (текст главы и метаданные).
Выход: один JSON-файл со статусами по каждому блоку (verdict, score, hallucinations, missing_key_points).
"""
import argparse
import json
import re
import sys
import time
import urllib.request
from pathlib import Path
# Base URL of the Ollama HTTP server; call_ollama() appends "/api/chat" to it.
OLLAMA_URL = "http://localhost:11434"
# Model tag sent in the "model" field of every chat request.
MODEL = "qwen3:14b"

# Directory that contains this script; prompt templates are read from here.
DIR = Path(__file__).resolve().parent
# Default inputs come from the sibling stage-1 directory.
DEFAULT_MERGE = DIR.parent.joinpath("1_анализ_главы", "merge.json")
DEFAULT_CHAPTER = DIR.parent.joinpath("1_анализ_главы", "вход_главы.json")

# Forwarded unchanged as the "options" field of the chat request.
OLLAMA_OPTIONS = dict(
    temperature=0.2,
    num_ctx=8500,
    num_predict=2048,
    repeat_penalty=1.1,
)

# One (merge.json key, prompt template file, placeholder name) triple per
# block, listed in the order the blocks are validated.
BLOCKS = [
    (block, f"validate_{block}.txt", f"{block}_json")
    for block in ("framework", "insights", "application", "limitations")
]
def load_json(path: Path) -> dict:
    """Read the UTF-8 encoded file at ``path`` and return its parsed JSON content."""
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)
def load_prompt(filename: str) -> str:
    """Return the full text of the prompt template ``filename`` located in ``DIR``."""
    template_path = DIR / filename
    return template_path.read_text(encoding="utf-8")
def substitute_prompt(
    prompt: str,
    book_title: str,
    chapter_title: str,
    chapter_text: str,
    block_json: str,
    block_placeholder: str,
) -> str:
    """Fill a prompt template with the chapter fields and the block JSON.

    Args:
        prompt: Template text containing the placeholders.
        book_title: Replacement for ``{book_title}``.
        chapter_title: Replacement for ``{chapter_title}``.
        chapter_text: Replacement for ``{chapter_text}``.
        block_json: Replacement for ``block_placeholder``.
        block_placeholder: Literal placeholder of the block, braces included
            (for example ``{framework_json}``). An empty string is ignored.

    Returns:
        The template with every placeholder occurrence replaced.

    All placeholders are substituted in a single pass over the template, so
    text that is inserted is never re-scanned. A chapter that happens to
    contain ``{framework_json}`` (or a title that contains ``{chapter_text}``)
    is therefore inserted verbatim instead of being expanded a second time.
    """
    # The block placeholder goes in first so that the three fixed placeholders
    # win if the caller passes one of them as block_placeholder.
    replacements = {block_placeholder: block_json} if block_placeholder else {}
    replacements.update(
        {
            "{book_title}": book_title,
            "{chapter_title}": chapter_title,
            "{chapter_text}": chapter_text,
        }
    )
    # Longest token first, so a placeholder that is a prefix of another one
    # cannot shadow it in the alternation.
    tokens = sorted(replacements, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(token) for token in tokens))
    # A function replacement keeps backslashes in the values literal.
    return pattern.sub(lambda found: replacements[found.group(0)], prompt)
def extract_json_from_response(text: str) -> dict:
    """Extract the JSON payload from a model reply.

    The reply is tried in this order:

    1. If it contains a fenced block (three backticks, optionally tagged
       ``json``), the content of the first fence is parsed; otherwise the
       whole stripped reply is parsed.
    2. If that fails, the reply is scanned left to right for the first ``{``
       from which a complete JSON object can be decoded. This covers replies
       that wrap the object in prose or reasoning text without a fence.

    Args:
        text: Raw reply content returned by the model.

    Returns:
        The decoded JSON value. Step 1 returns whatever the JSON encodes, so
        the result is not guaranteed to be a dict; callers should check.

    Raises:
        json.JSONDecodeError: If neither step yields valid JSON. The error
            from step 1 is the one that propagates.
    """
    text = text.strip()
    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
    candidate = fenced.group(1).strip() if fenced else text
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode tolerates trailing text after the object.
                parsed, _end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                start = text.find("{", start + 1)
            else:
                return parsed
        # Nothing decodable anywhere: surface the original parse error.
        raise
def call_ollama(prompt: str) -> str:
    """Send ``prompt`` as a single user message to Ollama's ``/api/chat`` endpoint.

    Args:
        prompt: Complete prompt text for the model.

    Returns:
        The ``message.content`` string of the decoded reply, or ``""`` when
        the reply has no ``message`` or no ``content`` key.

    Raises:
        RuntimeError: On an HTTP error status; the message carries the status
            code, the reason and up to 1000 bytes of the response body.
        Other exceptions raised by ``urlopen`` or by JSON decoding of the
        reply are not caught here.
    """
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "format": "json",
        "options": OLLAMA_OPTIONS,
        # NOTE(review): presumably tells Ollama to unload the model right
        # after the reply; confirm against the Ollama API documentation.
        "keep_alive": 0,
    }
    request = urllib.request.Request(
        f"{OLLAMA_URL}/api/chat",
        data=json.dumps(payload, ensure_ascii=False).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        # timeout=None: no explicit timeout is set for the request.
        with urllib.request.urlopen(request, timeout=None) as response:
            reply = json.load(response)
    # NOTE(review): urllib.error is never imported explicitly in this file; it
    # appears to be reachable only as a side effect of importing urllib.request.
    except urllib.error.HTTPError as e:
        snippet = b""
        if e.fp:
            try:
                snippet = e.fp.read()[:1000]
            except Exception:
                # Best effort: the body is only extra context for the message.
                pass
        raise RuntimeError(
            f"Ollama HTTP {e.code}: {e.reason}. Body: {snippet.decode('utf-8', errors='replace')}"
        ) from e
    return reply.get("message", {}).get("content", "")
def _failed_block(verdict: str, error: str) -> dict:
    """Build the status entry for a block that produced no usable model verdict."""
    return {
        "verdict": verdict,
        "score": None,
        "hallucinations": [],
        "missing_key_points": [],
        "error": error,
    }


def main() -> int:
    """Validate the blocks listed in ``BLOCKS`` one by one and write one status JSON.

    Command-line options:
        --merge: path to merge.json (default ``DEFAULT_MERGE``).
        --chapter: path to the chapter input JSON (default ``DEFAULT_CHAPTER``).
        -o / --output: path of the status file (default
            ``DIR / "validation_status.json"``).

    A failure confined to one block (block missing in merge.json, unreadable
    prompt template, Ollama call error, undecodable or non-object reply) is
    recorded in that block's entry and the loop moves on, so results already
    obtained for other blocks are never lost.

    Returns:
        0 once the status file has been written, even if individual blocks
        ended as "error" or "skipped"; 1 if an input file is missing,
        unreadable, or does not contain a JSON object.
    """
    parser = argparse.ArgumentParser(
        description="Валидация анализа главы по блокам через Ollama. Выход — JSON со статусами."
    )
    parser.add_argument(
        "--merge",
        type=Path,
        default=DEFAULT_MERGE,
        help=f"Путь к merge.json (по умолчанию: {DEFAULT_MERGE})",
    )
    parser.add_argument(
        "--chapter",
        type=Path,
        default=DEFAULT_CHAPTER,
        help=f"Путь к вход_главы.json (по умолчанию: {DEFAULT_CHAPTER})",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=DIR / "validation_status.json",
        help="Путь к выходному JSON со статусами (по умолчанию: validation_status.json)",
    )
    args = parser.parse_args()

    for input_path in (args.merge, args.chapter):
        if not input_path.is_file():
            print(f"Файл не найден: {input_path}", file=sys.stderr)
            return 1

    print("Загрузка merge.json и вход_главы.json...")
    try:
        merge = load_json(args.merge)
        chapter = load_json(args.chapter)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Не удалось прочитать входной JSON: {e}", file=sys.stderr)
        return 1
    if not isinstance(merge, dict) or not isinstance(chapter, dict):
        print(
            "Ожидался JSON-объект в merge.json и вход_главы.json",
            file=sys.stderr,
        )
        return 1

    # Missing or null fields become "", other values are stringified, because
    # the prompt substitution needs str replacements.
    book_title = str(chapter.get("book_title") or "")
    chapter_title = str(chapter.get("chapter_title") or "")
    chapter_text = str(chapter.get("chapter_text") or "")

    results: dict = {}
    for block_name, prompt_file, json_placeholder in BLOCKS:
        block_data = merge.get(block_name)
        if block_data is None:
            print(f"Блок «{block_name}» отсутствует в merge.json, пропуск.", file=sys.stderr)
            results[block_name] = _failed_block("skipped", "block not found in merge")
            continue

        try:
            prompt_tpl = load_prompt(prompt_file)
        except (OSError, UnicodeError) as e:
            # Keep going: earlier blocks may already hold expensive results.
            print(
                f"Не удалось прочитать шаблон промпта «{prompt_file}»: {e}",
                file=sys.stderr,
            )
            results[block_name] = _failed_block("error", f"prompt template: {e}")
            continue

        block_json_str = json.dumps(block_data, ensure_ascii=False, indent=2)
        placeholder = "{" + json_placeholder + "}"  # {framework_json}, {insights_json}, ...
        prompt = substitute_prompt(
            prompt_tpl,
            book_title,
            chapter_title,
            chapter_text,
            block_json_str,
            placeholder,
        )

        print(f"Валидация блока «{block_name}»...")
        t0 = time.monotonic()
        try:
            raw = call_ollama(prompt)
        except Exception as e:
            # Deliberately broad: any failure of one call must not abort the run.
            print(f"Ошибка вызова Ollama на блоке «{block_name}»: {e}", file=sys.stderr)
            results[block_name] = _failed_block("error", str(e))
            continue
        elapsed = time.monotonic() - t0
        print(f"  Ответ за {elapsed:.1f} сек ({elapsed / 60:.1f} мин)")

        try:
            block_result = extract_json_from_response(raw)
        except json.JSONDecodeError as e:
            print(
                f"Не удалось распарсить JSON в блоке «{block_name}»: {e}",
                file=sys.stderr,
            )
            print("Первые 500 символов ответа:", raw[:500], file=sys.stderr)
            results[block_name] = _failed_block("error", f"JSON decode: {e}")
            continue

        if not isinstance(block_result, dict):
            got = type(block_result).__name__
            print(
                f"Ответ в блоке «{block_name}» не является JSON-объектом: {got}",
                file=sys.stderr,
            )
            results[block_name] = _failed_block("error", f"expected dict, got {got}")
            continue

        results[block_name] = {
            "verdict": block_result.get("verdict", "unknown"),
            "score": block_result.get("score"),
            "hallucinations": block_result.get("hallucinations", []),
            "missing_key_points": block_result.get("missing_key_points", []),
        }

    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Записано: {args.output}")
    return 0
if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    raise SystemExit(main())