commit 711053b8bd18fec1aa3650dc80798bc9ed095fc1 Author: Sabo Sabev Date: Wed May 20 11:52:11 2026 +0300 Initial commit: working RIP/INEX_TM help processing pipeline - help_processor.py: parses .docx/.html/.pdf/.doc/.txt, extracts images, classifies sections via Claude API, writes to SQL Server - generate_html.py: builds interactive HTML viewer (Home/Editor/Search/Generator) - save_keywords.py: applies keyword edits back to DB - Prefix-scoped DB schema (RIP_help_files, RIP_help_sections) so multiple projects share the same database without collision - BAT launchers per project (RIP_load.bat, INEX_TM_load.bat, ...) load credentials from gitignored .env via _load_env.bat - Rich HTML preservation for .html sources (html_text column) - Image extraction for all formats with MS Word / LibreOffice fallback for .doc Co-Authored-By: Claude Opus 4.7 (1M context) diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f401d7c --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +REM Copy to .env and fill in real values. .env is gitignored. 
+REM Loaded by .bat файловете чрез: for /f "delims=" %%a in (.env) do set "%%a" + +ANTHROPIC_API_KEY=sk-ant-api03-XXXXXXXXXXXXXXXXXXXXXXXXX +HELP_DB_CONN=DRIVER={ODBC Driver 18 for SQL Server};TrustServerCertificate=yes;SERVER=host,port;DATABASE=db;UID=user;PWD=password diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d8e5472 --- /dev/null +++ b/.gitignore @@ -0,0 +1,30 @@ +# Credentials +.env +.env.local +*.local + +# Python +__pycache__/ +*.pyc +*.pyo + +# Logs +*.log + +# Generated outputs +help_viewer.html +keywords_changes*.json + +# Output processing folders (на отделен диск, не за git) +Output/ +output/ + +# Archives +*.zip +*.tar.gz + +# IDE / tools +.vscode/ +.idea/ +.claude/ +*.swp diff --git a/Bairaci.png b/Bairaci.png new file mode 100644 index 0000000..28d1ce6 Binary files /dev/null and b/Bairaci.png differ diff --git a/INEX_TM_load.bat b/INEX_TM_load.bat new file mode 100644 index 0000000..bf979c3 --- /dev/null +++ b/INEX_TM_load.bat @@ -0,0 +1,9 @@ +:@echo off +chcp 65001 > nul +call "%~dp0_load_env.bat" || exit /b 1 +set PYTHONIOENCODING=utf-8 + +echo === INCREMENTAL prefix=INEX_TM === +echo. +python help_processor.py --prefix=INEX_TM "q:\___Proekti\2022 INEX Технологична модернизация" "q:\___Proekti\2022 INEX Технологична модернизация\Output" +pause diff --git a/INEX_TM_load_force.bat b/INEX_TM_load_force.bat new file mode 100644 index 0000000..883c272 --- /dev/null +++ b/INEX_TM_load_force.bat @@ -0,0 +1,9 @@ +:@echo off +chcp 65001 > nul +call "%~dp0_load_env.bat" || exit /b 1 +set PYTHONIOENCODING=utf-8 + +echo === FORCE + PURGE prefix=INEX_TM === +echo. 
+python help_processor.py --prefix=INEX_TM --force --purge-missing "q:\___Proekti\2022 INEX Технологична модернизация" "q:\___Proekti\2022 INEX Технологична модернизация\Output" +pause diff --git a/INEX_TM_view.bat b/INEX_TM_view.bat new file mode 100644 index 0000000..874032d --- /dev/null +++ b/INEX_TM_view.bat @@ -0,0 +1,6 @@ +:@echo off +chcp 65001 > nul +call "%~dp0_load_env.bat" || exit /b 1 +set PYTHONIOENCODING=utf-8 + +python generate_html.py --prefix=INEX_TM diff --git a/README.md b/README.md new file mode 100644 index 0000000..63a0f95 --- /dev/null +++ b/README.md @@ -0,0 +1,124 @@ +# RIP Help System — Help-файл декомпозитор и viewer + +Обработва help-файлове (`.html`, `.htm`, `.docx`, `.doc`, `.pdf`, `.txt`), декомпозира ги на секции, извлича картинки, класифицира секциите с Claude API (заглавие + ключови думи), и записва всичко в SQL Server. После генерира интерактивен HTML viewer. + +## Архитектура + +``` +Входни файлове → help_processor.py → SQL Server → generate_html.py → help_viewer.html +(.docx, .html, (RIP_help_*) (Home / Редактор / + .pdf, .doc) Търсене / Генератор) + ↑ + │ + save_keywords.py ← keywords_changes.json + (от Редактора на viewer-а) +``` + +## Инсталация + +``` +pip install -r requirements.txt +``` + +За стар `.doc` формат — едно от: +- **LibreOffice** в PATH (кросплатформено) +- **MS Word** (Windows, чрез pywin32 COM — автоматичен fallback) + +## Конфигурация + +Копирай `.env.example` като `.env` и попълни: + +``` +ANTHROPIC_API_KEY=sk-ant-... +HELP_DB_CONN=DRIVER={ODBC Driver 18 for SQL Server};TrustServerCertificate=yes;SERVER=host,port;DATABASE=db;UID=user;PWD=password +``` + +`.env` е gitignore-нат. Bat файловете го зареждат автоматично през `_load_env.bat`. + +## Употреба (Windows) + +### Обработка на нов проект + +Първо създай `<PREFIX>_load.bat` и `<PREFIX>_view.bat` (вж. `RIP_load.bat`, `RIP_view.bat` като образец).
+ +| BAT | Какво прави | +|---|---| +| `RIP_load.bat` | Incremental — обработва само нови/променени файлове по SHA-256 hash | +| `RIP_load_force.bat` | `--force --purge-missing` — преобработва всичко, изтрива orphans | +| `RIP_view.bat` | Генерира `help_viewer.html` за prefix=RIP и го отваря в браузъра | + +### Директно от CLI + +``` +python help_processor.py --prefix=<PREFIX> <input_dir> <output_dir> +python help_processor.py --prefix=<PREFIX> --force --purge-missing <input_dir> <output_dir> + +python generate_html.py --prefix=<PREFIX> # без Home таб +python generate_html.py --prefix=<PREFIX> --home img.png # с Home таб +``` + +## Prefix scoping + +Всеки проект има свой `--prefix` (напр. `RIP`, `INEX_TM`). Прави следните неща изолирани между проектите: + +- Кодовете на секциите: `RIP_0001_SEC_0001` vs `INEX_TM_0001_SEC_0001` +- skip-by-hash (incremental) — само в рамките на prefix-а +- `--purge-missing` — изтрива orphans само в текущия prefix +- `generate_html.py --prefix=X` — viewer-а филтрира по prefix + +## Структура на базата + +### `RIP_help_files` +| Поле | Тип | Описание | +|---|---|---| +| id | INT IDENTITY | PK | +| prefix | NVARCHAR(50) | Project scope | +| file_path | NVARCHAR(1000) | Пълен път до файла | +| file_hash | CHAR(64) | SHA-256 за incremental | +| processed_at | DATETIME2 | Последна обработка | +| section_count | INT | Брой секции | + +UNIQUE constraint: `(prefix, file_path)` + +### `RIP_help_sections` +| Поле | Тип | Описание | +|---|---|---| +| id | INT IDENTITY | PK | +| prefix | NVARCHAR(50) | Project scope | +| code | NVARCHAR(80) | `<PREFIX>_NNNN_SEC_NNNN` (UNIQUE) | +| source_file | NVARCHAR(1000) | Източник | +| title | NVARCHAR(500) | AI-генерирано заглавие | +| keywords | NVARCHAR(300) | До 5 ключови думи | +| char_count | INT | Размер на чистия текст | +| output_path | NVARCHAR(1000) | Път до `.txt` файла | +| images | NVARCHAR(MAX) | JSON масив с относителни пътища | +| html_text | NVARCHAR(MAX) | Rich HTML с форматиране (само за `.html` източници) | +| created_at, updated_at | DATETIME2 | | + +## HTML Viewer — 3 / 4
таба + +- **Home** (опционален, ако `--home <image>` е подаден) — началов екран с изображение +- **Редактор** — таблица със секции; inline редактиране на ключови думи; ✓ Save → JSON download → `save_keywords.py` → UPDATE в БД +- **Търсене** — карти със секции; multi-keyword (intervals = AND, "phrase" = literal); preview с картинки +- **Генератор** — drag & drop ordering → export като HTML (self-contained, всички картинки base64-embed-нати) + +## Картинки + +Извличат се по време на парсване: +- `.docx` — `<a:blip>` в paragraph drawings → bytes от related_parts +- `.html` — локални файлове и `data:` URLs; HTTP/HTTPS се пропускат +- `.pdf` — `pdfplumber.page.crop(bbox).to_image()` като PNG +- `.doc` — след LibreOffice/MS Word конверсия до `.docx` + +Филтър ≥ 50×50 px (PIL детектва), за да отрязва иконки/булети. + +Записват се в `<output_dir>/images/<code>_img_NN.<ext>`. В текста placeholder `[IMG: images/...]`. В DB `images` колоната съдържа JSON масив с пътищата. + +## Constants (в `help_processor.py`) + +| Константа | Default | Описание | +|---|---|---| +| `MIN_SECTION_TOKENS` | 60 | Под този праг секцията се слива с предишната | +| `MAX_AI_CHARS` | 4000 | Символи, пращани към Claude | +| `AI_MODEL` | claude-sonnet-4-6 | Модел за класификация | +| `MIN_IMAGE_PX` | 50 | Картинки под NxN px се пропускат | diff --git a/RIP_load.bat b/RIP_load.bat new file mode 100644 index 0000000..b825420 --- /dev/null +++ b/RIP_load.bat @@ -0,0 +1,9 @@ +:@echo off +chcp 65001 > nul +call "%~dp0_load_env.bat" || exit /b 1 +set PYTHONIOENCODING=utf-8 + +echo === INCREMENTAL prefix=RIP === +echo. +python help_processor.py --prefix=RIP "q:\RIP_Help_Source" "q:\RIP_Help_Source\Output" +pause diff --git a/RIP_load_force.bat b/RIP_load_force.bat new file mode 100644 index 0000000..8400e94 --- /dev/null +++ b/RIP_load_force.bat @@ -0,0 +1,9 @@ +:@echo off +chcp 65001 > nul +call "%~dp0_load_env.bat" || exit /b 1 +set PYTHONIOENCODING=utf-8 + +echo === FORCE + PURGE prefix=RIP === +echo.
+python help_processor.py --prefix=RIP --force --purge-missing "q:\RIP_Help_Source" "q:\RIP_Help_Source\Output" +pause diff --git a/RIP_view.bat b/RIP_view.bat new file mode 100644 index 0000000..6391d55 --- /dev/null +++ b/RIP_view.bat @@ -0,0 +1,6 @@ +:@echo off +chcp 65001 > nul +call "%~dp0_load_env.bat" || exit /b 1 +set PYTHONIOENCODING=utf-8 + +python generate_html.py --prefix=RIP --home Bairaci.png diff --git a/_load_env.bat b/_load_env.bat new file mode 100644 index 0000000..69ccaa7 --- /dev/null +++ b/_load_env.bat @@ -0,0 +1,9 @@ +@echo off +REM Зарежда ANTHROPIC_API_KEY и HELP_DB_CONN от .env в текущата cmd среда. +REM Извиква се с: call _load_env.bat +if not exist .env ( + echo [ERROR] Липсва .env файл. Копирай .env.example като .env и попълни. + exit /b 1 +) +for /f "usebackq tokens=1,* delims== eol=#" %%A in (".env") do set "%%A=%%B" +exit /b 0 diff --git a/generate_html.py b/generate_html.py new file mode 100644 index 0000000..151c54e --- /dev/null +++ b/generate_html.py @@ -0,0 +1,938 @@ +""" +generate_html.py +================ +Чете секциите от SQL Server и генерира help_viewer.html. 
+Стартирай с: python generate_html.py +""" + +import os, sys, json, re, base64, mimetypes, argparse +from pathlib import Path +from datetime import datetime +from typing import Optional + +try: + import pyodbc +except ImportError: + sys.exit("Инсталирай pyodbc: pip install pyodbc") + +CONN_STR = os.getenv( + "HELP_DB_CONN", + "DRIVER={ODBC Driver 18 for SQL Server};" + "TrustServerCertificate=yes;" + "SERVER=94.26.63.238,13151;DATABASE=blondina;" + "UID=blondina_login;PWD=blondina_parola_123" +) +OUT_HTML = Path(__file__).parent / "help_viewer.html" + + +_IMG_PLACEHOLDER_RE = re.compile(r"\[IMG:\s*([^\]]+?)\s*\]") + + +def _esc(s: str) -> str: + return (s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """)) + + +def _img_src(rel: str, output_dir: Path, embed: bool) -> str: + """file:// URI или base64 data URI за картинка.""" + abs_path = (output_dir / rel).resolve() + if not abs_path.exists(): + return _esc(rel) + if embed: + try: + mime = mimetypes.guess_type(str(abs_path))[0] or "image/png" + b64 = base64.b64encode(abs_path.read_bytes()).decode("ascii") + return f"data:{mime};base64,{b64}" + except Exception: + return abs_path.as_uri() + return abs_path.as_uri() + + +def _text_to_html(text: str, output_dir: Path, embed: bool = False) -> str: + """Конвертира [IMG: images/foo.png] към ; escape-ва останалия текст.""" + parts = [] + last = 0 + for m in _IMG_PLACEHOLDER_RE.finditer(text): + parts.append(_esc(text[last:m.start()])) + rel = m.group(1).strip().replace("\\", "/") + src = _img_src(rel, output_dir, embed) + parts.append( + f'' + ) + last = m.end() + parts.append(_esc(text[last:])) + return "".join(parts).replace("\n", "
") + + +def _rich_html_with_images(html: str, output_dir: Path, embed: bool = False) -> str: + """Същото като _text_to_html, но входът е вече HTML — НЕ escape-ва.""" + def sub(m): + rel = m.group(1).strip().replace("\\", "/") + src = _img_src(rel, output_dir, embed) + return (f'') + return _IMG_PLACEHOLDER_RE.sub(sub, html) + + +def fetch_sections(prefix: Optional[str] = None): + conn = pyodbc.connect(CONN_STR, autocommit=True) + cur = conn.cursor() + if prefix: + cur.execute(""" + SELECT s.prefix, s.code, s.title, s.keywords, s.char_count, + s.source_file, s.output_path, s.updated_at, + s.images, s.html_text, f.section_count + FROM RIP_help_sections s + LEFT JOIN RIP_help_files f + ON f.file_path = s.source_file AND f.prefix = s.prefix + WHERE s.prefix = ? + ORDER BY s.code + """, prefix) + else: + cur.execute(""" + SELECT s.prefix, s.code, s.title, s.keywords, s.char_count, + s.source_file, s.output_path, s.updated_at, + s.images, s.html_text, f.section_count + FROM RIP_help_sections s + LEFT JOIN RIP_help_files f + ON f.file_path = s.source_file AND f.prefix = s.prefix + ORDER BY s.prefix, s.code + """) + cols = [c[0] for c in cur.description] + rows = [] + for r in cur.fetchall(): + d = dict(zip(cols, r)) + d["updated_at"] = str(d["updated_at"])[:16] if d["updated_at"] else "" + # парсваме images JSON + try: + d["images"] = json.loads(d["images"]) if d.get("images") else [] + except Exception: + d["images"] = [] + # прочитаме текста от .txt файла ако съществува + d["text"] = "" + d["text_html"] = "" # file:// — за viewer-а + d["text_html_embed"] = "" # base64 data: — за export (self-contained) + out_dir = Path(d["output_path"]).parent if d.get("output_path") else None + if d.get("output_path") and Path(d["output_path"]).exists(): + try: + txt_path = Path(d["output_path"]) + raw = txt_path.read_text(encoding="utf-8") + parts = raw.split("─" * 60, 1) + body = parts[1].strip() if len(parts) > 1 else raw + d["text"] = body[:800] + except Exception: + pass + # rich 
HTML от БД има приоритет; иначе fallback към plain text + if d.get("html_text") and out_dir: + d["text_html"] = _rich_html_with_images(d["html_text"], out_dir, embed=False) + d["text_html_embed"] = _rich_html_with_images(d["html_text"], out_dir, embed=True) + elif out_dir and d["text"]: + d["text_html"] = _text_to_html(d["text"][:1200], out_dir, embed=False) + d["text_html_embed"] = _text_to_html(d["text"], out_dir, embed=True) + rows.append(d) + conn.close() + return rows + + +def _home_image_data_uri(home_path: Optional[str]) -> Optional[str]: + """Връща data: URI ако файлът съществува, иначе None.""" + if not home_path: + return None + p = Path(home_path).expanduser() + if not p.is_absolute(): + p = (Path(__file__).parent / p).resolve() + if not p.is_file(): + print(f" [home] файлът не е намерен: {p}", file=sys.stderr) + return None + mime = mimetypes.guess_type(str(p))[0] or "image/png" + b64 = base64.b64encode(p.read_bytes()).decode("ascii") + return f"data:{mime};base64,{b64}" + + +def build_html(sections, home_image: Optional[str] = None): + data_json = json.dumps(sections, ensure_ascii=False) + generated = datetime.now().strftime("%d.%m.%Y %H:%M") + home_uri = _home_image_data_uri(home_image) + + if home_uri: + home_tab_html = '
00 / Home
' + editor_tab_cls = "tab" + editor_panel_cls = "panel" + home_panel_html = ( + '
' + f'
Home
' + '
' + ) + tab_index_list = "['home','editor','search','generator']" + initial_tab = "home" + else: + home_tab_html = "" + editor_tab_cls = "tab active" + editor_panel_cls = "panel active" + home_panel_html = "" + tab_index_list = "['editor','search','generator']" + initial_tab = "editor" + + return f""" + + + + +Help Viewer + + + + +
+

BG16RFPR001-1.001-0068

+ | + + генериран: {generated} +
+ +
+ {home_tab_html} +
01 / Редактор
+
02 / Търсене
+
03 / Генератор
+
+ +{home_panel_html} + + +
+
+ + +
+
+ + + + + + + + + + + +
КодЗаглавиеКлючови думиSource файлОбновен
+
+
+ + + + + +
+
+
+
+ Няма избрани секции + +
+
+
+
+
Избери секции от таб Търсене
+
+
+
+
+

ГЕНЕРИРАЙ ДОКУМЕНТ

+
+ + + +
+
+ Подреди секциите с drag & drop преди генериране. +

+ За Word и PDF е нужен Python backend — засега се генерира HTML. +
+
+
+
+ + +
+ 0 промени + + +
+ +
+ + + +""" + + +if __name__ == "__main__": + ap = argparse.ArgumentParser(description="Генерира help_viewer.html от БД") + ap.add_argument( + "--prefix", + default=os.getenv("HELP_PREFIX"), + help="Филтрира viewer-а по prefix (например 'HLP', 'PROJ_X'). " + "Ако липсва, показва всички префикси." + ) + ap.add_argument( + "--out", + default=str(OUT_HTML), + help=f"Изходен HTML път (default: {OUT_HTML.name})." + ) + ap.add_argument( + "--home", + default=None, + help="Път към изображение, което да се покаже като Home таб (пръв). " + "Ако липсва — няма Home таб (трите стандартни таба остават)." + ) + args = ap.parse_args() + + print("Четем от базата данни...") + if args.prefix: + print(f" Филтър по prefix: {args.prefix}") + if args.home: + print(f" Home image: {args.home}") + try: + sections = fetch_sections(prefix=args.prefix) + except Exception as e: + sys.exit(f"Грешка при свързване с БД: {e}") + + print(f"Намерени {len(sections)} секции.") + html = build_html(sections, home_image=args.home) + out_path = Path(args.out) + out_path.write_text(html, encoding="utf-8") + print(f"Генериран: {out_path}") + + import webbrowser + webbrowser.open(out_path.as_uri()) + print("Отворен в браузъра.") diff --git a/help_processor.py b/help_processor.py new file mode 100644 index 0000000..eff3f1d --- /dev/null +++ b/help_processor.py @@ -0,0 +1,1162 @@ +""" +help_processor.py +================= +Обработва help-файлове (.doc, .docx, .html, .htm, .txt, .pdf), +декомпозира ги на смислови секции, извлича ключови думи чрез Anthropic API +и записва резултатите в SQL Server + изходна директория. + +Поддържа инкрементална обработка: файлове, чийто hash не се е променил, +се прескачат при повторно пускане. 
+ +Изисквания (pip install): + pip install anthropic pyodbc python-docx beautifulsoup4 lxml + pip install pdfplumber striprtf chardet + pip install pywin32 # за MS Word fallback на Windows + +За .doc (стар формат) е необходим един от: + - LibreOffice (soffice в PATH) — кросплатформено + - MS Word — Windows, чрез pywin32 COM (автоматичен fallback) + - antiword — Linux (apt install antiword) +""" + +import os +import re +import sys +import json +import hashlib +import logging +import argparse +import subprocess +import tempfile +from pathlib import Path +from datetime import datetime +from dataclasses import dataclass, field +from typing import Optional + +import pyodbc +import anthropic +from docx import Document +from bs4 import BeautifulSoup + +try: + import pdfplumber + HAS_PDF = True +except ImportError: + HAS_PDF = False + +try: + from PIL import Image + HAS_PIL = True +except ImportError: + HAS_PIL = False + +# ────────────────────────────────────────────── +# Конфигурация +# ────────────────────────────────────────────── + +# На Windows конзолата често е cp1251 → пренастройваме stdout на utf-8 +try: + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + sys.stderr.reconfigure(encoding="utf-8", errors="replace") +except AttributeError: + pass + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)-8s %(message)s", + handlers=[ + logging.StreamHandler(sys.stdout), + logging.FileHandler("help_processor.log", encoding="utf-8"), + ], +) +log = logging.getLogger(__name__) + +MIN_SECTION_TOKENS = 60 # секции под тази граница се сливат с предишната +MAX_AI_CHARS = 4000 # максимален текст, изпращан към Claude за класификация +AI_MODEL = "claude-sonnet-4-6" +MIN_IMAGE_PX = 50 # картинки под NxN px се пропускат (иконки/булети) + + +# ────────────────────────────────────────────── +# Изображения — помощни +# ────────────────────────────────────────────── + +@dataclass +class ImageRef: + placeholder: str # вътрешен ID в текста, напр. 
"img_01" + data: bytes + ext: str # "png", "jpg", "gif"... + + +def _img_dimensions(data: bytes) -> Optional[tuple[int, int]]: + if not HAS_PIL: + return None + try: + from io import BytesIO + with Image.open(BytesIO(data)) as im: + return im.size + except Exception: + return None + + +def _should_keep_image(data: bytes) -> bool: + """Връща False за дребни иконки/булети под MIN_IMAGE_PX × MIN_IMAGE_PX.""" + if not data: + return False + dims = _img_dimensions(data) + if dims is None: + # Не можем да преценим — пазим по подразбиране + return True + w, h = dims + return w >= MIN_IMAGE_PX and h >= MIN_IMAGE_PX + + +def _ext_from_content_type(ct: str) -> str: + ct = (ct or "").lower() + if "png" in ct: return "png" + if "jpeg" in ct or "jpg" in ct: return "jpg" + if "gif" in ct: return "gif" + if "bmp" in ct: return "bmp" + if "svg" in ct: return "svg" + if "webp" in ct: return "webp" + return "png" + + +_IMG_PLACEHOLDER_RE = re.compile(r"\[IMG:\s*([A-Za-z0-9_./\\-]+)\s*\]") + + +# ────────────────────────────────────────────── +# Структури +# ────────────────────────────────────────────── + +@dataclass +class Section: + title: str + text: str + level: int = 1 # 1=H1, 2=H2, 3=H3, 0=без заглавие + images: list = field(default_factory=list) # list[ImageRef] + html_text: Optional[str] = None # rich HTML с [IMG: ...] 
placeholders + + +@dataclass +class ProcessedSection: + code: str # DOC_003_SEC_012 + source_file: str + title: str + keywords: str # "кл1, кл2, кл3" + text: str + images_json: str = "[]" # JSON масив с относителни пътища + html_text: str = "" # rich HTML (само за HTML-source файлове) + char_count: int = 0 + + def __post_init__(self): + self.char_count = len(self.text) + + +# ────────────────────────────────────────────── +# База данни +# ────────────────────────────────────────────── + +def _ensure_trust_server_certificate(conn_str: str) -> str: + """Добавя TrustServerCertificate=yes към connection string ако липсва.""" + if not conn_str: + return conn_str + if re.search(r"TrustServerCertificate\s*=", conn_str, re.IGNORECASE): + return conn_str + sep = "" if conn_str.rstrip().endswith(";") else ";" + return f"{conn_str}{sep}TrustServerCertificate=yes;" + + +class Database: + def __init__(self, conn_str: str): + self.conn_str = _ensure_trust_server_certificate(conn_str) + self.conn = pyodbc.connect(self.conn_str, autocommit=False) + self._ensure_schema() + + def _ensure_schema(self): + """Създава таблиците ако не съществуват.""" + cur = self.conn.cursor() + cur.execute(""" + IF NOT EXISTS (SELECT 1 FROM sys.tables WHERE name='RIP_help_files') + CREATE TABLE RIP_help_files ( + id INT IDENTITY PRIMARY KEY, + prefix NVARCHAR(50) NOT NULL DEFAULT 'HLP', + file_path NVARCHAR(1000) NOT NULL, + file_hash CHAR(64) NOT NULL, + processed_at DATETIME2 NOT NULL DEFAULT GETDATE(), + section_count INT NOT NULL DEFAULT 0, + CONSTRAINT UQ_RIP_help_files_prefix_path UNIQUE (prefix, file_path) + )""") + # Migrate: добавяме колонка prefix ако таблицата е по-стара версия + cur.execute(""" + IF NOT EXISTS ( + SELECT 1 FROM sys.columns + WHERE object_id=OBJECT_ID('RIP_help_files') AND name='prefix' + ) + BEGIN + ALTER TABLE RIP_help_files ADD prefix NVARCHAR(50) NOT NULL + CONSTRAINT DF_RIP_help_files_prefix DEFAULT 'HLP' WITH VALUES; + END + """) + # Migrate: ако има стара UNIQUE на 
file_path сама (без prefix), сваляме я + cur.execute(""" + DECLARE @c NVARCHAR(200); + SELECT @c = i.name FROM sys.indexes i + WHERE i.object_id=OBJECT_ID('RIP_help_files') + AND i.is_unique=1 + AND i.name <> 'UQ_RIP_help_files_prefix_path' + AND i.name NOT LIKE 'PK_%' + AND (SELECT COUNT(*) FROM sys.index_columns ic + WHERE ic.object_id=i.object_id AND ic.index_id=i.index_id) = 1; + IF @c IS NOT NULL EXEC('ALTER TABLE RIP_help_files DROP CONSTRAINT [' + @c + ']'); + """) + # Migrate: създаваме новата composite UNIQUE ако липсва + cur.execute(""" + IF NOT EXISTS ( + SELECT 1 FROM sys.indexes + WHERE name='UQ_RIP_help_files_prefix_path' + AND object_id=OBJECT_ID('RIP_help_files') + ) + ALTER TABLE RIP_help_files + ADD CONSTRAINT UQ_RIP_help_files_prefix_path UNIQUE (prefix, file_path) + """) + cur.execute(""" + IF NOT EXISTS (SELECT 1 FROM sys.tables WHERE name='RIP_help_sections') + CREATE TABLE RIP_help_sections ( + id INT IDENTITY PRIMARY KEY, + prefix NVARCHAR(50) NOT NULL DEFAULT 'HLP', + code NVARCHAR(80) NOT NULL UNIQUE, + source_file NVARCHAR(1000) NOT NULL, + title NVARCHAR(500), + keywords NVARCHAR(300), + char_count INT, + output_path NVARCHAR(1000), + images NVARCHAR(MAX), + created_at DATETIME2 NOT NULL DEFAULT GETDATE(), + updated_at DATETIME2 NOT NULL DEFAULT GETDATE() + )""") + # Migrate: добавяме колонка prefix ако таблицата е по-стара версия + cur.execute(""" + IF NOT EXISTS ( + SELECT 1 FROM sys.columns + WHERE object_id=OBJECT_ID('RIP_help_sections') AND name='prefix' + ) + ALTER TABLE RIP_help_sections ADD prefix NVARCHAR(50) NOT NULL + CONSTRAINT DF_RIP_help_sections_prefix DEFAULT 'HLP' WITH VALUES + """) + # Migrate: добавяме колонка 'images' ако таблицата е създадена по-стара версия + cur.execute(""" + IF NOT EXISTS ( + SELECT 1 FROM sys.columns + WHERE object_id=OBJECT_ID('RIP_help_sections') AND name='images' + ) + ALTER TABLE RIP_help_sections ADD images NVARCHAR(MAX) NULL + """) + # Migrate: добавяме колонка 'html_text' (rich HTML с 
форматиране) + cur.execute(""" + IF NOT EXISTS ( + SELECT 1 FROM sys.columns + WHERE object_id=OBJECT_ID('RIP_help_sections') AND name='html_text' + ) + ALTER TABLE RIP_help_sections ADD html_text NVARCHAR(MAX) NULL + """) + # Индекси за търсене по ключови думи и заглавие + cur.execute(""" + IF NOT EXISTS ( + SELECT 1 FROM sys.indexes + WHERE name='IX_RIP_help_sections_keywords' AND object_id=OBJECT_ID('RIP_help_sections') + ) + CREATE INDEX IX_RIP_help_sections_keywords ON RIP_help_sections(keywords) + """) + self.conn.commit() + log.info("Схемата е проверена / създадена.") + + def get_file_hash(self, prefix: str, file_path: str) -> Optional[str]: + cur = self.conn.cursor() + cur.execute( + "SELECT file_hash FROM RIP_help_files WHERE prefix=? AND file_path=?", + prefix, file_path + ) + row = cur.fetchone() + return row[0] if row else None + + def upsert_file(self, prefix: str, file_path: str, file_hash: str, section_count: int): + cur = self.conn.cursor() + cur.execute(""" + MERGE RIP_help_files AS t + USING (SELECT ? AS prefix, ? AS file_path, ? AS file_hash, ? AS section_count) AS s + ON t.prefix = s.prefix AND t.file_path = s.file_path + WHEN MATCHED THEN + UPDATE SET file_hash=s.file_hash, section_count=s.section_count, + processed_at=GETDATE() + WHEN NOT MATCHED THEN + INSERT (prefix, file_path, file_hash, section_count) + VALUES (s.prefix, s.file_path, s.file_hash, s.section_count); + """, prefix, file_path, file_hash, section_count) + self.conn.commit() + + def delete_sections_for_file(self, prefix: str, file_path: str): + cur = self.conn.cursor() + cur.execute( + "DELETE FROM RIP_help_sections WHERE prefix=? AND source_file=?", + prefix, file_path + ) + self.conn.commit() + + def all_source_files(self, prefix: str) -> list[str]: + """Връща всички source_file пътища за даден префикс.""" + cur = self.conn.cursor() + cur.execute(""" + SELECT file_path FROM RIP_help_files WHERE prefix=? + UNION + SELECT source_file FROM RIP_help_sections WHERE prefix=? 
+ """, prefix, prefix) + return [r[0] for r in cur.fetchall()] + + def section_output_paths_for(self, prefix: str, source_files: list[str]) -> list[str]: + if not source_files: + return [] + cur = self.conn.cursor() + placeholders = ",".join("?" for _ in source_files) + cur.execute( + f"SELECT output_path FROM RIP_help_sections " + f"WHERE prefix=? AND source_file IN ({placeholders})", + prefix, *source_files + ) + return [r[0] for r in cur.fetchall() if r[0]] + + def purge_sources(self, prefix: str, source_files: list[str]) -> int: + if not source_files: + return 0 + cur = self.conn.cursor() + placeholders = ",".join("?" for _ in source_files) + cur.execute( + f"DELETE FROM RIP_help_sections " + f"WHERE prefix=? AND source_file IN ({placeholders})", + prefix, *source_files + ) + sec_deleted = cur.rowcount + cur.execute( + f"DELETE FROM RIP_help_files " + f"WHERE prefix=? AND file_path IN ({placeholders})", + prefix, *source_files + ) + self.conn.commit() + return sec_deleted + + def insert_section(self, prefix: str, ps: ProcessedSection, output_path: str): + cur = self.conn.cursor() + cur.execute(""" + MERGE RIP_help_sections AS t + USING (SELECT ? 
AS code) AS s ON t.code = s.code + WHEN MATCHED THEN + UPDATE SET prefix=?, source_file=?, title=?, keywords=?, + char_count=?, output_path=?, images=?, html_text=?, + updated_at=GETDATE() + WHEN NOT MATCHED THEN + INSERT (prefix, code, source_file, title, keywords, char_count, output_path, + images, html_text) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?); + """, + ps.code, # USING + prefix, ps.source_file, ps.title, ps.keywords, # UPDATE SET + ps.char_count, output_path, ps.images_json, ps.html_text, + prefix, ps.code, ps.source_file, ps.title, ps.keywords, # INSERT + ps.char_count, output_path, ps.images_json, ps.html_text) + self.conn.commit() + + def close(self): + self.conn.close() + + +# ────────────────────────────────────────────── +# Парсъри +# ────────────────────────────────────────────── + +def file_hash(path: Path) -> str: + h = hashlib.sha256() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(65536), b""): + h.update(chunk) + return h.hexdigest() + + +def _load_html_image(src: str, base_dir: Path) -> Optional[tuple[bytes, str]]: + """Връща (data, ext) или None. 
Пропуска HTTP/HTTPS.""" + if not src: + return None + s = src.strip() + if s.startswith("data:"): + # data:image/png;base64,XXXX + m = re.match(r"data:([^;]+);base64,(.+)$", s, re.DOTALL) + if not m: + return None + import base64 + try: + data = base64.b64decode(m.group(2)) + except Exception: + return None + return data, _ext_from_content_type(m.group(1)) + if s.startswith(("http://", "https://")): + return None # по правило пропускаме мрежови картинки + # локален път, относителен или абсолютен + p = (base_dir / s).resolve() if not Path(s).is_absolute() else Path(s) + try: + if p.is_file(): + data = p.read_bytes() + ext = p.suffix.lstrip(".").lower() or "png" + return data, ext + except Exception: + return None + return None + + +def _detect_html_encoding(raw: bytes) -> str: + """Връща име на encoding: BOM → chardet → fallback (utf-8 ако ASCII, иначе windows-1251).""" + # BOM-и + if raw.startswith(b"\xef\xbb\xbf"): + return "utf-8" + if raw.startswith((b"\xff\xfe", b"\xfe\xff")): + return "utf-16" + # chardet + try: + import chardet + det = chardet.detect(raw[:65536]) or {} + enc = (det.get("encoding") or "").lower() + conf = det.get("confidence", 0) or 0 + if enc and conf >= 0.6: + # нормализиране на често срещани имена + if enc in ("cp1251", "ms-cyrl", "windows-1251"): + return "windows-1251" + if enc.startswith("utf"): + return enc + return enc + except Exception: + pass + # fallback: ако байтовете изглеждат "над 127" (т.е. 
има не-ASCII), приемаме CP1251 + if any(b > 127 for b in raw[:8192]): + return "windows-1251" + return "utf-8" + + +_HTML_BLOCK_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", + "p", "ul", "ol", "table", "dl", "pre", + "blockquote", "figure", "hr"] +_HTML_DROP_ATTRS = ("class", "style", "id", "lang", "dir", "align", + "valign", "width", "height", "bgcolor", "border") + + +def _strip_attrs(el): + """Премахва decorative атрибути (class, style, on*, data-*).""" + for t in el.find_all(True): + for a in list(t.attrs): + if a in _HTML_DROP_ATTRS or a.startswith("on") or a.startswith("data-"): + del t[a] + + +def _swap_imgs_in_block(el, base_dir: Path, sec_images: list, img_counter: list) -> None: + """Намира всички в подадения елемент, извлича данните и подменя с + NavigableString placeholder ([IMG: img_NN]).""" + from bs4 import NavigableString + for img in el.find_all("img"): + src = img.get("src") or img.get("data-src") or "" + loaded = _load_html_image(src, base_dir) + if not loaded: + img.decompose() + continue + data, ext = loaded + if not _should_keep_image(data): + img.decompose() + continue + img_counter[0] += 1 + ref = ImageRef(placeholder=f"img_{img_counter[0]:02d}", data=data, ext=ext) + sec_images.append(ref) + img.replace_with(NavigableString(f"[IMG: {ref.placeholder}]")) + + +def parse_html(path: Path) -> list[Section]: + raw = path.read_bytes() + enc = _detect_html_encoding(raw) + log.debug(f" {path.name} encoding: {enc}") + try: + soup = BeautifulSoup(raw, "lxml", from_encoding=enc) + except Exception: + soup = BeautifulSoup(raw, "lxml") + + # Премахваме скриптове и стилове + for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]): + tag.decompose() + + base_dir = path.parent + body = soup.body or soup + + heading_map = {"h1": 1, "h2": 2, "h3": 3, "h4": 3, "h5": 3, "h6": 3} + + # Събираме top-level блокови елементи (без да включваме вложените в тях) + consumed = set() + blocks = [] + for el in body.find_all(_HTML_BLOCK_TAGS + ["img"]): + 
def parse_html(path: Path) -> list[Section]:
    """Split an HTML help file into sections delimited by its headings.

    The document is reduced to its outermost block elements (headings,
    paragraphs, lists, tables, ... and free-standing ``<img>`` tags). Every
    non-empty heading starts a new section; everything up to the next heading
    is collected as plain text, as cleaned HTML and as extracted images.
    Images are replaced in the text/HTML by ``[IMG: img_NN]`` placeholders.

    Args:
        path: the ``.html`` / ``.htm`` file to parse.

    Returns:
        The sections in document order. When nothing was collected, a single
        untitled level-0 section holding the body's plain text.
    """
    from bs4 import NavigableString

    raw = path.read_bytes()
    enc = _detect_html_encoding(raw)
    log.debug(f" {path.name} encoding: {enc}")
    try:
        soup = BeautifulSoup(raw, "lxml", from_encoding=enc)
    except Exception:
        # Best effort: let BeautifulSoup detect the encoding on its own.
        soup = BeautifulSoup(raw, "lxml")

    # Drop scripts, styles and page chrome before looking at the content.
    for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
        tag.decompose()

    base_dir = path.parent
    body = soup.body or soup

    # h4-h6 are flattened to level 3.
    heading_map = {"h1": 1, "h2": 2, "h3": 3, "h4": 3, "h5": 3, "h6": 3}

    # Keep only outermost block elements: anything nested inside an element
    # that was already taken is handled as part of that element.
    consumed = set()
    blocks = []
    for el in body.find_all(_HTML_BLOCK_TAGS + ["img"]):
        if any(id(par) in consumed for par in el.parents):
            continue
        consumed.add(id(el))
        blocks.append(el)

    sections: list[Section] = []
    current_title = ""
    current_level = 1
    sec_text: list[str] = []
    sec_html: list[str] = []
    sec_images: list[ImageRef] = []
    img_counter = [0]  # one-element list so helpers can bump the counter

    def flush():
        """Store the section accumulated so far, if it has any content."""
        if sec_text or sec_html or sec_images:
            sec = Section(current_title, "\n".join(sec_text), current_level)
            sec.images = list(sec_images)
            sec.html_text = "\n".join(sec_html) if sec_html else None
            sections.append(sec)

    for el in blocks:
        if el.name in heading_map:
            txt = el.get_text(" ", strip=True)
            if not txt:
                continue
            flush()
            current_title = txt
            current_level = heading_map[el.name]
            sec_text, sec_html, sec_images = [], [], []
            continue

        if el.name == "img":
            # Free-standing image (not inside any block element). Only this
            # one tag is handled here: swapping every <img> under the parent
            # would attach images of later blocks to the current section.
            src = el.get("src") or el.get("data-src") or ""
            loaded = _load_html_image(src, base_dir)
            if loaded and _should_keep_image(loaded[0]):
                data, ext = loaded
                img_counter[0] += 1
                ref = ImageRef(placeholder=f"img_{img_counter[0]:02d}",
                               data=data, ext=ext)
                sec_images.append(ref)
                marker = f"[IMG: {ref.placeholder}]"
                el.replace_with(NavigableString(marker))
                # Record the placeholder explicitly: a detached <img> has no
                # text of its own, so get_text() would always be empty.
                sec_text.append(marker)
                # NOTE(review): the wrapper tag of the original literal was
                # lost in extraction; <p> is assumed -- confirm.
                sec_html.append(f"<p>{marker}</p>")
            else:
                el.decompose()
            continue

        _swap_imgs_in_block(el, base_dir, sec_images, img_counter)
        _strip_attrs(el)
        txt = el.get_text(" ", strip=True)
        if txt:
            sec_text.append(txt)
        # NOTE(review): the HTML is kept even for blocks without text (e.g.
        # <hr>); the original nesting of this statement could not be recovered
        # from the flattened source -- confirm.
        try:
            sec_html.append(str(el))
        except Exception as e:
            log.debug(f"HTML serialization failed in {path.name}: {e}")

    flush()

    if not sections:
        plain = body.get_text(" ", strip=True)
        return [Section("", plain, 0)]
    return sections
def _convert_doc_with_libreoffice(path: Path, out_dir: Path) -> Optional[Path]:
    """Convert a legacy ``.doc`` file to ``.docx`` with headless LibreOffice.

    Args:
        path: the source document.
        out_dir: directory that receives the converted file.

    Returns:
        Path of the converted ``<stem>.docx`` inside ``out_dir``, or ``None``
        when ``soffice`` is missing, fails, times out (60 s) or produced no
        output file.
    """
    try:
        subprocess.run(
            ["soffice", "--headless", "--convert-to", "docx",
             "--outdir", str(out_dir), str(path)],
            check=True, capture_output=True, timeout=60
        )
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e:
        log.debug(f"LibreOffice конверсия неуспешна: {e}")
        return None
    # Look for the file produced for *this* input instead of taking whatever
    # *.docx happens to be first in out_dir; this also matches how the MS Word
    # fallback names its output.
    converted = out_dir / (path.stem + ".docx")
    return converted if converted.exists() else None
def _render_pdf_image(page, img_info, resolution: int = 150) -> Optional[bytes]:
    """Rasterise the page region described by ``img_info`` and return PNG bytes.

    Args:
        page: page object exposing ``width``, ``height`` and ``crop``.
        img_info: mapping with the image box (``x0``/``x1`` and ``top``/``bottom``,
            falling back to ``y0``/``y1``).
        resolution: value passed to ``to_image`` as ``resolution``.

    Returns:
        The PNG data, or ``None`` when the box is degenerate, smaller than one
        unit after clamping to the page, or rendering raised.
    """
    from io import BytesIO

    try:
        left = float(img_info.get("x0", 0))
        right = float(img_info.get("x1", 0))
        upper = float(img_info.get("top", img_info.get("y0", 0)))
        lower = float(img_info.get("bottom", img_info.get("y1", 0)))
        if right <= left or lower <= upper:
            return None

        # Clamp the box to the page; cropping outside of it raises.
        left = max(0, left)
        upper = max(0, upper)
        right = min(page.width, right)
        lower = min(page.height, lower)
        if right - left < 1 or lower - upper < 1:
            return None

        region = page.crop((left, upper, right, lower))
        picture = region.to_image(resolution=resolution).original
        sink = BytesIO()
        picture.save(sink, format="PNG")
        return sink.getvalue()
    except Exception as e:
        log.debug(f"PDF image render failed: {e}")
        return None
PDF се прескача.") + return [] + + sections: list[Section] = [] + current_title = "" + buf: list[str] = [] + sec_images: list[ImageRef] = [] + img_counter = [0] + prev_size = None + + def flush(): + if buf or sec_images: + sec = Section(current_title, "\n".join(buf), 2) + sec.images = list(sec_images) + sections.append(sec) + + with pdfplumber.open(path) as pdf: + for page in pdf.pages: + # Картинките за страницата (сортирани по y отгоре надолу) + page_images = sorted( + page.images or [], + key=lambda im: float(im.get("top", im.get("y0", 0))) + ) + img_queue = [] + for im in page_images: + data = _render_pdf_image(page, im) + if not data or not _should_keep_image(data): + continue + img_queue.append((float(im.get("top", 0)), data)) + + words = page.extract_words(extra_attrs=["size"]) + line_buf, line_size = [], None + + def emit_images_before(y: float): + while img_queue and img_queue[0][0] <= y: + _, data = img_queue.pop(0) + img_counter[0] += 1 + ref = ImageRef(placeholder=f"img_{img_counter[0]:02d}", + data=data, ext="png") + sec_images.append(ref) + buf.append(f"[IMG: {ref.placeholder}]") + + for w in words: + sz = round(float(w.get("size", 10)), 1) + y = float(w.get("top", 0)) + if line_size is None: + line_size = sz + if abs(sz - line_size) > 1: + line_text = " ".join(line_buf).strip() + if line_text: + if line_size > (prev_size or 10) + 1 and len(line_text) < 150: + flush() + buf, sec_images = [], [] + current_title = line_text + else: + emit_images_before(y) + buf.append(line_text) + prev_size = line_size + line_buf, line_size = [w["text"]], sz + else: + line_buf.append(w["text"]) + + if line_buf: + emit_images_before(page.height) + buf.append(" ".join(line_buf)) + + # картинките след всичкия текст на страницата + emit_images_before(page.height + 1) + + flush() + + return sections or [Section("", "", 0)] + + +def parse_txt(path: Path) -> list[Section]: + import chardet + raw = path.read_bytes() + enc = chardet.detect(raw)["encoding"] or "utf-8" + text = 
def merge_short_sections(sections: list[Section]) -> list[Section]:
    """Fold every section shorter than ``MIN_SECTION_TOKENS`` words into its predecessor.

    The merged section keeps the predecessor's title and level; texts are
    joined with a newline, image lists are concatenated and the non-empty
    ``html_text`` parts are joined with a newline (``None`` when both are
    empty). A short section with no predecessor is kept as it is.

    Args:
        sections: sections in document order.

    Returns:
        A new list; short sections are replaced by merged ``Section`` objects.
    """
    merged: list[Section] = []
    for sec in sections:
        is_short = len(sec.text.split()) < MIN_SECTION_TOKENS
        if not merged or not is_short:
            merged.append(sec)
            continue

        prev = merged[-1]
        combined = Section(prev.title, prev.text + "\n" + sec.text, prev.level)
        combined.images = (prev.images or []) + (sec.images or [])
        html_parts = [h for h in (prev.html_text, sec.html_text) if h]
        combined.html_text = "\n".join(html_parts) if html_parts else None
        merged[-1] = combined
    return merged


def clean_text(text: str) -> str:
    """Collapse every whitespace run (spaces, tabs, newlines) to one space and trim.

    A single ``\\s+`` substitution is enough: once it has run, no sequence of
    two or more spaces can remain, so no second pass is needed.
    """
    return re.sub(r"\s+", " ", text).strip()
def _parse_classification(raw: str, fallback_title: str) -> Optional[tuple[str, str]]:
    """Decode a model reply into ``(title, keywords)``; ``None`` if it is not a JSON object."""
    # Tolerate a reply wrapped in a ```json ... ``` fence.
    cleaned = re.sub(r"^```[a-z]*\n?", "", raw.strip())
    cleaned = re.sub(r"\n?```$", "", cleaned)
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None

    # A missing, null or empty title falls back instead of becoming "None".
    title = data.get("title") or fallback_title
    keywords = data.get("keywords") or ""
    if isinstance(keywords, (list, tuple)):
        # The prompt asks for a "list"; a JSON array must be flattened, since
        # str() of it would store a Python repr such as "['a', 'b']".
        keywords = ", ".join(str(k).strip() for k in keywords if str(k).strip())
    return str(title)[:200], str(keywords)[:300]


def classify_section(client: anthropic.Anthropic, title: str, text: str) -> tuple[str, str]:
    """Ask the model for a short title and keywords describing a section.

    Args:
        client: Anthropic client used for the request.
        title: heading found in the source; may be empty.
        text: section text; only the first ``MAX_AI_CHARS`` characters are sent.

    Returns:
        ``(title, keywords)`` where keywords is a comma-separated string. The
        title is cut to 200 and the keywords to 300 characters. When the reply
        is not a JSON object, the given title (or ``"Секция"``) and an empty
        keyword string are returned.

    Raises:
        Whatever ``client.messages.create`` raises; an empty reply raises
        ``IndexError``. Neither is handled here.
    """
    snippet = text[:MAX_AI_CHARS]
    prompt = f"""Анализирай следната секция от help-документация и върни JSON обект с два ключа:
- "title": кратко наименование на секцията (до 8 думи, на езика на текста)
- "keywords": списък от до 5 ключови думи/фрази, разделени със запетая (на езика на текста)

Съществуващо заглавие (може да е празно): {title!r}

Текст:
{snippet}

Върни САМО валиден JSON без markdown, без коментари."""

    msg = client.messages.create(
        model=AI_MODEL,
        max_tokens=200,
        messages=[{"role": "user", "content": prompt}]
    )
    raw = msg.content[0].text.strip()

    fallback = title or "Секция"
    parsed = _parse_classification(raw, fallback)
    if parsed is None:
        log.warning(f"AI върна невалиден JSON: {raw[:120]}")
        return fallback, ""
    return parsed
for ref in sec.images or []: + fname = f"{code}_{ref.placeholder}.{ref.ext}" + disk_path = images_dir / fname + try: + disk_path.write_bytes(ref.data) + except Exception as e: + log.warning(f" Грешка при запис на картинка {fname}: {e}") + continue + rel_path = f"images/{fname}" + image_rel_paths.append(rel_path) + old_ph = f"[IMG: {ref.placeholder}]" + new_ph = f"[IMG: {rel_path}]" + text = text.replace(old_ph, new_ph) + html_text = html_text.replace(old_ph, new_ph) + + # Премахваме placeholder-и, останали без файл + text = _IMG_PLACEHOLDER_RE.sub( + lambda m: m.group(0) if "/" in m.group(1) or "\\" in m.group(1) else "", + text + ).strip() + html_text = _IMG_PLACEHOLDER_RE.sub( + lambda m: m.group(0) if "/" in m.group(1) or "\\" in m.group(1) else "", + html_text + ).strip() + if not text and not image_rel_paths and not html_text: + continue + + try: + title, keywords = classify_section(client, sec.title, text) + except Exception as e: + log.warning(f" AI грешка за {code}: {e}") + title, keywords = sec.title or f"Секция {i}", "" + + images_json = json.dumps(image_rel_paths, ensure_ascii=False) + ps = ProcessedSection( + code=code, + source_file=rel, + title=title, + keywords=keywords, + text=text, + images_json=images_json, + html_text=html_text, + ) + + # Записваме текста в изходна директория + out_path = output_dir / f"{code}.txt" + out_path.write_text( + f"КОД: {code}\nФАЙЛ: {rel}\nЗАГЛАВИЕ: {title}\nКЛЮЧОВИ ДУМИ: {keywords}\n" + f"КАРТИНКИ: {len(image_rel_paths)}\n" + f"{'─'*60}\n{text}", + encoding="utf-8" + ) + + db.insert_section(prefix, ps, str(out_path)) + saved += 1 + log.debug(f" {code}: {title[:60]} ({len(image_rel_paths)} img)") + + db.upsert_file(prefix, rel, fh, saved) + log.info(f" → {saved} секции записани") + return saved + + +_PREFIX_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_]{0,49}$") + + +def process_directory( + input_dir: Path, + output_dir: Path, + conn_str: str, + api_key: str, + prefix: str = "HLP", + force: bool = False, + purge_missing: 
bool = False, +): + if not _PREFIX_RE.match(prefix): + raise ValueError( + f"Невалиден prefix {prefix!r}. Допустими: буква + букви/цифри/подчертавки, до 50 символа." + ) + + output_dir.mkdir(parents=True, exist_ok=True) + db = Database(conn_str) + client = anthropic.Anthropic(api_key=api_key) + + extensions = set(PARSERS.keys()) + output_resolved = output_dir.resolve() + + def _under_output(p: Path) -> bool: + try: + p.resolve().relative_to(output_resolved) + return True + except ValueError: + return False + + files = [ + p for p in input_dir.rglob("*") + if p.is_file() and p.suffix.lower() in extensions and not _under_output(p) + ] + log.info(f"Prefix={prefix} Намерени {len(files)} файла в {input_dir}") + + current_paths = {str(p) for p in files} + total_sections = 0 + try: + for idx, path in enumerate(sorted(files), 1): + n = process_file(path, idx, db, client, output_dir, + prefix=prefix, force=force) + total_sections += n + + if purge_missing: + existing = set(db.all_source_files(prefix)) + orphans = sorted(existing - current_paths) + if not orphans: + log.info(f"Purge: няма orphan записи в БД за prefix={prefix}.") + else: + log.info(f"Purge ({prefix}): намерени {len(orphans)} orphan източника:") + for o in orphans: + log.info(f" - {o}") + disk_paths = db.section_output_paths_for(prefix, orphans) + removed_files = 0 + for op in disk_paths: + try: + opath = Path(op) + if opath.exists(): + opath.unlink() + removed_files += 1 + code = opath.stem + for img in (output_dir / "images").glob(f"{code}_*"): + try: + img.unlink() + removed_files += 1 + except Exception: + pass + except Exception as e: + log.debug(f" не успях да изтрия {op}: {e}") + deleted = db.purge_sources(prefix, orphans) + log.info(f"Purge: изтрити {deleted} секции от БД, {removed_files} файла от диска.") + finally: + db.close() + + log.info(f"Готово. Prefix={prefix}. 
def main():
    """Command-line entry point: parse the arguments and run ``process_directory``.

    The connection string, API key and prefix default to the environment
    variables ``HELP_DB_CONN``, ``ANTHROPIC_API_KEY`` and ``HELP_PREFIX``.
    Exits with an error message when the API key or the connection string is
    missing.
    """
    cli = argparse.ArgumentParser(
        description="Help-файл декомпозитор с SQL Server + Anthropic"
    )

    # Positional arguments.
    cli.add_argument("input_dir", help="Входна директория с help-файлове")
    cli.add_argument("output_dir", help="Изходна директория за текстови секции")

    # Options with a value; each defaults to an environment variable.
    cli.add_argument(
        "--conn",
        default=os.getenv("HELP_DB_CONN"),
        help="SQL Server connection string (или HELP_DB_CONN env var)",
    )
    cli.add_argument(
        "--api-key",
        default=os.getenv("ANTHROPIC_API_KEY"),
        help="Anthropic API ключ (или ANTHROPIC_API_KEY env var)",
    )
    cli.add_argument(
        "--prefix",
        default=os.getenv("HELP_PREFIX", "HLP"),
        help="Префикс за кодовете/scope в БД (буква + букви/цифри/_, до 50 знака). "
             "Default: 'HLP' (или env HELP_PREFIX).",
    )

    # Boolean switches.
    cli.add_argument(
        "--force",
        action="store_true",
        help="Преобработва всички файлове, независимо от hash",
    )
    cli.add_argument(
        "--purge-missing",
        action="store_true",
        help="След обработката изтрива от БД и диска секциите за източници, "
             "които вече не съществуват във входната директория (само в дадения prefix)",
    )
    opts = cli.parse_args()

    # Checked in this order: API key first, then connection string.
    required = (
        (opts.api_key,
         "Грешка: липсва Anthropic API ключ (--api-key или ANTHROPIC_API_KEY)."),
        (opts.conn,
         "Грешка: липсва SQL Server connection string (--conn или HELP_DB_CONN)."),
    )
    for value, message in required:
        if not value:
            sys.exit(message)

    process_directory(
        input_dir=Path(opts.input_dir),
        output_dir=Path(opts.output_dir),
        conn_str=opts.conn,
        api_key=opts.api_key,
        prefix=opts.prefix,
        force=opts.force,
        purge_missing=opts.purge_missing,
    )
# The connection string comes from the environment only (HELP_DB_CONN, loaded
# from the git-ignored .env). No credentials may be embedded here.
# NOTE: the server login that used to be the hard-coded default was committed
# to version control and should be rotated.
CONN_STR = os.getenv("HELP_DB_CONN")
CHANGES_FILE = Path(__file__).parent / "keywords_changes.json"


def main():
    """Apply the keyword edits from ``keywords_changes.json`` to the database.

    Each entry of the JSON list is expected to carry ``code`` and ``keywords``;
    entries without a code are ignored. Every entry is applied with its own
    UPDATE, failures are counted and reported, and the transaction is committed
    once at the end. Afterwards the changes file is renamed to a timestamped
    archive name.

    Returns early (without touching the database) when the changes file is
    missing or empty. Exits with an error message when ``HELP_DB_CONN`` is
    not set.
    """
    if not CHANGES_FILE.exists():
        print("Файлът keywords_changes.json не е намерен.")
        print("Запази промените от браузъра първо.")
        return

    changes = json.loads(CHANGES_FILE.read_text(encoding="utf-8"))
    if not changes:
        print("Няма промени за запис.")
        return

    if not CONN_STR:
        sys.exit("Грешка: липсва SQL Server connection string (HELP_DB_CONN).")

    print(f"Записвам {len(changes)} промени в БД...")
    conn = pyodbc.connect(CONN_STR, autocommit=False)
    ok, err = 0, 0
    try:
        cur = conn.cursor()
        for item in changes:
            code = item.get("code", "").strip()
            keywords = item.get("keywords", "").strip()
            if not code:
                continue
            try:
                # NOTE(review): the table name is fixed to the RIP prefix;
                # sections stored under another prefix presumably live in a
                # different table -- confirm.
                cur.execute(
                    "UPDATE RIP_help_sections SET keywords=?, updated_at=GETDATE() WHERE code=?",
                    keywords, code
                )
                if cur.rowcount > 0:
                    ok += 1
                    print(f" ✓ {code}")
                else:
                    print(f" ? {code} — не е намерен в БД")
            except Exception as e:
                # Best effort per entry: report it and continue with the rest.
                print(f" ✗ {code} — {e}")
                err += 1
        conn.commit()
    finally:
        # Release the connection even when commit or an unexpected error fails.
        conn.close()

    print(f"\nГотово: {ok} записани, {err} грешки.")

    # Archive the processed file so the same edits are not applied twice.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    archive = CHANGES_FILE.parent / f"keywords_changes_{ts}.json"
    CHANGES_FILE.rename(archive)
    print(f"Файлът е архивиран като: {archive.name}")