"""Тип	Идентификатор
Массив	51e7a0d2-530b-11d4-b98a-008048da3034
Структура	4238019d-7e49-4fc9-91db-b6b951d5cf8e
Соответствие	3d48feae-a9c6-4c5a-a099-9eb6477630c6
СписокЗначений	4772b3b4-f4a3-49c0-a1a5-8cb5961511a3
ТаблицаЗначений	acf6192e-81ca-46ef-93a6-5a6968b78663
ДеревоЗначений	e603c0f2-92fb-4d47-8f38-a44a381cf235
ФиксированныйМассив	4500381b-db30-4a10-9db4-990038032acf
ФиксированнаяСтруктура	3ee983d7-ace7-40f9-bb7e-2e916fcddd56
ФиксированноеСоответствие	220455ea-6c85-4513-996f-bbe79ed07774

Источник: https://infostart.ru/1c/articles/1116103/

"""
 


import re
import sys
from pathlib import Path
from collections import Counter

# Memory dump to scan and the directory that receives the decoded reports.
DUMP_FILE = "rphost.exe_16460.dmp"
OUT_DIR = Path("decoded_1c_dump")

# Minimum run lengths accepted by the extraction regexes below:
# bytes for raw runs, 2-byte code units for UTF-16LE runs.
MIN_ASCII_RUN = 80
MIN_UTF16_RUN = 40

# Substrings of 1C code / query language; score_1c() awards points for each
# one that occurs in a candidate text.
KEYWORDS_1C = [
    "Процедура",
    "Функция",
    "КонецПроцедуры",
    "КонецФункции",
    "Перем",
    "Экспорт",
    "Если",
    "Тогда",
    "Иначе",
    "КонецЕсли",
    "Для Каждого",
    "Цикл",
    "КонецЦикла",
    "Пока",
    "Новый Запрос",
    "Запрос.Текст",
    "ВЫБРАТЬ",
    "ИЗ",
    "ГДЕ",
    "ПараметрыСеанса",
    "Соответствие",
    "ФиксированноеСоответствие",
    "ТаблицаЗначений",
    "РезультатЗапроса",
    "Выборка",
]

# printable-ish bytes for raw text extraction: tab/LF/CR, printable ASCII and
# every high byte (0x80-0xFF), so UTF-8 and cp1251 text both survive.
RAW_ASCII_RE = re.compile(rb"[\x09\x0A\x0D\x20-\x7E\x80-\xFF]{%d,}" % MIN_ASCII_RUN)

# UTF-16LE runs. Each code unit is either
#   * a printable Latin-1 character (low byte printable, high byte 0x00), or
#   * any character of the Cyrillic block U+0400-U+04FF (high byte 0x04).
# Without the second alternative every Cyrillic letter terminates the run, so
# the Cyrillic keywords this script searches for could never be matched here.
UTF16_RE = re.compile(
    rb"(?:[\x09\x0A\x0D\x20-\x7E\x80-\xFF]\x00|[\x00-\xFF]\x04){%d,}" % MIN_UTF16_RUN
)

# Long base64-looking runs (80+ alphabet characters, optional '=' padding).
BASE64_RE = re.compile(
    r"(?:[A-Za-z0-9+/]{80,}={0,2})"
)

def ensure_stdout_utf8():
    """Try to switch ``sys.stdout`` to UTF-8 output.

    This is best effort: if the stream cannot be reconfigured for any reason,
    the error is ignored and stdout is left as it was.
    """
    try:
        reconfigure = sys.stdout.reconfigure
        reconfigure(encoding="utf-8")
    except Exception:
        # Deliberately ignored; the script keeps running with the existing stream.
        return None

def safe_text(s: str) -> str:
    """Return *s* with every NUL character (``\\x00``) removed."""
    pieces = s.split("\x00")
    return "".join(pieces)

def fix_cp1251_mojibake(text: str) -> str:
    """Undo UTF-8-read-as-cp1251 mojibake, if *text* is such mojibake.

    Turns 'РџРѕРєР°' into 'Пока' when source was UTF-8 bytes misread as cp1251.

    The text is re-encoded as cp1251 and the resulting bytes are decoded as
    UTF-8, both strictly. If either step fails the text was not (purely)
    mojibake of this kind and it is returned unchanged.
    """
    try:
        return text.encode("cp1251", errors="strict").decode("utf-8", errors="strict")
    except UnicodeError:
        # UnicodeEncodeError: a character has no cp1251 byte.
        # UnicodeDecodeError: the cp1251 bytes are not valid UTF-8.
        return text

def score_1c(text: str) -> int:
    """Return a heuristic score of how much *text* resembles 1C code or queries.

    Every entry of ``KEYWORDS_1C`` that occurs in *text* is worth 10 points.
    On top of that, each occurrence of a few strong markers adds a
    per-occurrence bonus (the weights are listed below).
    """
    presence_points = 10 * sum(1 for keyword in KEYWORDS_1C if keyword in text)

    # (marker, points per occurrence)
    occurrence_weights = (
        ("Процедура", 20),
        ("Функция", 20),
        ("ВЫБРАТЬ", 10),
        ("Пока", 5),
        ("Цикл", 5),
        ("ПараметрыСеанса", 10),
        ("Соответствие", 10),
    )
    occurrence_points = sum(
        text.count(marker) * weight for marker, weight in occurrence_weights
    )
    return presence_points + occurrence_points

def looks_interesting(text: str) -> bool:
    """Return True if the lower-cased *text* contains any known marker.

    Markers cover 1C code keywords, query keywords, 1C object-name prefixes
    and a few generic metadata hints.
    """
    haystack = text.lower()
    markers = (
        # 1C code
        "процедура", "функция", "конецпроцедуры", "конецфункции",
        # queries and session parameters
        "новый запрос", "запрос.текст", "выбрать", "параметрысеанса",
        # collection types
        "соответствие", "фиксированноесоответствие", "таблицазначений",
        # metadata object prefixes
        "регист", "документ.", "справочник.", "общиймодуль.",
        # generic metadata / serialization hints
        "form", "pattern", "xml", "base64", "http://v8.1c.ru",
    )
    for marker in markers:
        if marker in haystack:
            return True
    return False

def normalize_text(text: str) -> str:
    """Strip NUL characters from *text*, then try to repair cp1251 mojibake."""
    return fix_cp1251_mojibake(safe_text(text))

def extract_candidates(data: bytes):
    """Collect text fragments of *data* that look like 1C code or metadata.

    Two passes are made over the dump:

    1. Every run matched by ``RAW_ASCII_RE`` is decoded as UTF-8, cp1251 and
       latin1 in turn (undecodable bytes are dropped); each decoding that is
       at least 60 characters long after normalization and passes
       ``looks_interesting`` becomes a candidate of kind ``"raw"``.
    2. Every run matched by ``UTF16_RE`` is decoded as UTF-16LE; results of at
       least 40 characters that pass ``looks_interesting`` become candidates
       of kind ``"utf16"``.

    Returns a list of dicts with the keys ``offset`` (start of the run in
    *data*), ``encoding``, ``kind`` and ``text``. The same text may appear
    several times, because no de-duplication is done here.
    """
    # Encodings tried for each raw run, in this order. Defined once rather
    # than rebuilt for every regex match.
    raw_encodings = ("utf-8", "cp1251", "latin1")
    candidates = []

    # 1) raw byte runs, one candidate per encoding that yields a hit
    for m in RAW_ASCII_RE.finditer(data):
        raw = m.group(0)
        offset = m.start()

        for enc_name in raw_encodings:
            text = normalize_text(raw.decode(enc_name, errors="ignore"))
            if len(text) >= 60 and looks_interesting(text):
                candidates.append({
                    "offset": offset,
                    "encoding": enc_name,
                    "kind": "raw",
                    "text": text,
                })

    # 2) utf-16le runs
    for m in UTF16_RE.finditer(data):
        # errors="ignore" never raises, so no exception handling is needed.
        text = normalize_text(m.group(0).decode("utf-16le", errors="ignore"))
        if len(text) >= 40 and looks_interesting(text):
            candidates.append({
                "offset": m.start(),
                "encoding": "utf-16le",
                "kind": "utf16",
                "text": text,
            })

    return candidates

def dedupe_candidates(candidates):
    """Drop candidates that repeat an earlier one, keeping the input order.

    Two candidates count as duplicates when they share the same ``kind`` and
    the same first 500 characters of ``text``. The first one seen is kept.
    """
    first_seen = {}
    for cand in candidates:
        fingerprint = (cand["text"][:500], cand["kind"])
        # setdefault stores only the first candidate per fingerprint; dicts
        # keep insertion order, so the original ordering is preserved.
        first_seen.setdefault(fingerprint, cand)
    return list(first_seen.values())

def _candidate_header(rank, score, cand):
    """Return the one-line description used by the summary and code reports."""
    return (
        f"[{rank}] score={score} offset=0x{cand['offset']:X} "
        f"enc={cand['encoding']} kind={cand['kind']} len={len(cand['text'])}\n"
    )

def _write_summary(path, scored):
    """Write a header and a 300-character one-line preview for each candidate."""
    with path.open("w", encoding="utf-8") as f:
        for rank, (score, cand) in enumerate(scored, start=1):
            preview = cand["text"][:300].replace("\n", "\\n")
            f.write(_candidate_header(rank, score, cand))
            f.write(preview + "\n\n")

def _write_chunks(out_dir, scored):
    """Write the full text of each candidate to its own numbered file."""
    for rank, (score, cand) in enumerate(scored, start=1):
        name = (
            f"{rank:03d}_score{score}_off_0x{cand['offset']:X}"
            f"_{cand['kind']}_{cand['encoding']}.txt"
        )
        with (out_dir / name).open("w", encoding="utf-8") as f:
            f.write(cand["text"])

def _write_code_candidates(path, scored):
    """Write the full text of each candidate into one delimited report file."""
    with path.open("w", encoding="utf-8") as f:
        for rank, (score, cand) in enumerate(scored, start=1):
            f.write("=" * 100 + "\n")
            f.write(_candidate_header(rank, score, cand))
            f.write("-" * 100 + "\n")
            f.write(cand["text"] + "\n\n")

def _write_base64_hits(path, scored):
    """List up to five base64-looking runs (300 chars each) per candidate."""
    with path.open("w", encoding="utf-8") as f:
        for _score, cand in scored:
            hits = BASE64_RE.findall(cand["text"])
            if not hits:
                continue
            f.write(
                f"offset=0x{cand['offset']:X} enc={cand['encoding']} "
                f"kind={cand['kind']} hits={len(hits)}\n"
            )
            for h in hits[:5]:
                f.write(h[:300] + "\n")
            f.write("\n")

def save_outputs(candidates):
    """Rank *candidates* and write the report files into ``OUT_DIR``.

    Candidates are ordered by ``(score_1c(text), len(text))``, highest first.
    Files written:

    * ``00_summary.txt`` - header and preview of the top 200 candidates;
    * one ``NNN_score..._off_0x..._kind_encoding.txt`` file with the full text
      of each of the top 200 candidates;
    * ``01_best_1c_code_candidates.txt`` - full text of up to 100 candidates
      whose score is at least 20;
    * ``02_base64_hits.txt`` - base64-looking runs found in the top 200.

    Returns the full list of candidates in ranked order.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Score every candidate exactly once and carry the score alongside it;
    # the score is needed for sorting, filtering and every report header.
    scored = sorted(
        ((score_1c(c["text"]), c) for c in candidates),
        key=lambda pair: (pair[0], len(pair[1]["text"])),
        reverse=True,
    )
    top = scored[:200]

    _write_summary(OUT_DIR / "00_summary.txt", top)
    _write_chunks(OUT_DIR, top)

    # best guessed code/query only
    code_like = [pair for pair in scored if pair[0] >= 20]
    _write_code_candidates(OUT_DIR / "01_best_1c_code_candidates.txt", code_like[:100])

    # base64 hits inside candidates
    _write_base64_hits(OUT_DIR / "02_base64_hits.txt", top)

    return [cand for _score, cand in scored]

def main():
    """Run the whole pipeline on ``DUMP_FILE`` and print a short report.

    Steps: load the dump, extract candidates, de-duplicate them, write the
    report files and print the 20 best-ranked candidates. If the dump file
    does not exist, a message is printed and nothing else happens.
    """
    ensure_stdout_utf8()

    source = Path(DUMP_FILE)
    if not source.exists():
        print(f"File not found: {source}")
        return

    print(f"Loading {source} ...")
    # The whole dump is read into memory in a single call.
    blob = source.read_bytes()
    print(f"Loaded {len(blob):,} bytes")

    print("Extracting candidates ...")
    found = extract_candidates(blob)
    print(f"Raw candidates: {len(found):,}")

    found = dedupe_candidates(found)
    print(f"Unique candidates: {len(found):,}")

    ranked = save_outputs(found)
    print(f"Saved results to: {OUT_DIR.resolve()}")

    print("\nTop 20:")
    for rank, cand in enumerate(ranked[:20], start=1):
        body = cand["text"]
        headline = (
            f"[{rank}] score={score_1c(body)} off=0x{cand['offset']:X} "
            f"enc={cand['encoding']} kind={cand['kind']} len={len(body)}"
        )
        # Keep the preview on one console line.
        one_line = body[:180].replace("\n", " ")
        print(headline)
        print(f"    {one_line}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
