import re
import math
from collections import Counter

# Path of the dump file the script opens and analyses.
FILE = "rphost.exe_16460.dmp"

# Width in bytes of one first-pass window (1 MiB).
CHUNK_SIZE = 1024 ** 2
# Number of regions shown in the final report; the second pass examines
# three times as many candidates before the final ranking.
TOP_REGIONS = 15

# --- REGEX ---
# All patterns run on raw bytes and target UTF-16LE text, where each ASCII
# character is stored as the character byte followed by 0x00.

# Run of five or more printable ASCII characters.
UTF16_STR = re.compile(rb'(?:[\x20-\x7E]\x00){5,}')
# DDDD-DD-DD or DDDD.DD.DD (digits only, values are not range-checked).
# The separator is a UTF-16LE code unit as well, so it must be followed by
# its own 0x00 byte; without it a genuine UTF-16LE date can never match.
DATE_UTF16 = re.compile(rb'(?:\d\x00){4}[-.]\x00(?:\d\x00){2}[-.]\x00(?:\d\x00){2}')
# Eight consecutive digits (e.g. YYYYMMDD); any 8-digit run qualifies.
DATE_COMPACT = re.compile(rb'(?:\d\x00){8}')


# --- HELPERS ---

def entropy(chunk):
    """Return the Shannon entropy of `chunk` in bits per element.

    Elements are tallied with Counter, so for a bytes object the result
    lies between 0 (one repeated value) and 8 (all 256 values equally
    frequent). An empty input yields 0.
    """
    size = len(chunk)
    if size == 0:
        return 0
    # Relative frequency of each distinct element, in first-seen order.
    probabilities = [occurrences / size for occurrences in Counter(chunk).values()]
    return -sum(p * math.log2(p) for p in probabilities)


def extract_strings(region):
    """Return the UTF16_STR matches found in `region` as decoded text.

    Each non-overlapping match of the module-level UTF16_STR pattern is
    decoded as UTF-16LE (undecodable bytes are dropped) and the results
    are returned in the order they occur in `region`.
    """
    return [
        hit.group().decode('utf-16le', errors='ignore')
        for hit in UTF16_STR.finditer(region)
    ]


def count_dates_raw(region):
    """Count date-like byte sequences in `region`.

    The result is the number of non-overlapping DATE_UTF16 matches plus
    the number of non-overlapping DATE_COMPACT matches, searched directly
    in the raw bytes without decoding them first.
    """
    patterns = (DATE_UTF16, DATE_COMPACT)
    return sum(len(pattern.findall(region)) for pattern in patterns)


def score_region(text_density, data_density, entropy_val, repeat_score, date_count):
    """Combine the per-region metrics into one ranking score.

    The score is a plain weighted sum: both densities count for half,
    entropy is divided by 10, and the repeat score and date count are
    each divided by 1000. Larger values rank higher.
    """
    # Terms are accumulated in a fixed left-to-right order.
    total = text_density * 0.5
    total += data_density * 0.5
    total += entropy_val / 10
    total += repeat_score / 1000
    total += date_count / 1000
    return total


print("Loading dump...")
# Read the complete dump into memory as one bytes object; the later
# passes slice it by offset.
with open(FILE, mode="rb") as dump_file:
    data = dump_file.read()

print("Scanning chunks...")

regions = []

# --- FIRST PASS ---
# Walk the dump in CHUNK_SIZE windows and record cheap per-window metrics.
for i in range(0, len(data), CHUNK_SIZE):
    # range() stops before len(data), so every slice holds at least one
    # byte and no empty-chunk guard is needed.
    chunk = data[i:i+CHUNK_SIZE]

    # TEXT density: share of 2-byte cells (aligned to the chunk start) that
    # hold a printable ASCII byte followed by 0x00. zip() stops at the
    # shorter slice, so an unpaired trailing byte is ignored.
    text_count = sum(
        1 for low, high in zip(chunk[0::2], chunk[1::2])
        if high == 0 and 32 <= low <= 126
    )
    text_density = text_count / (len(chunk)/2)

    # DATA density: fraction of non-zero bytes. bytes.count() does the scan
    # in C instead of a per-byte Python loop.
    non_zero = len(chunk) - chunk.count(0)
    data_density = non_zero / len(chunk)

    # ENTROPY
    e = entropy(chunk)

    # DATE count, taken on the raw bytes of this chunk only; a date that
    # straddles two chunks is not counted.
    date_count = count_dates_raw(chunk)

    regions.append({
        "offset": i,
        "text_density": text_density,
        "data_density": data_density,
        "entropy": e,
        "date_count": date_count
    })

print("Ranking regions...")

# --- SECOND PASS (in-depth analysis of the TOP regions) ---
# Pre-rank every chunk by a rough sum of its metrics so that only the most
# promising ones go through string extraction.
regions.sort(
    key=lambda reg: reg["data_density"] + reg["entropy"] + reg["date_count"],
    reverse=True,
)

shortlist = regions[:3 * TOP_REGIONS]
final = []

for candidate in shortlist:
    start = candidate["offset"]

    # Strings are taken from a window two chunk lengths wide starting at
    # the chunk offset; the density/entropy/date metrics still describe
    # the first chunk only.
    strings = extract_strings(data[start:start + 2 * CHUNK_SIZE])

    # Tally only short strings (5..25 characters).
    counter = Counter(text for text in strings if 5 <= len(text) <= 25)

    # Total occurrences of the strings that appear more than 10 times.
    repeat_score = sum(times for times in counter.values() if times > 10)

    final.append({
        "offset": start,
        "score": score_region(
            candidate["text_density"],
            candidate["data_density"],
            candidate["entropy"],
            repeat_score,
            candidate["date_count"],
        ),
        "strings": strings,
        "counter": counter,
        "repeat_score": repeat_score,
        "date_count": candidate["date_count"],
        "text_density": candidate["text_density"],
        "data_density": candidate["data_density"],
        "entropy": candidate["entropy"],
    })

# --- SORT FINAL ---
# Best-scoring regions first.
final.sort(key=lambda entry: entry["score"], reverse=True)

print("\n=== FINAL ANALYSIS ===")

for r in final[:TOP_REGIONS]:
    print("\n====================================")
    print(f"Region: {hex(r['offset'])}")
    print(f"Score: {r['score']:.2f}")
    print(f"text={r['text_density']:.2f} data={r['data_density']:.2f} entropy={r['entropy']:.2f}")
    print(f"repeat_score={r['repeat_score']} date_count={r['date_count']}")

    # --- TOP LONG ---
    # Five longest extracted strings, each clipped to 120 characters.
    print("\nTop long strings:")
    longest = sorted(r["strings"], key=len, reverse=True)[:5]
    for s in longest:
        print(f"  len={len(s)}: {s[:120]}")

    # --- REPEATS ---
    # Up to ten most frequent short strings, shown only above 5 occurrences.
    print("\nTop repeated:")
    for s, c in r["counter"].most_common(10):
        if c <= 5:
            continue
        print(f"  {c}x: {s}")

    # --- HYPOTHESIS ---
    # The first matching rule wins; the checks run from most to least specific.
    print("\nHypothesis:")

    if r["date_count"] > 1000:
        verdict = "  >>> LIKELY: correspondence / table with dates "
    elif r["repeat_score"] > 500:
        verdict = "  >>> LIKELY: mapping / repeated values "
    elif r["text_density"] > 0.7:
        verdict = "  text blob (XML / UI / resources)"
    elif r["data_density"] > 0.95:
        verdict = "  binary structure (table / object)"
    else:
        verdict = "  mixed / unclear"
    print(verdict)