import re, html, csv

# Load the URL map: every non-blank line is split on tabs into one record.
# The row-building loop later in this script unpacks each record as
# (url, file stem), so each line is expected to hold exactly two fields.
with open('/tmp/cinik_fetch_0/urlmap.tsv') as f:
    pairs = []
    for line in f:
        if not line.strip():
            continue
        pairs.append(line.rstrip('\n').split('\t'))

# load statuses
# Maps the first whitespace-separated field of each log line to the second.
# Lines with fewer than three fields are ignored.
status = {}
with open('/tmp/cinik_fetch_0/status.log') as f:
    for fields in map(str.split, f):
        if len(fields) < 3:
            continue
        status[fields[0]] = fields[1]

def clean(s):
    """Return *s* with HTML entities decoded and whitespace normalised.

    ``None`` yields an empty string. Every run of whitespace (including
    newlines and the non-breaking spaces produced by entity decoding)
    collapses to a single space, and the result is stripped at both ends.
    """
    if s is None:
        return ''
    decoded = html.unescape(s)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()

def get_title(doc):
    """Return the cleaned text of the first ``<title>`` element in *doc*.

    Matching is case-insensitive and spans newlines. Markup nested inside
    the title is not removed. Returns '' when no title element is found.
    """
    match = re.search(r'<title[^>]*>(.*?)</title>', doc, re.I | re.S)
    if not match:
        return ''
    return clean(match.group(1))

def get_h1(doc):
    """Return the cleaned visible text of the first ``<h1>`` element in *doc*.

    Tags nested inside the heading are replaced by a space before cleaning,
    so text on either side of an inline tag does not run together.
    Returns '' when the document has no ``<h1>``.
    """
    found = re.search(r'<h1[^>]*>(.*?)</h1>', doc, re.I | re.S)
    if found:
        text_only = re.sub(r'<[^>]+>', ' ', found.group(1))
        return clean(text_only)
    return ''

def _meta_attr(tag, attr):
    """Return the raw value of the quoted attribute *attr* in *tag*, or None.

    The closing quote must be the same character as the opening one, so an
    apostrophe inside a double-quoted value (or the reverse) no longer ends
    the value early. The lookbehind stops ``name`` or ``content`` from
    matching the tail of a longer attribute such as ``data-name``.
    """
    found = re.search(
        r'(?<![\w-])' + re.escape(attr) + r'\s*=\s*(["\'])(.*?)\1',
        tag,
        re.I | re.S,
    )
    return found.group(2) if found else None


def get_meta(doc, name=None, prop=None):
    """Return the cleaned ``content`` of the first matching ``<meta>`` tag.

    Args:
        doc: HTML source to scan.
        name: value to match against the tag's ``name`` attribute
            (case-insensitive); skipped when falsy.
        prop: value to match against the tag's ``property`` attribute
            (case-insensitive); skipped when falsy.

    Returns:
        The cleaned content string of the first tag that matches and has a
        quoted ``content`` attribute, or '' when there is none. Within one
        tag, ``name`` is checked before ``prop``.

    Note:
        Tags are delimited by the first ``>``, so a literal ``>`` inside an
        attribute value still cuts the tag short.
    """
    criteria = [(attr, wanted.lower())
                for attr, wanted in (('name', name), ('property', prop))
                if wanted]
    for m in re.finditer(r'<meta\b[^>]*>', doc, re.I):
        tag = m.group(0)
        for attr, wanted in criteria:
            value = _meta_attr(tag, attr)
            if value is None or value.lower() != wanted:
                continue
            content = _meta_attr(tag, 'content')
            if content is not None:
                return clean(content)
    return ''

def get_yoast_focus(doc):
    """Return a focus keyword embedded in *doc* as a JSON-like pair, or ''.

    Looks anywhere in the document for a quoted key ``focuskw``,
    ``focus_kw`` or ``focus_keyword`` (case-insensitive) followed by a colon
    and a quoted value on the same line. Meta tags are not inspected.
    Such a pair is rarely present.
    """
    # The value must close with the quote character that opened it, so an
    # apostrophe inside a double-quoted value does not truncate the keyword.
    m = re.search(
        r'["\'](?:focus_?kw|focus_keyword)["\']\s*:\s*(["\'])(.*?)\1',
        doc,
        re.I,
    )
    if m:
        return clean(m.group(2))
    return ''

# brand suffix separators: a title is split wherever one of these characters
# stands alone between whitespace (e.g. "Keyword | Brand").
SEPS = [' | ', ' - ', ' — ', ' – ', ' · ']
# Lower-case substrings; a title segment containing any of them is treated
# as brand boilerplate rather than keyword text.
BRAND_MARKERS = ['cinik', 'dr cinik', 'dr. cinik', 'dr cink', 'dr. cink', 'cink', 'emrah', 'hair transplant clinic', 'emrah cinik', 'hair clinic']

# Built once from SEPS so the separator list and the splitter cannot drift
# apart. Any amount of whitespace is accepted on either side of a separator.
_SEP_RE = re.compile(
    r'\s+(?:%s)\s+' % '|'.join(re.escape(sep.strip()) for sep in SEPS)
)


def _has_brand(segment):
    """Return True if *segment* contains any BRAND_MARKERS entry, ignoring case."""
    low = segment.lower()
    return any(marker in low for marker in BRAND_MARKERS)


def derive(title, h1):
    """Derive a lower-case keyword phrase from a page title (or its H1).

    Args:
        title: page title; used when non-empty.
        h1: fallback text used when *title* is empty.

    Returns:
        '' when both inputs are blank. Otherwise the text is split on the
        SEPS separators and the first run of brand-free segments is joined
        with spaces, whitespace-collapsed and lower-cased. Brand segments
        at the start of the title are skipped, so "Brand | Keyword" yields
        the keyword rather than the brand. When every segment carries a
        brand marker the first segment is used. Text without a separator is
        returned whole, even if it contains a brand marker.
    """
    base = (title if title else h1).strip()
    if not base:
        return ''
    segments = _SEP_RE.split(base)
    if len(segments) > 1:
        kept = []
        for seg in segments:
            if _has_brand(seg):
                if kept:
                    # Brand segment after keyword text: the keyword ends here.
                    break
                # Brand prefix before any keyword text: skip it.
                continue
            kept.append(seg)
        # all segments had brand; just take first
        base = ' '.join(kept) if kept else segments[0]
    return re.sub(r'\s+', ' ', base).strip().lower()

# Build one record per mapped URL. A page whose HTML file is missing gets a
# short six-field row (URL and status only); every other page gets eight
# fields, the last two (og:title, focus keyword) being diagnostic extras.
rows = []
for url, h in pairs:
    st = status.get(h, '')
    path = '/tmp/cinik_fetch_0/%s.html' % h
    try:
        with open(path, encoding='utf-8', errors='replace') as fh:
            doc = fh.read()
    except FileNotFoundError:
        rows.append([url, st] + [''] * 4)
        continue
    title, h1 = get_title(doc), get_h1(doc)
    meta_desc = get_meta(doc, name='description')
    og_title = get_meta(doc, prop='og:title')
    yoast = get_yoast_focus(doc)
    dk = derive(title, h1)
    rows.append([url, st, title, h1, meta_desc, dk, og_title, yoast])

# write CSV (only the 6 required columns)
# Rows may carry two extra diagnostic fields; slicing drops them.
with open('/opt/automator/cinik-rponse/files/raw/cinik_fetched_0.csv', 'w', newline='', encoding='utf-8') as out:
    w = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
    w.writerow(['url','http_status','title','h1','meta_description','derived_keyword'])
    w.writerows(r[:6] for r in rows)

# print diagnostic
# One labelled line per field, then a blank line between records. Short rows
# (pages without a saved HTML file) are padded so the og:title and focus
# keyword lines print as empty.
for r in rows:
    padded = r + [''] * (8 - len(r))
    for label, idx in (
        ('URL:', 0),
        ('  status:', 1),
        ('  title :', 2),
        ('  h1    :', 3),
        ('  ogt   :', 6),
        ('  yoast :', 7),
        ('  DERIV :', 5),
    ):
        print(label, padded[idx])
    print()
