#!/usr/bin/env python3
import csv, html, os, re, json

# URLs to report on: one per line of the list file, surrounding whitespace
# removed and blank lines skipped. The file is closed as soon as it is read.
with open('/tmp/cinik_fetch_4/urls.txt') as _urls_file:
    URLS = [line.strip() for line in _urls_file if line.strip()]

# Every HTTP status code observed for each URL, accumulated from the status
# log files passed to load_status(). A URL may appear in more than one log, so
# each URL maps to a list of codes in the order they were read.
STATUS_ALL = {}  # url -> list of observed status codes (strings)


def load_status(path):
    """Append the status codes recorded in *path* to ``STATUS_ALL``.

    Each useful line is whitespace-separated and starts with a numeric status
    code followed by a URL beginning with ``http``; any further fields are
    ignored. Lines that do not have that shape are skipped. A missing file is
    not an error: the function returns without changing anything.
    """
    if not os.path.exists(path):
        return
    # Use a context manager so the log file is closed deterministically.
    with open(path) as status_file:
        for line in status_file:
            parts = line.split()
            if len(parts) >= 2 and parts[0].isdigit() and parts[1].startswith('http'):
                STATUS_ALL.setdefault(parts[1], []).append(parts[0])

# Read the initial fetch log first and the retry log second, so the codes
# stored for a URL are in that order.
for _status_path in (
    '/tmp/cinik_fetch_4/fetch_status.txt',
    '/tmp/cinik_fetch_4/retry_status.txt',
):
    load_status(_status_path)

def best_status(url):
    """Pick one representative status code for *url* from ``STATUS_ALL``.

    Preference order: the first 2xx code seen, otherwise the first 3xx code,
    otherwise the first code that is not ``'000'``, otherwise the first code
    recorded. Returns ``''`` when nothing was recorded for the URL.
    """
    observed = STATUS_ALL.get(url, [])
    if not observed:
        return ''

    successes = [code for code in observed if code.startswith('2')]
    if successes:
        return successes[0]

    redirects = [code for code in observed if code.startswith('3')]
    if redirects:
        return redirects[0]

    non_placeholder = [code for code in observed if code != '000']
    if non_placeholder:
        return non_placeholder[0]

    # Only '000' entries were recorded.
    return observed[0]

# One representative status code per URL that appeared in the status logs.
STATUS_OVERRIDE = dict((seen_url, best_status(seen_url)) for seen_url in STATUS_ALL)

def slugify(url):
    """Flatten a URL into a single path-free token.

    Every occurrence of ``https://emrahcinik.com/`` is removed and each
    remaining ``/`` becomes ``__``.
    """
    without_origin = url.replace('https://emrahcinik.com/', '')
    return '__'.join(without_origin.split('/'))

def extract(tagpat, htmltext, flags=re.I|re.S):
    """Return the first capture group of the first match of *tagpat*.

    The captured text is stripped of leading and trailing whitespace. An empty
    string is returned when the pattern does not match. By default matching is
    case-insensitive and ``.`` also matches newlines.
    """
    found = re.search(tagpat, htmltext, flags)
    if found is None:
        return ''
    return found.group(1).strip()

def clean(s):
    """Reduce an HTML fragment to plain, single-spaced text.

    Tags are removed *before* character references are decoded. Decoding first
    would turn escaped text such as ``&lt;b&gt;`` into ``<b>``, which the tag
    stripper would then delete as if it were markup.

    Returns the text with runs of whitespace (including non-breaking spaces
    produced by ``&nbsp;``) collapsed to single spaces and the ends trimmed.
    """
    s = re.sub(r'<[^>]+>', '', s)        # strip any nested tags
    s = html.unescape(s)                 # decode entities only after real tags are gone
    s = re.sub(r'\s+', ' ', s).strip()
    return s

# One <meta ...> start tag. Quoted attribute values are consumed as a unit, so
# a '>' inside a quoted value does not end the tag early.
_META_TAG_RE = re.compile(r'''<meta(?=[\s/>])((?:"[^"]*"|'[^']*'|[^>"'])*)>''', re.I)
# One attribute with a value: double-quoted, single-quoted, or unquoted.
_META_ATTR_RE = re.compile(
    r'''([^\s"'<>/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))'''
)


def _meta_attrs(tag_body):
    """Map lower-cased attribute names to raw values for one meta tag body."""
    attrs = {}
    for m in _META_ATTR_RE.finditer(tag_body):
        double_quoted, single_quoted, bare = m.group(2, 3, 4)
        if double_quoted is not None:
            value = double_quoted
        elif single_quoted is not None:
            value = single_quoted
        else:
            value = bare
        # If an attribute is repeated, the first occurrence wins.
        attrs.setdefault(m.group(1).lower(), value)
    return attrs


def get_meta(htmltext, name=None, prop=None):
    """Return the cleaned ``content`` of a ``<meta>`` tag, or ``''``.

    Args:
        htmltext: HTML source to search.
        name: value of the ``name`` attribute to look for (e.g. ``description``).
        prop: value of the ``property`` attribute to look for (e.g. ``og:title``).

    A tag selected by *name* is preferred over one selected by *prop*. Within
    each, the first tag in document order that also has a ``content`` attribute
    wins. Attribute names and the looked-up value are compared
    case-insensitively, and attribute order inside the tag does not matter.

    Each tag's attributes are parsed individually, so the value is delimited by
    its own opening quote character (an apostrophe inside a double-quoted value
    is kept) and a match can never span more than one tag.
    """
    wanted = []
    if name:
        wanted.append(('name', name.lower()))
    if prop:
        wanted.append(('property', prop.lower()))
    if not wanted:
        return ''

    tags = [_meta_attrs(m.group(1)) for m in _META_TAG_RE.finditer(htmltext)]
    for key, target in wanted:
        for attrs in tags:
            if 'content' in attrs and attrs.get(key, '').lower() == target:
                return clean(attrs['content'])
    return ''

# Lower-case fragments that mark a title segment as site branding.
BRAND_MARKERS = ['cinik', 'dr cinik', 'hair transplant clinic', 'emrah cinik', 'hair clinic', 'hairclinic']
# Title separators, each including its surrounding spaces.
SEPS = [' | ', ' - ', ' — ', ' – ', ' • ']


def derive_keyword(title, h1):
    """Derive a lower-case keyword phrase from a page title, or from its H1.

    The title is used when it is non-blank, otherwise the H1. The text is cut
    at the earliest separator from ``SEPS`` whose right-hand side contains any
    of ``BRAND_MARKERS`` (compared case-insensitively), which drops the brand
    suffix and everything after that separator. The result has whitespace
    collapsed and is lower-cased.

    Args:
        title: page title text; ``None`` is treated as empty.
        h1: first heading text, used only when the title is blank; ``None`` is
            treated as empty.

    Returns:
        The keyword phrase, or ``''`` when both inputs are blank.
    """
    base = (title or '').strip() or (h1 or '').strip()

    cut_points = []
    for sep in SEPS:
        # Only the first occurrence needs checking: the text after any later
        # occurrence is a suffix of the text after the first one, so a marker
        # found later would also be found here.
        pos = base.find(sep)
        if pos == -1:
            continue
        tail = base[pos + len(sep):].lower()
        if any(marker in tail for marker in BRAND_MARKERS):
            cut_points.append(pos)

    if cut_points:
        base = base[:min(cut_points)]

    return re.sub(r'\s+', ' ', base).strip().lower()

# Build one CSV row per URL from the HTML files saved under /tmp/cinik_fetch_4.
rows = []
ok = 0       # pages that yielded a title or an h1
failed = 0   # pages with no usable HTML, or with neither title nor h1
for url in URLS:
    slug = slugify(url)
    path = f'/tmp/cinik_fetch_4/{slug}.html'
    status = STATUS_OVERRIDE.get(url, '')
    title = h1 = meta_desc = ''
    htmltext = ''
    # Saved files of 200 bytes or less are treated as if nothing was fetched.
    if os.path.exists(path) and os.path.getsize(path) > 200:
        # errors='replace' keeps undecodable bytes from aborting the whole run.
        with open(path, encoding='utf-8', errors='replace') as page:
            htmltext = page.read()
    # Only parse content that looks like an HTML document.
    lowered = htmltext.lower()
    if '<html' in lowered or '<title' in lowered:
        title = clean(extract(r'<title[^>]*>(.*?)</title>', htmltext))
        h1 = clean(extract(r'<h1[^>]*>(.*?)</h1>', htmltext))
        meta_desc = get_meta(htmltext, name='description')
    # consider it ok if we got a title or h1
    if title or h1:
        ok += 1
        # A parsed page with no recorded status (or only '000') is reported as 200.
        if not status or status == '000':
            status = '200'
    else:
        failed += 1
        if not status:
            status = '000'
    derived = derive_keyword(title, h1)
    rows.append([url, status, title, h1, meta_desc, derived])

# newline='' lets the csv module control line endings itself.
with open('/opt/automator/cinik-rponse/files/raw/cinik_fetched_4.csv','w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(['url','http_status','title','h1','meta_description','derived_keyword'])
    w.writerows(rows)

# Console summary: totals, then status | url -> derived keyword for each row.
print('OK pages:', ok, 'FAILED:', failed)
for r in rows:
    print(r[1], '|', r[0], '->', repr(r[5]))
