import subprocess, os, csv, re, html, concurrent.futures

# Text file listing the URLs to fetch, and the directory that receives the
# downloaded HTML (created on first use).
SRC = "/opt/automator/cinik-rponse/files/raw/cinik_to_fetch.txt"
OUTDIR = "/tmp/cinik_fetch_2"
os.makedirs(OUTDIR, exist_ok=True)

# Keep only the non-blank lines, trimmed of surrounding whitespace.
with open(SRC) as src_file:
    lines = [text for text in (raw.strip() for raw in src_file) if text]

# A line is either "<number> <url>" (the URL part is taken) or anything that
# starts with "http" (taken whole); every other line is ignored.
urls = []
for entry in lines:
    numbered = re.match(r'^\d+\s+(https?://\S+)$', entry)
    if numbered:
        urls.append(numbered.group(1))
    elif entry.startswith("http"):
        urls.append(entry)

# This script handles the URLs at positions 2, 7, 12, ... of the full list.
slice_urls = urls[2::5]

# Browser-like User-Agent string passed to curl.
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"

def fetch(u):
    """Download *u* through the r.jina.ai proxy with curl and save the body.

    The body is written to a file in OUTDIR whose name is derived from the
    URL (non-alphanumeric runs replaced by "_", last 120 characters kept).
    Up to two attempts are made; the loop stops early once curl reports
    HTTP 200 and the saved file is larger than 500 bytes.

    Args:
        u: The URL to fetch.

    Returns:
        A tuple ``(u, fn, status)`` where ``fn`` is the output path and
        ``status`` is the last three characters of curl's ``%{http_code}``
        output, ``"TIMEOUT"`` if the curl process exceeded the 50 s limit,
        or ``"ERR"`` if curl printed nothing, could not be started, or the
        output file could not be inspected.
    """
    fn = os.path.join(OUTDIR, re.sub(r'[^a-zA-Z0-9]+', '_', u)[-120:] + ".html")
    cmd = ["curl", "-s", "--max-time", "30", "-H", "X-Return-Format: html", "-A", UA,
           "-o", fn, "-w", "%{http_code}", "https://r.jina.ai/" + u]
    status = "ERR"
    for _ in range(2):
        try:
            # Remove whatever an earlier run or attempt left behind, so a
            # failed download can never be parsed later as if it were fresh.
            if os.path.exists(fn):
                os.remove(fn)
            r = subprocess.run(cmd, capture_output=True, text=True, timeout=50)
            code = r.stdout.strip()
            status = code[-3:] if code else "ERR"
            if status == "200" and os.path.getsize(fn) > 500:
                break
        except subprocess.TimeoutExpired:
            status = "TIMEOUT"
        except OSError:
            # curl could not be launched, or the output file is missing or
            # unreadable; this is not a timeout, so do not report it as one.
            status = "ERR"
    return (u, fn, status)

# Fetch the slice with 8 worker threads and index the outcome by URL:
# results[url] == (saved_file_path, status).
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
    results = {
        url: (path, code)
        for url, path, code in pool.map(fetch, slice_urls)
    }

def extract(fnhtml):
    """Pull title, first <h1>, meta description and og:title from an HTML file.

    The file is decoded as UTF-8 with undecodable bytes dropped. Matching is
    case-insensitive and regex-based; HTML entities in the captured text are
    unescaped.

    Args:
        fnhtml: Path of the HTML file to read.

    Returns:
        A tuple ``(title, h1, meta, og)`` of strings. A field that is not
        found is ``""``; if the file cannot be opened all four are ``""``.
        ``title`` and ``h1`` have whitespace runs collapsed to one space, and
        tags nested inside the ``<h1>`` are replaced by spaces.
    """
    try:
        with open(fnhtml, encoding="utf-8", errors="ignore") as f:
            h = f.read()
    except (OSError, ValueError):
        # Missing/unreadable file or an invalid path: report "nothing found"
        # without swallowing KeyboardInterrupt/SystemExit as a bare except would.
        return ("", "", "", "")

    def grab(pat):
        # Return the first group that took part in the match, unescaped.
        m = re.search(pat, h, re.I | re.S)
        if not m:
            return ""
        text = next((g for g in m.groups() if g is not None), "")
        return html.unescape(text).strip()

    # An attribute value ends at the SAME quote character that opened it, so
    # content="It's ..." is not cut at the apostrophe, and the capture cannot
    # run past the closing quote into a following tag.
    content = r'content=(?:"([^"]*)"|\'([^\']*)\')'

    def grab_meta(attr, value):
        # Accept both attribute orders: key before content, or content before key.
        key = attr + r'=["\']' + re.escape(value) + r'["\']'
        return (grab(r'<meta[^>]+' + key + r'[^>]*' + content)
                or grab(r'<meta[^>]+' + content + r'[^>]*' + key))

    title = grab(r'<title[^>]*>(.*?)</title>')
    title = re.sub(r'\s+', ' ', title).strip()

    h1 = grab(r'<h1[^>]*>(.*?)</h1>')
    h1 = re.sub(r'<[^>]+>', ' ', h1)
    h1 = re.sub(r'\s+', ' ', h1).strip()

    meta = grab_meta("name", "description")
    og = grab_meta("property", "og:title")
    return (title, h1, meta, og)

# Lower-case substrings that mark a title segment as site branding.
BRAND_MARKERS = ["cinik","dr cinik","hair transplant clinic","emrahcinik","dr. cinik"]


def derive(title, h1):
    """Derive a lower-case keyword phrase from the page title (or the h1).

    The title is used when non-empty, otherwise the h1. The text is split on
    "|", "-", en dash or em dash when the separator has whitespace on both
    sides. If that yields several segments, only the segments before the
    first one containing a brand marker are kept and joined with spaces;
    when the very first segment is already a brand segment, that first
    segment is used as-is.

    Args:
        title: Page title text (may be empty).
        h1: First heading text, used only if ``title`` is empty.

    Returns:
        The phrase with whitespace collapsed and lower-cased, or ``""`` when
        both inputs are empty.
    """
    source = title or h1
    if not source:
        return ""

    segments = re.split(r'\s+[\|\-–—]\s+', source)
    if len(segments) > 1:
        # Position of the first brand segment; len(segments) when there is none.
        cut = next(
            (
                idx
                for idx, seg in enumerate(segments)
                if any(marker in seg.lower() for marker in BRAND_MARKERS)
            ),
            len(segments),
        )
        leading = segments[:cut]
        if leading:
            source = " ".join(leading).strip()
        else:
            source = segments[0].strip()

    return re.sub(r'\s+',' ', source).strip().lower()

# Build one report row per URL in the slice. A URL counts as OK only when the
# fetch status is "200" and the saved page produced a title or an h1.
rows = []
ok = 0
fail = 0
for url in slice_urls:
    path, code = results[url]
    title, h1, meta, _og = extract(path)
    if code == "200" and (title or h1):
        ok += 1
    else:
        fail += 1
    rows.append([url, code, title, h1, meta, derive(title, h1)])

# Write the report as CSV: header line first, then the rows in slice order.
OUTCSV = "/opt/automator/cinik-rponse/files/raw/cinik_fetched_2.csv"
with open(OUTCSV, "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["url","http_status","title","h1","meta_description","derived_keyword"])
    writer.writerows(rows)

# Console summary: totals, then one line per row with the title cut to 70 chars.
print("SLICE", len(slice_urls), "OK", ok, "FAIL", fail)
for row in rows:
    print("ROW|", row[0], "|STATUS|", row[1], "|TITLE|", row[2][:70], "|DK|", row[5])