import subprocess, os, csv, re, html, time

# Directory that receives one downloaded HTML file per fetched URL.
OUTDIR = "/tmp/cinik_fetch_2"

# Desktop Chrome user-agent string handed to curl via -A.
UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0 Safari/537.36"
)

# URLs to fetch again on this run.
retry = [
    "https://emrahcinik.com/probiotics-hair-loss",
    "https://emrahcinik.com/rivaldo",
    "https://emrahcinik.com/spermidine-hair-growth",
    "https://emrahcinik.com/thinning-hair",
    "https://emrahcinik.com/wrong-facts-hair-transplant",
]

def fetch(u, *, attempts=5, delay=8):
    """Download *u* through https://r.jina.ai/ with curl, retrying on failure.

    The response body is written to a file under OUTDIR. The file name is the
    URL with every run of non-alphanumeric characters replaced by "_",
    truncated to its last 120 characters, plus ".html".

    Args:
        u: Target URL; it is appended to "https://r.jina.ai/".
        attempts: Maximum number of curl invocations.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        A tuple ``(u, fn, status)`` where ``fn`` is the output path and
        ``status`` is one of:

        * the last three characters of curl's ``%{http_code}`` output,
        * ``"TIMEOUT"`` when the curl process exceeded the 60 s limit,
        * ``"ERR"`` when curl printed nothing, could not be started, or
          exited non-zero after reporting 200 (the body is then incomplete).

        The tuple is returned as soon as an attempt yields status "200", a
        zero curl exit code and a file larger than 500 bytes; otherwise the
        status of the last attempt is returned.
    """
    fn = os.path.join(OUTDIR, re.sub(r'[^a-zA-Z0-9]+', '_', u)[-120:] + ".html")
    status = "ERR"
    for attempt in range(attempts):
        if attempt:
            # Pause only between attempts, never after the final one.
            time.sleep(delay)
        try:
            # curl -o does not create missing directories on its own.
            os.makedirs(OUTDIR, exist_ok=True)
            r = subprocess.run(
                ["curl", "-s", "--max-time", "40", "-H", "X-Return-Format: html", "-A", UA,
                 "-o", fn, "-w", "%{http_code}", "https://r.jina.ai/" + u],
                capture_output=True, text=True, timeout=60)
        except subprocess.TimeoutExpired:
            status = "TIMEOUT"
            continue
        except OSError:
            # curl is not installed/executable, or OUTDIR cannot be created.
            status = "ERR"
            continue

        status = r.stdout.strip()[-3:] or "ERR"
        if r.returncode != 0:
            # A non-zero exit after the headers arrived (e.g. --max-time hit
            # mid-transfer) still prints 200 but leaves a truncated body.
            if status == "200":
                status = "ERR"
            continue

        try:
            size = os.path.getsize(fn)
        except OSError:
            size = 0
        # Bodies of 500 bytes or less are treated as failed fetches.
        if status == "200" and size > 500:
            return (u, fn, status)
    return (u, fn, status)

def extract(fnhtml):
    """Pull title, first <h1>, meta description and og:title from a saved page.

    Args:
        fnhtml: Path of the HTML file; it is decoded as UTF-8 and undecodable
            bytes are dropped.

    Returns:
        A tuple ``(title, h1, meta, og)`` of strings. A field that is not
        found is "". ``("", "", "", "")`` is returned when the file cannot be
        opened or read.
    """
    try:
        with open(fnhtml, encoding="utf-8", errors="ignore") as f:
            page = f.read()
    except (OSError, ValueError):
        # Missing/unreadable file or an invalid path: best-effort empty result.
        return ("", "", "", "")

    def grab(p):
        # First case-insensitive, dot-matches-newline hit of the "val" group.
        m = re.search(p, page, re.I | re.S)
        return m.group("val") if m else ""

    def squash(text):
        return re.sub(r'\s+', ' ', text).strip()

    title = squash(html.unescape(grab(r'<title[^>]*>(?P<val>.*?)</title>')))

    # Strip nested tags BEFORE unescaping so that escaped text such as
    # "&lt;2000&gt;" is not mistaken for markup and deleted.
    h1 = re.sub(r'<[^>]+>', ' ', grab(r'<h1[^>]*>(?P<val>.*?)</h1>'))
    h1 = squash(html.unescape(h1))

    # The closing quote must be the same character as the opening one
    # ((?P=q)); otherwise an apostrophe inside content="..." ends the value.
    meta = grab(r'<meta[^>]+name=["\']description["\'][^>]*'
                r'content=(?P<q>["\'])(?P<val>.*?)(?P=q)')
    meta = html.unescape(meta).strip()
    if not meta:
        # Same tag with the attributes in the opposite order.
        meta = grab(r'<meta[^>]+content=(?P<q>["\'])(?P<val>.*?)(?P=q)'
                    r'[^>]*name=["\']description["\']')
        meta = html.unescape(meta).strip()

    og = grab(r'<meta[^>]+property=["\']og:title["\'][^>]*'
              r'content=(?P<q>["\'])(?P<val>.*?)(?P=q)')
    og = html.unescape(og).strip()
    return (title, h1, meta, og)

# Lower-case substrings that mark a title segment as branding.
BRAND = [
    "cinik",
    "dr cinik",
    "hair transplant clinic",
    "emrahcinik",
    "dr. cinik",
]


def derive(title, h1):
    """Derive a lower-case keyword phrase from a page title (or its h1).

    *title* is used when non-empty, otherwise *h1*. The text is split on
    "|", "-", "–" or "—" separators that have whitespace on both sides. The
    segments before the first one containing a BRAND substring are joined
    with single spaces; if the very first segment already contains a BRAND
    substring, that first segment alone is used. Text without a separator is
    used whole.

    Returns:
        The phrase with whitespace collapsed and lower-cased, or "" when
        both inputs are empty.
    """
    source = title or h1
    if not source:
        return ""

    segments = re.split(r'\s+[\|\-–—]\s+', source)
    if len(segments) > 1:
        # Position of the first branded segment (all segments if none is).
        cut = len(segments)
        for pos, segment in enumerate(segments):
            lowered = segment.lower()
            if any(marker in lowered for marker in BRAND):
                cut = pos
                break
        if cut:
            source = " ".join(segments[:cut]).strip()
        else:
            source = segments[0].strip()

    collapsed = re.sub(r'\s+', ' ', source)
    return collapsed.strip().lower()

# Driver: re-fetch every URL in `retry`, recompute its fields and rewrite the
# CSV in place with the refreshed rows.
CSV="/opt/automator/cinik-rponse/files/raw/cinik_fetched_2.csv"
with open(CSV, newline="", encoding="utf-8") as f:
    rows = list(csv.reader(f))
if not rows:
    # Without a header row there is nothing to update or write back.
    raise SystemExit("no header row found in " + CSV)
header = rows[0]
data = rows[1:]

# URL (first column) -> position in `data`. Blank lines parse as empty rows
# and carry no URL, so they are left out of the index.
idx = {r[0]: i for i, r in enumerate(data) if r}

for u in retry:
    time.sleep(5)
    _, fn, status = fetch(u)
    # og:title is extracted but is not one of the stored columns.
    title, h1, meta, og = extract(fn)
    dk = derive(title, h1)
    row = [u, status, title, h1, meta, dk]
    if u in idx:
        data[idx[u]] = row
    else:
        # URL was never recorded: append it instead of aborting with a
        # KeyError, which would discard the results of this whole run.
        idx[u] = len(data)
        data.append(row)
    print("RETRY|",u,"|STATUS|",status,"|TITLE|",title[:70],"|DK|",dk)

with open(CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    w.writerow(header)
    w.writerows(data)

# A row counts as OK when its status is "200" and it has a title or an h1;
# rows too short to hold a status are skipped rather than raising.
ok = sum(1 for r in data if len(r) > 1 and r[1] == "200" and any(r[2:4]))
print("FINAL OK",ok,"of",len(data))
