import subprocess, os, csv, re, html, concurrent.futures

# Input list (one URL per line) and the scratch directory for downloaded pages.
SRC = "/opt/automator/cinik-rponse/files/raw/cinik_to_fetch.txt"
OUTDIR = "/tmp/cinik_fetch_2"
os.makedirs(OUTDIR, exist_ok=True)

# Read the list, trimming whitespace and dropping blank lines.
with open(SRC) as f:
    lines = [text for text in (raw.strip() for raw in f) if text]

# A line is either a bare URL or a URL behind a numeric prefix
# ("N<whitespace>URL", as `cat -n` would produce); anything else is skipped.
urls = []
for entry in lines:
    numbered = re.match(r'^\d+\s+(https?://\S+)$', entry)
    if numbered:
        urls.append(numbered.group(1))
        continue
    if entry.startswith("http"):
        urls.append(entry)

# This run only handles every fifth URL, starting at index 2.
slice_urls = urls[2::5]
print("TOTAL_URLS", len(urls))
print("SLICE_SIZE", len(slice_urls))
for u in slice_urls:
    print("SLICE", u)

# Browser-like User-Agent handed to curl through -A.
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"

def fetch(u):
    """Download *u* with curl into OUTDIR without reporting the HTTP status.

    The output file name is the last 120 characters of the URL after every
    run of non-alphanumeric characters has been replaced by "_", plus ".html".

    Returns ``(u, fn)`` when curl ran, or ``(u, "ERR", fn, message)`` when
    launching or waiting for curl raised. Note that the two shapes differ in
    length. curl's captured output (including the ``-w`` status marker) is
    discarded here; ``fetch2`` is the variant that returns the status.
    """
    safe_tail = re.sub(r'[^a-zA-Z0-9]+','_', u)[-120:]
    fn = os.path.join(OUTDIR, safe_tail + ".html")
    cmd = ["curl","-sL","-A",UA,"--max-time","25","-w","\n__HTTP_STATUS__%{http_code}","-o",fn, u]
    try:
        subprocess.run(cmd, capture_output=True, text=True, timeout=40)
    except Exception as e:
        return (u, "ERR", fn, str(e))
    return (u, fn)

# fetch with status capture
def fetch2(u):
    """Download *u* with curl into OUTDIR and report the HTTP status.

    The output file name is the last 120 characters of the URL after every
    run of non-alphanumeric characters has been replaced by "_", plus ".html".

    Returns ``(u, fn, status)`` where *status* is:
      * the last three characters curl printed for ``%{http_code}``,
      * ``"TIMEOUT"`` if the curl process exceeded the 45 s subprocess limit,
      * ``"ERR"`` if curl printed nothing or could not be run at all.
    """
    fn = os.path.join(OUTDIR, re.sub(r'[^a-zA-Z0-9]+', '_', u)[-120:] + ".html")

    # A file left behind by an earlier run must not be mistaken for the result
    # of this fetch when the download fails and curl writes nothing.
    try:
        os.remove(fn)
    except OSError:
        pass

    cmd = ["curl", "-sL", "-A", UA, "--max-time", "25", "-w", "%{http_code}", "-o", fn, u]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
    except subprocess.TimeoutExpired:
        return (u, fn, "TIMEOUT")
    except Exception:
        # Best effort: one bad URL (or a missing curl binary) must not abort
        # the whole batch, but it is not a timeout either.
        return (u, fn, "ERR")

    printed = r.stdout.strip()
    status = printed[-3:] if printed else "ERR"
    return (u, fn, status)

# Fetch the slice on 8 worker threads and index each outcome by its URL.
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as ex:
    results = {u: (fn, status) for u, fn, status in ex.map(fetch2, slice_urls)}

def extract(fnhtml):
    """Pull basic SEO fields out of a saved HTML file using regexes.

    Args:
        fnhtml: path of the HTML file to read (decoded as UTF-8, undecodable
            bytes ignored).

    Returns:
        ``(title, h1, meta_description, og_title)``; each field is "" when it
        is not found. All four are "" when the file cannot be opened.
        Entities are unescaped in every field; whitespace is collapsed in
        *title* and *h1*, and inner tags are removed from *h1*.
    """
    try:
        with open(fnhtml, encoding="utf-8", errors="ignore") as f:
            page = f.read()
    except (OSError, ValueError):
        # Missing/unreadable file (or an invalid path): nothing to extract.
        return ("", "", "", "")

    def grab(pat, group=1):
        m = re.search(pat, page, re.I | re.S)
        return html.unescape(m.group(group)).strip() if m else ""

    def meta_content(selector):
        # The value must end at the same quote character that opened it, so an
        # apostrophe inside content="..." no longer truncates the text and the
        # match cannot run past the closing quote into a following tag.
        value = r'content=(["\'])((?:(?!\1).)*)\1'
        # Attribute order varies between sites: try selector-first, then
        # content-first.
        for pat in (
            r'<meta[^>]+' + selector + r'[^>]*' + value,
            r'<meta[^>]+' + value + r'[^>]*' + selector,
        ):
            found = grab(pat, 2)
            if found:
                return found
        return ""

    title = re.sub(r'\s+', ' ', grab(r'<title[^>]*>(.*?)</title>')).strip()

    h1 = grab(r'<h1[^>]*>(.*?)</h1>')
    h1 = re.sub(r'<[^>]+>', ' ', h1)
    h1 = re.sub(r'\s+', ' ', h1).strip()

    meta = meta_content(r'name=["\']description["\']')
    og = meta_content(r'property=["\']og:title["\']')
    return (title, h1, meta, og)

# Lower-case substrings that mark a title segment as site branding.
BRAND_MARKERS = ["cinik","dr cinik","hair transplant clinic","emrahcinik","dr. cinik"]
def derive(title, h1):
    """Derive a lower-case keyword phrase from a page title (or its h1).

    *title* is used when non-empty, otherwise *h1*; "" is returned when both
    are empty. The text is split on " | ", " - ", " – " or " — " separators.
    When there is more than one segment, the segments before the first one
    containing a brand marker are joined with spaces; if the very first
    segment is already branded, that first segment is kept as-is. Whitespace
    is collapsed and the result lower-cased.
    """
    text = title or h1
    if not text:
        return ""

    segments = re.split(r'\s+[\|\-–—]\s+', text)
    if len(segments) > 1:
        # Index of the first branded segment (or the segment count if none).
        cut = next(
            (
                idx
                for idx, segment in enumerate(segments)
                if any(marker in segment.lower() for marker in BRAND_MARKERS)
            ),
            len(segments),
        )
        leading = segments[:cut]
        text = (" ".join(leading) if leading else segments[0]).strip()

    return re.sub(r'\s+',' ', text).strip().lower()

# Build one CSV row per URL in the slice and tally successes: a fetch counts
# as OK only when the status is "200" and a title or h1 was found.
rows = []
ok = 0
fail = 0
for u in slice_urls:
    fn, status = results[u]
    title, h1, meta, og = extract(fn)
    dk = derive(title, h1)
    if status == "200" and (title or h1):
        ok += 1
    else:
        fail += 1
    rows.append([u, status, title, h1, meta, dk])

# Write the header followed by all rows.
OUTCSV = "/opt/automator/cinik-rponse/files/raw/cinik_fetched_2.csv"
with open(OUTCSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    w.writerow(["url","http_status","title","h1","meta_description","derived_keyword"])
    w.writerows(rows)

# Console summary: totals, then URL / status / derived keyword per row.
print("OK", ok, "FAIL", fail)
for row_url, row_status, _title, _h1, _meta, row_keyword in rows:
    print("ROW", row_url, "|STATUS|", row_status, "|DK|", row_keyword)
