{"version":"27e90b064d21","files":{"clipper.py":"\"\"\"AI clip finder for Clip Wizard.\n\nGroups the (editable) transcript into paragraphs using the natural pauses in\nspeech, then asks a local Ollama model which run of paragraphs makes the best\nshorts. Clips start at a paragraph beginning (full setup) and end on a clean\nboundary. Uses the edited transcript text, so corrections flow through. Local,\nno API key, no cost.\n\"\"\"\n\nimport json\nimport os\nimport threading\nimport urllib.request\n\nOLLAMA_URL = \"http://localhost:11434/api/chat\"\nMODEL = os.environ.get(\"CLIPWIZARD_LLM\", \"qwen2.5:14b\")\n\n# Progress the API reads while the AI is thinking.\njob: dict = {\n    \"state\": \"idle\",      # idle | running | done | error\n    \"message\": \"\",\n    \"error\": None,\n}\n\n# The clips the AI suggested (or None until done).\nclips: list | None = None\n\n# Optional callback the app sets, called when clip-finding finishes (to save).\non_complete = None\n\n_lock = threading.Lock()\n\n# Length limits. Max 3 minutes (YouTube Shorts' actual max) so AI clips rarely\n# get cut. Manual clips (manual_clip) have NO cap — the user's choice rules.\nMIN_CLIP_SECONDS = 12\nMAX_CLIP_SECONDS = 180\n\n# The most clips we'll ever make from one video (the user picks within this).\nMAX_CLIPS = 20\nDEFAULT_CLIPS = 5\n\n# A pause longer than this (seconds) between sentences marks a new paragraph.\nPARAGRAPH_PAUSE_SECONDS = 0.6\n# Also cap paragraph size so we still get usable chunks even when there are no\n# pauses to split on (e.g. a video that's already been tidied).\nMAX_PARA_SENTENCES = 6\n\n# A tiny lead-in so the very first word is never clipped at the start.\nLEAD_IN_SECONDS = 0.15\n\nSYSTEM_PROMPT = (\n    \"You are an expert short-form video editor. Below is ONE SECTION of a longer video, \"\n    \"split into numbered PARAGRAPHS (each is one thought, with its length in seconds).\\n\\n\"\n    \"Choose the SINGLE best self-contained moment in this section to turn into a vertical \"\n    \"short (YouTube Shorts, TikTok, Reels). Give a start_paragraph and end_paragraph using \"\n    \"the numbers shown — the same number for one paragraph, or a few consecutive numbers if \"\n    \"together they form one complete, compelling moment. Always include the setup so it \"\n    \"makes sense on its own; never start mid-point.\\n\\n\"\n    \"Prefer a strong hook, a useful tip, a story, a bold opinion, or an emotional beat.\\n\\n\"\n    \"Return ONLY JSON in exactly this shape (one clip, or none if nothing is worth it):\\n\"\n    '{ \"clips\": [ { \"start_paragraph\": <int>, \"end_paragraph\": <int>, '\n    '\"title\": \"<short catchy title>\", \"reason\": \"<one short sentence>\" } ] }\\n'\n    \"Do not invent text that is not in the transcript.\"\n)\n\n\ndef _sentence_units(transcript: dict) -> list:\n    \"\"\"The transcript lines (already whole sentences), using the EDITED text.\"\"\"\n    return [\n        {\"start\": s[\"start\"], \"end\": s[\"end\"], \"text\": s[\"text\"]}\n        for s in transcript[\"segments\"]\n    ]\n\n\ndef _make_paragraph(units: list, s: int, e: int) -> dict:\n    return {\n        \"sent_start\": s,\n        \"sent_end\": e,\n        \"start\": units[s][\"start\"],\n        \"end\": units[e][\"end\"],\n        \"text\": \" \".join(units[i][\"text\"] for i in range(s, e + 1)).strip(),\n    }\n\n\ndef _build_paragraphs(units: list) -> list:\n    \"\"\"Group sentence lines into paragraphs using the pauses in speech.\"\"\"\n    if not units:\n        return []\n    paragraphs = []\n    group_start = 0\n    for i in range(1, len(units)):\n        gap = units[i][\"start\"] - units[i - 1][\"end\"]\n        if gap > PARAGRAPH_PAUSE_SECONDS or (i - group_start) >= MAX_PARA_SENTENCES:\n            paragraphs.append(_make_paragraph(units, group_start, i - 1))\n            group_start = i\n    paragraphs.append(_make_paragraph(units, group_start, len(units) - 1))\n    return paragraphs\n\n\ndef _build_numbered_paragraphs(paragraphs: list) -> str:\n    lines = []\n    for i, p in enumerate(paragraphs):\n        length = p[\"end\"] - p[\"start\"]\n        lines.append(f\"[{i}] ({length:.0f}s) {p['text']}\")\n    return \"\\n\".join(lines)\n\n\ndef _clip_from_range(units: list, paragraphs: list, si: int, ei: int):\n    \"\"\"Build a clip spanning paragraphs si..ei, kept to a valid length.\n\n    Starts at the first paragraph's beginning (context). Trims whole sentences\n    off the end if too long; extends with following whole paragraphs if too short.\n    Returns a clip dict or None.\n    \"\"\"\n    if si > ei:\n        si, ei = ei, si\n    a = paragraphs[si][\"sent_start\"]\n    b = paragraphs[ei][\"sent_end\"]\n\n    # Too long: trim trailing sentences (still ends on a clean boundary).\n    while b > a and (units[b][\"end\"] - units[a][\"start\"]) > MAX_CLIP_SECONDS:\n        b -= 1\n\n    # Too short: pull in following whole paragraphs while they still fit.\n    nxt = ei + 1\n    while (units[b][\"end\"] - units[a][\"start\"]) < MIN_CLIP_SECONDS and nxt < len(paragraphs):\n        candidate_end = paragraphs[nxt][\"sent_end\"]\n        if (units[candidate_end][\"end\"] - units[a][\"start\"]) <= MAX_CLIP_SECONDS:\n            b = candidate_end\n            nxt += 1\n        else:\n            break\n\n    duration = units[b][\"end\"] - units[a][\"start\"]\n    if duration < MIN_CLIP_SECONDS or duration > MAX_CLIP_SECONDS:\n        return None\n\n    start = max(0.0, units[a][\"start\"] - LEAD_IN_SECONDS)\n    end = units[b][\"end\"]\n    text = \" \".join(units[i][\"text\"] for i in range(a, b + 1)).strip()\n    return {\n        \"start\": start,\n        \"end\": end,\n        \"duration\": round(end - start, 1),\n        \"title\": \"Untitled clip\",\n        \"reason\": \"\",\n        \"text\": text,\n        \"sent_start\": a,   # which sentence the clip starts on (for nudging)\n        \"sent_end\": b,     # which sentence the clip ends on\n        \"kept\": True,      # keep/drop status (Part 6)\n    }\n\n\ndef _overlaps(clip: dict, chosen: list) -> bool:\n    return any(clip[\"start\"] < c[\"end\"] and clip[\"end\"] > c[\"start\"] for c in chosen)\n\n\ndef _numbered(paragraphs: list, indices: list) -> str:\n    lines = []\n    for gi in indices:\n        p = paragraphs[gi]\n        length = p[\"end\"] - p[\"start\"]\n        lines.append(f\"[{gi}] ({length:.0f}s) {p['text']}\")\n    return \"\\n\".join(lines)\n\n\ndef _ask_window(units: list, paragraphs: list, indices: list, temperature: float = 0.3) -> list:\n    \"\"\"Ask the AI for the best clip among one section's paragraphs.\"\"\"\n    numbered = _numbered(paragraphs, indices)\n    approx_tokens = int(len(numbered.split()) * 1.6) + 1500\n    num_ctx = max(4096, min(16384, approx_tokens))\n\n    payload = {\n        \"model\": MODEL,\n        \"messages\": [\n            {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n            {\"role\": \"user\", \"content\": f\"Here is the section:\\n\\n{numbered}\"},\n        ],\n        \"format\": \"json\",\n        \"stream\": False,\n        \"options\": {\"temperature\": temperature, \"num_ctx\": num_ctx},\n    }\n    data = json.dumps(payload).encode(\"utf-8\")\n    req = urllib.request.Request(OLLAMA_URL, data=data, headers={\"Content-Type\": \"application/json\"})\n    with urllib.request.urlopen(req, timeout=300) as resp:\n        body = json.loads(resp.read().decode(\"utf-8\"))\n\n    content = body.get(\"message\", {}).get(\"content\", \"{}\")\n    try:\n        parsed = json.loads(content)\n    except Exception:\n        return []\n    raw_clips = parsed.get(\"clips\", [])\n\n    valid = set(indices)\n    out = []\n    for c in raw_clips:\n        try:\n            si = int(c[\"start_paragraph\"])\n            ei = int(c.get(\"end_paragraph\", c[\"start_paragraph\"]))\n        except (KeyError, ValueError, TypeError):\n            continue\n        if si not in valid or not (0 <= ei < len(paragraphs)):\n            continue\n        clip = _clip_from_range(units, paragraphs, si, ei)\n        if clip is None:\n            continue\n        clip[\"title\"] = str(c.get(\"title\", \"Untitled clip\")).strip()\n        clip[\"reason\"] = str(c.get(\"reason\", \"\")).strip()\n        out.append(clip)\n    return out\n\n\ndef _run(transcript: dict, count: int):\n    global clips\n    try:\n        job.update(state=\"running\", message=\"AI is finding the best moments...\", error=None)\n        existing = list(clips) if clips else []   # keep shorts the user already made\n        units = _sentence_units(transcript)\n        paragraphs = _build_paragraphs(units)\n        if not paragraphs:\n            clips = existing\n            job.update(state=\"done\", message=\"No clips found.\")\n            if on_complete:\n                on_complete()\n            return\n\n        # Split the video into `count` equal sections and take the best clip from\n        # each — so clips are spread across the whole length, however long.\n        n = max(1, min(MAX_CLIPS, int(count)))\n        start_t = units[0][\"start\"]\n        end_t = units[-1][\"end\"]\n        win = max(0.1, end_t - start_t) / n\n\n        new_clips: list = []\n        for k in range(n):\n            ws = start_t + k * win\n            we = (start_t + (k + 1) * win) if k < n - 1 else (end_t + 1.0)\n            idxs = [gi for gi, p in enumerate(paragraphs)\n                    if ws <= ((p[\"start\"] + p[\"end\"]) / 2.0) < we]\n            if not idxs:\n                continue\n            for c in _ask_window(units, paragraphs, idxs):\n                # don't overlap shorts the user already has OR ones found this run\n                if not _overlaps(c, existing + new_clips):\n                    new_clips.append(c)\n                    break  # one new clip per section\n            job[\"message\"] = f\"Finding the best moments... ({len(new_clips)} new so far)\"\n\n        new_clips.sort(key=lambda c: c[\"start\"])\n        # Keep existing clips at their positions (so already-rendered shorts +\n        # thumbnails stay aligned by index); add the new picks after them. Do the\n        # write under the lock and preserve anything added (e.g. a manual clip)\n        # while the AI was thinking, so a concurrent manual-clip isn't lost.\n        with _lock:\n            live = list(clips) if clips else []\n            added_during = live[len(existing):] if len(live) >= len(existing) else []\n            clips = existing + added_during + new_clips\n        job.update(state=\"done\", message=f\"Found {len(new_clips)} new clips.\")\n        if on_complete:\n            on_complete()\n    except Exception as e:  # noqa: BLE001\n        job.update(state=\"error\", error=str(e), message=\"The AI step failed.\")\n\n\ndef _rebuild_clip(clip: dict, units: list):\n    \"\"\"Recompute a clip's times and text from its sentence range.\"\"\"\n    a = clip[\"sent_start\"]\n    b = clip[\"sent_end\"]\n    clip[\"start\"] = max(0.0, units[a][\"start\"] - LEAD_IN_SECONDS)\n    clip[\"end\"] = units[b][\"end\"]\n    clip[\"duration\"] = round(clip[\"end\"] - clip[\"start\"], 1)\n    clip[\"text\"] = \" \".join(units[i][\"text\"] for i in range(a, b + 1)).strip()\n\n\ndef adjust_clip(index: int, action: str, transcript: dict):\n    \"\"\"Part 6: keep/drop a clip, or nudge its start/end by one sentence.\n\n    Actions: \"keep\", \"drop\", \"start_earlier\", \"start_later\",\n    \"end_earlier\", \"end_later\". Returns the updated clip, or None on bad input.\n    \"\"\"\n    if clips is None or index < 0 or index >= len(clips):\n        return None\n    clip = clips[index]\n\n    if action == \"drop\":\n        clip[\"kept\"] = False\n        return clip\n    if action == \"keep\":\n        clip[\"kept\"] = True\n        return clip\n\n    units = _sentence_units(transcript)\n    last = len(units) - 1\n    a = clip[\"sent_start\"]\n    b = clip[\"sent_end\"]\n\n    if action == \"start_earlier\" and a > 0:\n        a -= 1\n    elif action == \"start_later\" and a < b:\n        a += 1\n    elif action == \"end_later\" and b < last:\n        b += 1\n    elif action == \"end_earlier\" and b > a:\n        b -= 1\n    else:\n        return clip  # at a boundary — nothing to do\n\n    # Don't let a nudge push it over the max length.\n    if (units[b][\"end\"] - units[a][\"start\"]) > MAX_CLIP_SECONDS:\n        return clip\n\n    clip[\"sent_start\"] = a\n    clip[\"sent_end\"] = b\n    _rebuild_clip(clip, units)\n    return clip\n\n\ndef manual_clip(transcript: dict, start: float, end: float, title: str | None = None):\n    \"\"\"Build a clip from a hand-picked time range and return it.\"\"\"\n    units = _sentence_units(transcript)\n    if not units or end <= start:\n        return None\n    # Which sentences overlap the chosen range (for captions / editing).\n    sent_start = None\n    sent_end = None\n    for i, u in enumerate(units):\n        if u[\"end\"] > start and sent_start is None:\n            sent_start = i\n        if u[\"start\"] < end:\n            sent_end = i\n    if sent_start is None:\n        sent_start = 0\n    if sent_end is None or sent_end < sent_start:\n        sent_end = sent_start\n    text = \" \".join(units[i][\"text\"] for i in range(sent_start, sent_end + 1)).strip()\n    return {\n        \"start\": max(0.0, float(start)),\n        \"end\": float(end),\n        \"duration\": round(float(end) - float(start), 1),\n        \"title\": (title or \"My clip\").strip(),\n        \"reason\": \"Hand-picked section\",\n        \"text\": text,\n        \"sent_start\": sent_start,\n        \"sent_end\": sent_end,\n        \"kept\": True,\n    }\n\n\ndef start(transcript: dict, count: int = DEFAULT_CLIPS):\n    \"\"\"Kick off clip-finding in the background. `count` = how many shorts to make.\"\"\"\n    global clips\n    with _lock:\n        if job[\"state\"] == \"running\":\n            return False\n        # Do NOT clear existing clips — find-clips ADDS new picks and keeps the\n        # shorts the user has already made.\n        job.update(state=\"running\", message=\"Starting...\", error=None)\n    threading.Thread(target=_run, args=(transcript, count), daemon=True).start()\n    return True\n","main.py":"\"\"\"Clip Wizard backend — the 'brain'.\n\nPart 2: remember which video the user picked.\nReal video work (transcribe, find clips, render) comes later.\n\"\"\"\n\nimport os\nimport subprocess\nimport sys\n\nfrom fastapi import FastAPI, HTTPException\nfrom fastapi.middleware.cors import CORSMiddleware\nfrom pydantic import BaseModel\n\nimport transcriber\nimport clipper\nimport renderer\nimport tidy\nimport store\nimport youtube\nimport thumbnail\n\nfrom fastapi.responses import FileResponse\n\napp = FastAPI(title=\"Clip Wizard Backend\")\n\n# Allow the desktop app's window to talk to this backend.\napp.add_middleware(\n    CORSMiddleware,\n    allow_origins=[\"*\"],\n    allow_methods=[\"*\"],\n    allow_headers=[\"*\"],\n)\n\n# Video file extensions we accept.\nVIDEO_EXTENSIONS = {\".mp4\", \".mov\", \".mkv\", \".avi\", \".webm\", \".m4v\"}\n\n# The single video the user is currently working on (Part 2 keeps it simple).\ncurrent_video: dict | None = None\n\n\ndef _persist():\n    \"\"\"Save the current work so it survives closing/reopening the app.\"\"\"\n    try:\n        store.save_state({\n            \"current_video\": current_video,\n            \"transcript\": transcriber.transcript,\n            \"clips\": clipper.clips,\n        })\n    except Exception:\n        pass\n\n\n# Save automatically when a transcription or clip-find finishes.\ntranscriber.on_complete = _persist\nclipper.on_complete = _persist\n\n\ndef _set_downloaded_video(r: dict):\n    \"\"\"When a YouTube download finishes, switch the app to that video.\"\"\"\n    global current_video\n    current_video = {\"path\": r[\"path\"], \"name\": r[\"name\"], \"size\": human_size(r[\"size\"])}\n    transcriber.transcript = None\n    clipper.clips = None\n    _persist()\n\n\nyoutube.on_complete = _set_downloaded_video\n\n# Restore last session's work on startup (only if the video file still exists).\n_saved = store.load_state()\nif _saved:\n    _cv = _saved.get(\"current_video\")\n    if _cv and _cv.get(\"path\") and os.path.isfile(_cv[\"path\"]):\n        current_video = _cv\n        transcriber.transcript = _saved.get(\"transcript\")\n        clipper.clips = _saved.get(\"clips\")\n\n\nclass ImportRequest(BaseModel):\n    path: str\n\n\ndef human_size(num_bytes: int) -> str:\n    \"\"\"Turn bytes into something readable like '1.2 GB'.\"\"\"\n    size = float(num_bytes)\n    for unit in (\"bytes\", \"KB\", \"MB\", \"GB\", \"TB\"):\n        if size < 1024 or unit == \"TB\":\n            return f\"{size:.0f} {unit}\" if unit == \"bytes\" else f\"{size:.1f} {unit}\"\n        size /= 1024\n    return f\"{size:.1f} TB\"\n\n\n@app.get(\"/health\")\ndef health():\n    \"\"\"A simple 'I'm alive' check.\"\"\"\n    return {\"status\": \"ok\", \"message\": \"Clip Wizard brain is running\"}\n\n\n@app.post(\"/import\")\ndef import_video(req: ImportRequest):\n    \"\"\"Record the video the user picked and return basic info about it.\"\"\"\n    global current_video\n\n    path = req.path\n    if not os.path.isfile(path):\n        raise HTTPException(status_code=400, detail=\"That file could not be found.\")\n\n    ext = os.path.splitext(path)[1].lower()\n    if ext not in VIDEO_EXTENSIONS:\n        raise HTTPException(status_code=400, detail=\"That doesn't look like a video file.\")\n\n    current_video = {\n        \"path\": path,\n        \"name\": os.path.basename(path),\n        \"size\": human_size(os.path.getsize(path)),\n    }\n    # A new video clears any previous transcript/clips.\n    transcriber.transcript = None\n    clipper.clips = None\n    _persist()\n    return current_video\n\n\n@app.get(\"/current-video\")\ndef get_current_video():\n    \"\"\"Return the currently loaded video, or null if none yet.\"\"\"\n    return current_video\n\n\nclass YoutubeRequest(BaseModel):\n    url: str\n\n\n@app.post(\"/import-youtube\")\ndef import_youtube(req: YoutubeRequest):\n    \"\"\"Download a video from a YouTube link, then load it as the current video.\"\"\"\n    url = req.url.strip()\n    if not url:\n        raise HTTPException(status_code=400, detail=\"Paste a YouTube link first.\")\n    if not youtube.start(url):\n        raise HTTPException(status_code=409, detail=\"Already downloading a video.\")\n    return {\"started\": True}\n\n\n@app.get(\"/import-youtube/status\")\ndef import_youtube_status():\n    \"\"\"Progress of the current YouTube download.\"\"\"\n    return youtube.job\n\n\n@app.post(\"/transcribe\")\ndef start_transcribe():\n    \"\"\"Start turning the loaded video's speech into text (runs in background).\"\"\"\n    if current_video is None:\n        raise HTTPException(status_code=400, detail=\"Load a video first.\")\n    clipper.clips = None  # a fresh transcript invalidates old clips\n    started = transcriber.start(current_video[\"path\"])\n    if not started:\n        raise HTTPException(status_code=409, detail=\"Already transcribing.\")\n    return {\"started\": True}\n\n\n@app.get(\"/transcribe/status\")\ndef transcribe_status():\n    \"\"\"Progress of the current transcription (for the progress bar).\"\"\"\n    return transcriber.job\n\n\n@app.get(\"/transcript\")\ndef get_transcript():\n    \"\"\"The finished transcript, or null if not ready yet.\"\"\"\n    return transcriber.transcript\n\n\nclass TranscriptUpdate(BaseModel):\n    segments: list[str]  # the edited text for each line, in order\n\n\n@app.put(\"/transcript\")\ndef update_transcript(upd: TranscriptUpdate):\n    \"\"\"Save the user's edits to the transcript (fixing wrong words).\"\"\"\n    if transcriber.transcript is None:\n        raise HTTPException(status_code=400, detail=\"No transcript to edit yet.\")\n    segs = transcriber.transcript[\"segments\"]\n    if len(upd.segments) != len(segs):\n        raise HTTPException(status_code=400, detail=\"Edited transcript doesn't match.\")\n    for seg, new_text in zip(segs, upd.segments):\n        seg[\"text\"] = new_text.strip()\n    transcriber.transcript[\"text\"] = \" \".join(s[\"text\"] for s in segs).strip()\n    transcriber.transcript[\"edited\"] = True\n    _persist()\n    return transcriber.transcript\n\n\n@app.post(\"/find-clips\")\ndef start_find_clips(count: int = 5):\n    \"\"\"Ask the local AI to pick the best moments. `count` = how many shorts (1-20).\"\"\"\n    if transcriber.transcript is None:\n        raise HTTPException(status_code=400, detail=\"Transcribe the video first.\")\n    count = max(1, min(20, count))\n    started = clipper.start(transcriber.transcript, count)\n    if not started:\n        raise HTTPException(status_code=409, detail=\"Already finding clips.\")\n    return {\"started\": True}\n\n\n@app.get(\"/find-clips/status\")\ndef find_clips_status():\n    \"\"\"Progress of the AI clip-finding step.\"\"\"\n    return clipper.job\n\n\n@app.get(\"/clips\")\ndef get_clips():\n    \"\"\"The clips the AI suggested, or null if not ready.\"\"\"\n    return clipper.clips\n\n\nclass ClipAction(BaseModel):\n    action: str  # keep | drop | start_earlier | start_later | end_earlier | end_later\n\n\n@app.post(\"/clips/{index}/adjust\")\ndef adjust_clip(index: int, body: ClipAction):\n    \"\"\"Part 6: keep/drop a clip or nudge its start/end by one sentence.\"\"\"\n    if transcriber.transcript is None:\n        raise HTTPException(status_code=400, detail=\"No transcript loaded.\")\n    result = clipper.adjust_clip(index, body.action, transcriber.transcript)\n    if result is None:\n        raise HTTPException(status_code=400, detail=\"Could not adjust that clip.\")\n    _persist()\n    return clipper.clips\n\n\n@app.get(\"/clips/{index}/tighten-preview\")\ndef tighten_preview(index: int, strength: str = \"medium\"):\n    \"\"\"Part 6B: preview what tightening would remove (pauses + fillers).\"\"\"\n    if transcriber.transcript is None or clipper.clips is None:\n        raise HTTPException(status_code=400, detail=\"No clips yet.\")\n    if index < 0 or index >= len(clipper.clips):\n        raise HTTPException(status_code=400, detail=\"No such clip.\")\n    import tightener\n    src = current_video[\"path\"] if current_video else None\n    strength = strength if strength in (\"gentle\", \"medium\", \"strong\") else \"medium\"\n    info = tightener.compute(transcriber.transcript, clipper.clips[index], True, src, strength)\n    return {\n        \"filler_count\": info[\"filler_count\"],\n        \"pause_count\": info[\"pause_count\"],\n        \"removed_seconds\": info[\"removed_seconds\"],\n        \"original_duration\": info[\"original_duration\"],\n        \"new_duration\": info[\"new_duration\"],\n    }\n\n\n@app.post(\"/render/{index}\")\ndef render_clip(index: int, tighten: bool = False, layout: str = \"crop\", strength: str = \"medium\", cta: str = \"\"):\n    \"\"\"Part 7: render one clip into a vertical MP4 (optionally tightened).\n\n    layout: \"crop\" (talking-head, follows the face) or \"fit\" (whole frame on a\n    blurred fill — for screen recordings). strength: gentle|medium|strong (tighten).\n    cta: optional end-card text appended to the end of the short (empty = none).\n    \"\"\"\n    if current_video is None:\n        raise HTTPException(status_code=400, detail=\"Load a video first.\")\n    if clipper.clips is None or index < 0 or index >= len(clipper.clips):\n        raise HTTPException(status_code=400, detail=\"No such clip.\")\n    layout = layout if layout in (\"crop\", \"fit\") else \"crop\"\n    strength = strength if strength in (\"gentle\", \"medium\", \"strong\") else \"medium\"\n    cta = (cta or \"\").strip()[:200]  # cap length; empty = no end card\n    started = renderer.start(\n        current_video[\"path\"], clipper.clips[index], index, transcriber.transcript, tighten, layout, strength, cta\n    )\n    if not started:\n        raise HTTPException(status_code=409, detail=\"Already rendering — one at a time.\")\n    return {\"started\": True}\n\n\n@app.get(\"/render/status\")\ndef render_status():\n    \"\"\"Progress of the current render.\"\"\"\n    return renderer.job\n\n\n@app.post(\"/tidy\")\ndef start_tidy(strength: str = \"medium\"):\n    \"\"\"Tidy the WHOLE loaded video (remove pauses + fillers across its length).\"\"\"\n    if current_video is None:\n        raise HTTPException(status_code=400, detail=\"Load a video first.\")\n    if transcriber.transcript is None:\n        raise HTTPException(status_code=400, detail=\"Transcribe the video first.\")\n    strength = strength if strength in (\"off\", \"gentle\", \"medium\", \"strong\") else \"medium\"\n    started = tidy.start(current_video[\"path\"], transcriber.transcript, strength)\n    if not started:\n        raise HTTPException(status_code=409, detail=\"Already tidying.\")\n    return {\"started\": True}\n\n\n@app.get(\"/tidy/status\")\ndef tidy_status():\n    \"\"\"Progress of the whole-video tidy.\"\"\"\n    return tidy.job\n\n\n@app.post(\"/tidy/apply\")\ndef tidy_apply():\n    \"\"\"Switch over to the cleaned video + its re-timed transcript, ready to make shorts.\"\"\"\n    global current_video\n    if tidy.result is None:\n        raise HTTPException(status_code=400, detail=\"No tidied video to use.\")\n    r = tidy.result\n    current_video = {\"path\": r[\"path\"], \"name\": r[\"name\"], \"size\": r[\"size\"]}\n    transcriber.transcript = r[\"transcript\"]\n    clipper.clips = None  # shorts must be re-found from the cleaned video\n    _persist()\n    return {\"video\": current_video, \"new_duration\": r[\"new_duration\"]}\n\n\n@app.get(\"/source\")\ndef get_source():\n    \"\"\"Serve the currently loaded video so the app can play/scrub it.\"\"\"\n    if current_video is None or not os.path.isfile(current_video[\"path\"]):\n        raise HTTPException(status_code=404, detail=\"No video loaded.\")\n    return FileResponse(current_video[\"path\"], media_type=\"video/mp4\")\n\n\nclass ManualClip(BaseModel):\n    start: float\n    end: float\n    title: str | None = None\n\n\n@app.post(\"/manual-clip\")\ndef add_manual_clip(body: ManualClip):\n    \"\"\"Add a hand-picked section as a clip in the list.\"\"\"\n    if transcriber.transcript is None:\n        raise HTTPException(status_code=400, detail=\"Transcribe the video first.\")\n    clip = clipper.manual_clip(transcriber.transcript, body.start, body.end, body.title)\n    if clip is None:\n        raise HTTPException(status_code=400, detail=\"Invalid start/end.\")\n    # Append only (no re-sort): keeps existing clip positions stable so the app's\n    # per-clip thumbnails/renders (keyed by position) never end up on the wrong\n    # clip. Under the clip lock so it's safe even during an AI find.\n    with clipper._lock:\n        if clipper.clips is None:\n            clipper.clips = []\n        clipper.clips.append(clip)\n    _persist()\n    return clipper.clips\n\n\n@app.get(\"/rendered-map\")\ndef rendered_map():\n    \"\"\"Which clips already have a rendered short on disk — so the app can show\n    the player + Share row again after a restart, without re-rendering.\"\"\"\n    out: dict = {}\n    if clipper.clips:\n        try:\n            files = os.listdir(renderer.OUTPUT_DIR)\n        except OSError:\n            files = []\n        for i in range(len(clipper.clips)):\n            # Match by the clip's position prefix (NN_...mp4), NOT the title — so a\n            # short still re-shows after the user renames the clip post-render.\n            prefix = f\"{i + 1:02d}_\"\n            matches = [f for f in files if f.startswith(prefix) and f.endswith(\".mp4\")]\n            if matches:\n                # newest wins (handles a re-render after a title change)\n                matches.sort(\n                    key=lambda f: os.path.getmtime(os.path.join(renderer.OUTPUT_DIR, f)),\n                    reverse=True,\n                )\n                out[str(i)] = matches[0]\n    return out\n\n\n@app.get(\"/rendered/{filename}\")\ndef get_rendered(filename: str):\n    \"\"\"Serve a finished MP4 so the app can play it.\"\"\"\n    safe = os.path.basename(filename)\n    path = os.path.join(renderer.OUTPUT_DIR, safe)\n    if not os.path.isfile(path):\n        raise HTTPException(status_code=404, detail=\"Not found.\")\n    return FileResponse(path, media_type=\"video/mp4\")\n\n\nclass ThumbRequest(BaseModel):\n    title: str | None = None\n    time: float | None = None  # seconds into the clip; None = auto-pick a frame\n    layout: str = \"crop\"       # crop (talking-head) or fit (screen recording)\n    position: str = \"bottom\"   # title position: bottom (default) or top\n\n\n@app.post(\"/thumbnail/{index}\")\ndef make_thumbnail(index: int, body: ThumbRequest):\n    \"\"\"Generate a thumbnail image (frame + title) for a clip.\"\"\"\n    if current_video is None:\n        raise HTTPException(status_code=400, detail=\"Load a video first.\")\n    if clipper.clips is None or index < 0 or index >= len(clipper.clips):\n        raise HTTPException(status_code=400, detail=\"No such clip.\")\n    layout = body.layout if body.layout in (\"crop\", \"fit\") else \"crop\"\n    position = body.position if body.position in (\"bottom\", \"top\") else \"bottom\"\n    try:\n        name = thumbnail.make_thumbnail(current_video[\"path\"], clipper.clips[index], index, body.title, body.time, layout, position)\n    except Exception as e:  # noqa: BLE001\n        raise HTTPException(status_code=500, detail=f\"Could not make the thumbnail: {e}\")\n    return {\"filename\": name}\n\n\n@app.get(\"/thumbnail/{filename}\")\ndef get_thumbnail(filename: str):\n    \"\"\"Serve a generated thumbnail image.\"\"\"\n    safe = os.path.basename(filename)\n    path = os.path.join(renderer.OUTPUT_DIR, safe)\n    if not os.path.isfile(path):\n        raise HTTPException(status_code=404, detail=\"Not found.\")\n    return FileResponse(path, media_type=\"image/jpeg\")\n\n\nPLATFORM_URLS = {\n    \"youtube\": \"https://www.youtube.com/upload\",\n    \"tiktok\": \"https://www.tiktok.com/upload\",\n    \"instagram\": \"https://www.instagram.com/\",\n    \"facebook\": \"https://www.facebook.com/reels/create\",\n    \"linkedin\": \"https://www.linkedin.com/feed/?shareActive=true\",\n}\n\n\ndef _reveal_in_folder(path: str):\n    \"\"\"Open the folder containing `path`, highlighting the file. Cross-platform.\"\"\"\n    try:\n        if sys.platform == \"win32\":\n            subprocess.Popen([\"explorer\", \"/select,\", os.path.normpath(path)])\n        elif sys.platform == \"darwin\":\n            subprocess.Popen([\"open\", \"-R\", path])\n        else:\n            subprocess.Popen([\"xdg-open\", os.path.dirname(path)])\n    except Exception:\n        pass\n\n\ndef _open_path(target: str):\n    \"\"\"Open a folder or URL with the OS default handler. Cross-platform.\"\"\"\n    try:\n        if sys.platform == \"win32\":\n            os.startfile(target)  # type: ignore[attr-defined]\n        elif sys.platform == \"darwin\":\n            subprocess.Popen([\"open\", target])\n        else:\n            subprocess.Popen([\"xdg-open\", target])\n    except Exception:\n        pass\n\n\n@app.post(\"/share\")\ndef share(platform: str, filename: str | None = None):\n    \"\"\"Open a social platform's upload page AND reveal the short, so the creator\n    can drag the video straight in. (No API/login wiring needed.)\"\"\"\n    url = PLATFORM_URLS.get(platform)\n    if not url:\n        raise HTTPException(status_code=400, detail=\"Unknown platform.\")\n    safe = os.path.basename((filename or \"\").split(\"?\")[0]) if filename else \"\"\n    path = os.path.join(renderer.OUTPUT_DIR, safe) if safe else \"\"\n    if path and os.path.isfile(path):\n        _reveal_in_folder(path)\n    import webbrowser\n    try:\n        webbrowser.open(url)\n    except Exception:\n        _open_path(url)\n    return {\"ok\": True, \"url\": url}\n\n\n@app.post(\"/reveal\")\ndef reveal(filename: str | None = None):\n    \"\"\"Open the output folder, selecting the given short if named. Cross-platform.\"\"\"\n    folder = renderer.OUTPUT_DIR\n    safe = os.path.basename((filename or \"\").split(\"?\")[0]) if filename else \"\"\n    path = os.path.join(folder, safe) if safe else \"\"\n    if path and os.path.isfile(path):\n        _reveal_in_folder(path)\n    else:\n        _open_path(folder)\n    return {\"ok\": True}\n","renderer.py":"\"\"\"Render a chosen clip into a vertical 9:16 MP4 using FFmpeg.\n\nPart 7a: cut the clip from the source video and centre-crop to 9:16. Captions\nand smart face-tracking crop come in later steps.\n\"\"\"\n\nimport glob\nimport os\nimport subprocess\nimport sys\nimport threading\n\nimport tracker\nimport tightener\n\n_BACKEND_DIR = os.path.dirname(__file__)\n_PROJECT_DIR = os.path.dirname(_BACKEND_DIR)\nTOOLS_DIR = os.path.join(_PROJECT_DIR, \"tools\")\nOUTPUT_DIR = os.path.join(_PROJECT_DIR, \"output\")\nos.makedirs(OUTPUT_DIR, exist_ok=True)\n\n\ndef _find_exe(name: str) -> str:\n    \"\"\"Locate a bundled tool under tools/, else trust PATH. `name` is the base\n    name (e.g. \"ffmpeg\") — matches ffmpeg.exe (Windows) or ffmpeg (Mac/Linux).\"\"\"\n    for cand in (name + \".exe\", name):\n        matches = glob.glob(os.path.join(TOOLS_DIR, \"**\", cand), recursive=True)\n        if matches:\n            return matches[0]\n    return name\n\n\nFFMPEG = _find_exe(\"ffmpeg\")\nFFPROBE = _find_exe(\"ffprobe\")\n\n\ndef _video_fps(source: str) -> float:\n    \"\"\"Frames per second of the source, so cuts can be snapped to whole frames\n    and audio/video stay perfectly in sync. Defaults to 30 if unknown.\"\"\"\n    try:\n        out = subprocess.run(\n            [FFPROBE, \"-v\", \"error\", \"-select_streams\", \"v:0\",\n             \"-show_entries\", \"stream=r_frame_rate\", \"-of\", \"csv=p=0\", source],\n            capture_output=True, text=True,\n        )\n        s = (out.stdout or \"\").strip()\n        if \"/\" in s:\n            num, den = s.split(\"/\")\n            return float(num) / float(den) if float(den) else 30.0\n        return float(s) if s else 30.0\n    except Exception:\n        return 30.0\n\n\n# Progress the API reads while rendering.\njob: dict = {\n    \"state\": \"idle\",     # idle | rendering | done | error\n    \"message\": \"\",\n    \"error\": None,\n    \"output\": None,      # output filename when done\n    \"index\": None,       # which clip is/was rendering\n}\n\n_lock = threading.Lock()\n\n\ndef _safe_name(title: str) -> str:\n    keep = \"\".join(c if (c.isalnum() or c in \" -_\") else \"\" for c in title).strip()\n    return (keep or \"clip\").replace(\" \", \"_\")[:50]\n\n\ndef _ass_time(t: float) -> str:\n    if t < 0:\n        t = 0\n    cs = int(round(t * 100))\n    h = cs // 360000\n    m = (cs % 360000) // 6000\n    s = (cs % 6000) // 100\n    c = cs % 100\n    return f\"{h}:{m:02d}:{s:02d}.{c:02d}\"\n\n\n# Caption font per OS — libass looks fonts up by NAME. Trebuchet MS on Windows\n# (the brand); a guaranteed-present face on Mac/Linux so captions never fall back\n# to an odd substitute. (Bold + the black stroke keep it strong on every OS.)\nif sys.platform == \"win32\":\n    _CAPTION_FONT = \"Trebuchet MS\"\nelif sys.platform == \"darwin\":\n    _CAPTION_FONT = \"Helvetica Neue\"\nelse:\n    _CAPTION_FONT = \"DejaVu Sans\"\n\n# Caption style. PrimaryColour &H00FF0014 = brand electric blue #1400ff (ASS is\n# AABBGGRR). Black outline (6px) + soft shadow so it reads over any footage.\n# Bold, italic. Alignment 2 = bottom-centre, raised with MarginV.\n_ASS_HEADER = f\"\"\"[Script Info]\nScriptType: v4.00+\nPlayResX: 1080\nPlayResY: 1920\nWrapStyle: 0\nScaledBorderAndShadow: yes\n\n[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\nStyle: Default,{_CAPTION_FONT},96,&H00FFFFFF,&H000000FF,&H00000000,&H64000000,1,1,0,0,100,100,0,0,1,6,3,2,90,90,130,1\n\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, Effect, Text\n\"\"\"\n\n\n# Caption colours (ASS AABBGGRR): blue = highlight, white = the rest.\n_HL = \"&H00FF0014&\"   # brand electric blue #1400ff\n_FG = \"&H00FFFFFF&\"   # white\n_CHUNK_WORDS = 4      # how many words on screen at once\n# Nudge captions slightly LATER so they don't run ahead of the voice (speech\n# recognisers tend to mark words a touch early). Increase if still early.\n_CAPTION_DELAY = 0.15\n\n# A small tail so the last word isn't clipped (its end time is marked a bit early).\n_TAIL_PAD = 0.45\n\n\ndef _word_timings(seg: dict) -> list:\n    \"\"\"Per-word timings for a sentence, using the EDITED text.\n\n    If the edited word count matches the original word timings, pair them 1:1.\n    Otherwise (the user edited words) spread the sentence's time evenly so the\n    highlight still tracks the speech roughly.\n    \"\"\"\n    edited = seg[\"text\"].split()\n    if not edited:\n        return []\n    words = seg.get(\"words\") or []\n    if words and len(words) == len(edited):\n        return [\n            {\"text\": edited[i], \"start\": float(words[i][\"start\"]), \"end\": float(words[i][\"end\"])}\n            for i in range(len(edited))\n        ]\n    start = float(seg[\"start\"])\n    end = float(seg[\"end\"])\n    span = max(0.01, (end - start)) / len(edited)\n    return [\n        {\"text\": edited[i], \"start\": start + i * span, \"end\": start + (i + 1) * span}\n        for i in range(len(edited))\n    ]\n\n\ndef _esc(t: str) -> str:\n    return t.replace(\"\\\\\", \"\").replace(\"{\", \"(\").replace(\"}\", \")\")\n\n\ndef _line(group: list, active: int) -> str:\n    \"\"\"Words revealed up to the one being spoken (active word in blue), so the\n    captions never show words ahead of the voice.\"\"\"\n    parts = []\n    for k in range(active + 1):\n        col = _HL if k == active else _FG\n        parts.append(\"{\\\\c\" + col + \"}\" + _esc(group[k][\"text\"]))\n    return \" \".join(parts)\n\n\ndef _ass_dialogues(words: list) -> list:\n    \"\"\"Word-by-word highlighted Dialogue lines from a flat word list\n    (times already relative to the clip start).\"\"\"\n    out = []\n    for ci in range(0, len(words), _CHUNK_WORDS):\n        group = words[ci:ci + _CHUNK_WORDS]\n        for k, w in enumerate(group):\n            t0 = w[\"start\"] + _CAPTION_DELAY\n            t1 = (group[k + 1][\"start\"] if k < len(group) - 1 else w[\"end\"]) + _CAPTION_DELAY\n            if t1 <= t0:\n                t1 = t0 + 0.1\n            out.append(f\"Dialogue: 0,{_ass_time(t0)},{_ass_time(t1)},Default,,,,,{_line(group, k)}\")\n    return out\n\n\ndef _ass_from_words(words: list) -> str:\n    \"\"\"Full ASS from a flat word list (times relative to 0) — for tightened clips.\"\"\"\n    return \"\\n\".join([_ASS_HEADER] + _ass_dialogues(words))\n\n\ndef _build_ass(transcript: dict, clip: dict) -> str:\n    \"\"\"Full ASS per sentence, timed to the clip (no tightening).\"\"\"\n    segs = transcript[\"segments\"]\n    base = float(clip[\"start\"])\n    out = [_ASS_HEADER]\n    for si in range(clip[\"sent_start\"], clip[\"sent_end\"] + 1):\n        words = [{\"text\": w[\"text\"], \"start\": w[\"start\"] - base, \"end\": w[\"end\"] - base}\n                 for w in _word_timings(segs[si])]\n        out.extend(_ass_dialogues(words))\n    return \"\\n\".join(out)\n\n\ndef _fixed_crop(source: str, start: float, duration: float) -> str:\n    \"\"\"A single steady crop centred on the median face position (time-independent).\"\"\"\n    try:\n        sw, sh, pts = tracker.face_centers(source, start, duration)\n    except Exception:\n        sw = sh = pts = None\n    if not (sw and sh and pts):\n        return \"crop=ih*9/16:ih\"\n    ow = sh * 9.0 / 16.0\n    xs = tracker.ema(tracker.smooth([p[1] for p in pts], 9), 0.12)\n    cx = [min(max(x - ow / 2.0, 0.0), max(0.0, sw - ow)) for x in xs]\n    med = sorted(cx)[len(cx) // 2]\n    return f\"crop=w=ih*9/16:h=ih:x={int(round(med))}:y=0\"\n\n\ndef _build_crop(source: str, start: float, duration: float, index: int):\n    \"\"\"Work out the 9:16 crop. Follows the speaker's face if we can find it,\n    otherwise a plain centre-crop. Returns (crop_filter_string, cmds_filename).\"\"\"\n    try:\n        src_w, src_h, pts = tracker.face_centers(source, start, duration)\n    except Exception:\n        src_w = src_h = pts = None\n\n    if not (src_w and src_h and pts):\n        return \"crop=ih*9/16:ih\", None\n\n    ow = src_h * 9.0 / 16.0\n    # Strong low-pass smoothing so the crop drifts gently, not jittery.\n    raw = tracker.smooth([p[1] for p in pts], window=9)\n    xs = tracker.ema(raw, alpha=0.12)\n    ts = [p[0] for p in pts]\n    crop_xs = [min(max(cx - ow / 2.0, 0.0), max(0.0, src_w - ow)) for cx in xs]\n    spread = (max(crop_xs) - min(crop_xs)) if crop_xs else 0.0\n\n    # Hold still unless the speaker moves far enough that a fixed crop would let\n    # them drift toward the frame edge. The crop is ~ow wide, so movement under\n    # ~40% of that still keeps the face comfortably framed → rock-steady crop.\n    if spread < ow * 0.40:\n        med = sorted(crop_xs)[len(crop_xs) // 2]\n        return f\"crop=w=ih*9/16:h=ih:x={int(round(med))}:y=0\", None\n\n    # Big movement → a gently-following crop driven by a sendcmd script.\n    lines = []\n    last = None\n    for t, x in zip(ts, crop_xs):\n        xi = int(round(x))\n        if last is None or abs(xi - last) >= 3:\n            lines.append(f\"{t:.2f} crop x {xi};\")\n            last = xi\n    cmds_name = f\"_cmds_{index:02d}.txt\"\n    with open(os.path.join(OUTPUT_DIR, cmds_name), \"w\", encoding=\"utf-8\") as fh:\n        fh.write(\"\\n\".join(lines))\n    x0 = int(round(crop_xs[0]))\n    return f\"sendcmd=f={cmds_name},crop=w=ih*9/16:h=ih:x={x0}:y=0\", cmds_name\n\n\ndef _make_end_card_png(text: str, index: int):\n    \"\"\"Build the CTA end-card image: brand-styled, bold, word-wrapped, centred on\n    a near-black backdrop with an electric-blue accent. Returns the PNG path, or\n    None if it can't be made.\"\"\"\n    try:\n        from PIL import Image, ImageDraw, ImageFont\n        import thumbnail  # reuse the cross-platform bold-font finder + word-wrap\n        W, H = 1080, 1920\n        img = Image.new(\"RGB\", (W, H), (13, 13, 18))\n        draw = ImageDraw.Draw(img)\n        fp = thumbnail._font_path()\n        size = 112\n        font = ImageFont.truetype(fp, size) if fp else ImageFont.load_default()\n        margin = int(W * 0.09)\n        max_w = W - 2 * margin\n        lines = thumbnail._wrap(text, font, max_w, draw)\n        while len(lines) > 5 and size > 56:\n            size -= 8\n            font = ImageFont.truetype(fp, size) if fp else font\n            lines = thumbnail._wrap(text, font, max_w, draw)\n        line_h = int(size * 1.25)\n        block_h = line_h * len(lines)\n        y = (H - block_h) // 2\n        # brand electric-blue accent bar above the text (#1400ff)\n        bar_w, bar_h = int(W * 0.16), 14\n        draw.rectangle([((W - bar_w) // 2, y - 70), ((W + bar_w) // 2, y - 70 + bar_h)], fill=(20, 0, 255))\n        for ln in lines:\n            w = draw.textlength(ln, font=font)\n            draw.text(((W - w) / 2, y), ln, font=font, fill=(255, 255, 255),\n                      stroke_width=max(3, size // 22), stroke_fill=(0, 0, 0))\n            y += line_h\n        out = os.path.join(OUTPUT_DIR, f\"_endcard_{index:02d}.png\")\n        img.save(out)\n        return out\n    except Exception:\n        return None\n\n\ndef _append_end_card(short_path: str, cta_text: str, index: int):\n    \"\"\"Append a ~3s CTA end card to the finished short. Best-effort + FAIL-SAFE:\n    on ANY problem the short is left exactly as it was (a short is never lost).\"\"\"\n    text = (cta_text or \"\").strip()\n    if not text:\n        return\n    png = _make_end_card_png(text, index)\n    if not png:\n        return\n    tmp = os.path.join(OUTPUT_DIR, f\"_withcard_{index:02d}.mp4\")\n    try:\n        fps = _video_fps(short_path) or 30.0\n        # Normalise both pieces to the short's fps + yuv420p + 48k stereo audio so\n        # the concat is glitch-free regardless of the source's quirks.\n        filt = (\n            \"[0:a]aformat=sample_rates=48000:channel_layouts=stereo[a0];\"\n            \"[2:a]aformat=sample_rates=48000:channel_layouts=stereo[a1];\"\n            f\"[1:v]scale=1080:1920,setsar=1,fps={fps:.4f},format=yuv420p[c];\"\n            f\"[0:v]fps={fps:.4f},setsar=1,format=yuv420p[v0];\"\n            \"[v0][a0][c][a1]concat=n=2:v=1:a=1[v][a]\"\n        )\n        cmd = [\n            FFMPEG,\n            \"-i\", os.path.basename(short_path),\n            \"-loop\", \"1\", \"-t\", \"3\", \"-i\", os.path.basename(png),\n            \"-f\", \"lavfi\", \"-t\", \"3\", \"-i\", \"anullsrc=channel_layout=stereo:sample_rate=48000\",\n            \"-filter_complex\", filt,\n            \"-map\", \"[v]\", \"-map\", \"[a]\",\n            \"-c:v\", \"libx264\", \"-preset\", \"veryfast\", \"-crf\", \"20\", \"-pix_fmt\", \"yuv420p\",\n            \"-c:a\", \"aac\", \"-b:a\", \"128k\", \"-movflags\", \"+faststart\", \"-y\",\n            os.path.basename(tmp),\n        ]\n        res = subprocess.run(cmd, capture_output=True, text=True, cwd=OUTPUT_DIR)\n        if res.returncode == 0 and os.path.isfile(tmp) and os.path.getsize(tmp) > 0:\n            os.replace(tmp, short_path)  # swap in the version that has the card\n    except Exception:\n        pass\n    finally:\n        for f in (png, tmp):\n            try:\n                if os.path.isfile(f):\n                    os.remove(f)\n            except OSError:\n                pass\n\n\ndef _run(source: str, clip: dict, index: int, transcript: dict | None, tighten: bool = False, layout: str = \"crop\", strength: str = \"medium\", cta: str = \"\"):\n    sub_name = None\n    cmds_name = None\n    try:\n        job.update(state=\"rendering\", message=\"Making your short...\", error=None, output=None)\n        start = float(clip[\"start\"])\n        duration = float(clip[\"end\"]) - float(clip[\"start\"])\n        fname = f\"{index + 1:02d}_{_safe_name(clip.get('title', 'clip'))}.mp4\"\n\n        # Work out cuts (deleted words always; pauses/fillers if tighten on) and\n        # the matching captions.\n        plan = None\n        if transcript is not None and \"sent_start\" in clip:\n            plan = tightener.compute(transcript, clip, tighten, source, strength)\n        has_cuts = bool(plan and plan[\"removed_seconds\"] >= 0.05 and plan[\"keep_ranges\"])\n\n        # Captions file (re-timed to any cuts).\n        if plan and plan[\"caption_words\"]:\n            sub_name = f\"_sub_{index:02d}.ass\"\n            with open(os.path.join(OUTPUT_DIR, sub_name), \"w\", encoding=\"utf-8\") as fh:\n                fh.write(_ass_from_words(plan[\"caption_words\"]))\n        subs = f\",subtitles={sub_name}\" if sub_name else \"\"\n\n        # Optional cut (deleted words / tighten). Video = clean hard jump-cuts\n        # (standard for shorts). Audio = short CROSSFADES at each join, so there's\n        # no click AND no dip-to-silence \"hiccup\". The video pieces are trimmed by\n        # the same overlap the audio crossfades, so audio and video stay in sync.\n        if has_cuts:\n            fr = 1.0 / _video_fps(source)  # one frame, in seconds\n            def _snap(x):\n                return round(x / fr) * fr\n            ranges = [[_snap(a), _snap(b)] for a, b in plan[\"keep_ranges\"]]\n            # Drop slivers shorter than ~2 frames: ffmpeg's trim/crossfade can't\n            # render them and they're imperceptible (prevents a render failure on\n            # a tiny keep-range). Keep the originals if that would remove them all.\n            _clean = [r for r in ranges if (r[1] - r[0]) >= 2 * fr]\n            if _clean:\n                ranges = _clean\n            ranges[-1][1] = _snap(ranges[-1][1] + _TAIL_PAD)  # let the last word finish\n            n = len(ranges)\n            durs = [b - a for a, b in ranges]\n            # crossfade length per join (n-1): a few frames, never as long as a piece\n            joins = []\n            for k in range(n - 1):\n                dj = _snap(min(0.030, durs[k] / 2.2, durs[k + 1] / 2.2))\n                dj = max(fr, min(dj, durs[k] - fr, durs[k + 1] - fr))\n                joins.append(dj)\n\n            # Video: hard jump-cuts. Each piece drops its join-length off the end so\n            # the video loses exactly what the audio crossfade overlaps -> stays synced.\n            vparts = []\n            if n > 1:\n                vparts.append(f\"[0:v]split={n}\" + \"\".join(f\"[vin{k}]\" for k in range(n)))\n            for k, (a, b) in enumerate(ranges):\n                ve = b - (joins[k] if k < n - 1 else 0.0)\n                src = f\"[vin{k}]\" if n > 1 else \"[0:v]\"\n                vparts.append(f\"{src}trim={a:.4f}:{ve:.4f},setpts=PTS-STARTPTS[tv{k}]\")\n            if n > 1:\n                vparts.append(\"\".join(f\"[tv{k}]\" for k in range(n)) + f\"concat=n={n}:v=1:a=0[vc]\")\n                vbase = \"[vc]\"\n            else:\n                vbase = \"[tv0]\"\n            vcut_graph = \";\".join(vparts) + \";\"\n\n            # Audio: trim full pieces, then crossfade-chain them (smooth — no click,\n            # no dip-to-silence).\n            aparts = []\n            if n > 1:\n                aparts.append(f\"[0:a]asplit={n}\" + \"\".join(f\"[ain{k}]\" for k in range(n)))\n            for k, (a, b) in enumerate(ranges):\n                src = f\"[ain{k}]\" if n > 1 else \"[0:a]\"\n                aparts.append(f\"{src}atrim={a:.4f}:{b:.4f},asetpts=PTS-STARTPTS[aa{k}]\")\n            if n > 1:\n                prev = \"[aa0]\"\n                for k in range(1, n):\n                    out = \"[a]\" if k == n - 1 else f\"[ax{k}]\"\n                    aparts.append(f\"{prev}[aa{k}]acrossfade=d={joins[k - 1]:.4f}:c1=tri:c2=tri{out}\")\n                    prev = out\n            else:\n                aparts.append(\"[aa0]anull[a]\")\n            achain = \";\".join(aparts)\n        else:\n            vcut_graph = \"\"\n            vbase = \"[0:v]\"\n            achain = None\n\n        if layout == \"fit\":\n            # Whole 16:9 frame fit into 9:16 over a blurred fill — keeps the screen\n            # AND the webcam visible (good for screen recordings / Live Previews).\n            vchain = vcut_graph + (\n                f\"{vbase}split=2[bg][fg];\"\n                f\"[bg]scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920,boxblur=18:1[bgb];\"\n                f\"[fg]scale=1080:1920:force_original_aspect_ratio=decrease[fgs];\"\n                f\"[bgb][fgs]overlay=(W-w)/2:(H-h)/2{subs}[v]\"\n            )\n        else:\n            # Talking-head: crop to 9:16, following the face.\n            if has_cuts:\n                crop_part = _fixed_crop(source, start, duration)\n            else:\n                crop_part, cmds_name = _build_crop(source, start, duration, index)\n            vchain = vcut_graph + f\"{vbase}{crop_part},scale=1080:1920,setsar=1{subs}[v]\"\n\n        filtergraph = vchain + (\";\" + achain if achain else \"\")\n        cmd = [\n            FFMPEG, \"-ss\", str(start), \"-i\", source, \"-t\", str(duration + _TAIL_PAD),\n            \"-filter_complex\", filtergraph,\n            \"-map\", \"[v]\", \"-map\", \"[a]\" if achain else \"0:a\",\n            \"-c:v\", \"libx264\", \"-preset\", \"veryfast\", \"-crf\", \"20\", \"-pix_fmt\", \"yuv420p\",\n            \"-c:a\", \"aac\", \"-b:a\", \"128k\",\n            \"-movflags\", \"+faststart\", \"-y\", fname,\n        ]\n        # Run from the output folder so the subtitles filter finds the .ass by name\n        # (avoids Windows path-escaping headaches in the FFmpeg filter).\n        res = subprocess.run(cmd, capture_output=True, text=True, cwd=OUTPUT_DIR)\n        if res.returncode != 0:\n            raise RuntimeError((res.stderr or \"ffmpeg failed\")[-600:])\n\n        # Optional CTA end card (best-effort; a card hiccup never breaks the short).\n        if cta:\n            _append_end_card(os.path.join(OUTPUT_DIR, fname), cta, index)\n\n        job.update(state=\"done\", message=\"Done!\", output=fname)\n    except Exception as e:  # noqa: BLE001\n        job.update(state=\"error\", error=str(e), message=\"Render failed.\")\n    finally:\n        for tmp in (sub_name, cmds_name):\n            if tmp:\n                try:\n                    os.remove(os.path.join(OUTPUT_DIR, tmp))\n                except OSError:\n                    pass\n\n\ndef start(source: str, clip: dict, index: int, transcript: dict | None = None, tighten: bool = False, layout: str = \"crop\", strength: str = \"medium\", cta: str = \"\"):\n    \"\"\"Render one clip in the background. Returns False if already busy.\"\"\"\n    with _lock:\n        if job[\"state\"] == \"rendering\":\n            return False\n        job.update(state=\"rendering\", message=\"Starting...\", error=None, output=None, index=index)\n    threading.Thread(target=_run, args=(source, clip, index, transcript, tighten, layout, strength, cta), daemon=True).start()\n    return True\n","requirements-nvidia.txt":"# NVIDIA GPU acceleration for transcription (installed only on machines with an\n# NVIDIA card). faster-whisper/ctranslate2 use these for CUDA; on other machines\n# the app falls back to the CPU and these are not installed.\nnvidia-cublas-cu12==12.9.2.10\nnvidia-cudnn-cu12==9.23.2.1\nnvidia-cuda-runtime-cu12==12.9.79\n","requirements.txt":"# Clip Wizard backend — base dependencies (Python 3.12 recommended).\n# NVIDIA GPU extras are in requirements-nvidia.txt, installed by the setup\n# script only when an NVIDIA card is present.\nfastapi==0.137.2\nuvicorn[standard]==0.49.0\nfaster-whisper==1.2.1\nopencv-python-headless==4.13.0.92\nyt-dlp==2026.6.9\nPillow==12.2.0\n","store.py":"\"\"\"Remember the user's work between sessions.\n\nSaves the current video, transcript and clips to a small file on disk so closing\nand reopening the app (or restarting the backend) doesn't lose anything.\n\nKeeps rolling backups so a stray action (e.g. loading a new video) can't wipe\nhard-won transcript edits beyond recovery.\n\"\"\"\n\nimport json\nimport os\nimport shutil\nimport time\n\n_DIR = os.path.dirname(os.path.dirname(__file__))\n_FILE = os.path.join(_DIR, \"session_state.json\")\n_BACKUP_DIR = os.path.join(_DIR, \"session_backups\")\n_MAX_BACKUPS = 15\n\n\ndef _snapshot():\n    \"\"\"Copy the current state file into session_backups/ before it's overwritten,\n    but only if it actually held a transcript (so we keep meaningful work).\"\"\"\n    try:\n        if not os.path.exists(_FILE):\n            return\n        with open(_FILE, encoding=\"utf-8\") as f:\n            prev = json.load(f)\n        if not prev.get(\"transcript\"):\n            return  # nothing worth keeping\n        os.makedirs(_BACKUP_DIR, exist_ok=True)\n        stamp = time.strftime(\"%Y%m%d_%H%M%S\")\n        shutil.copy2(_FILE, os.path.join(_BACKUP_DIR, f\"session_{stamp}.json\"))\n        backups = sorted(\n            os.path.join(_BACKUP_DIR, n) for n in os.listdir(_BACKUP_DIR) if n.endswith(\".json\")\n        )\n        for old in backups[:-_MAX_BACKUPS]:\n            try:\n                os.remove(old)\n            except OSError:\n                pass\n    except Exception:\n        pass  # backups are best-effort, never break saving\n\n\ndef save_state(data: dict):\n    try:\n        _snapshot()  # keep the previous state recoverable\n        tmp = _FILE + \".tmp\"\n        with open(tmp, \"w\", encoding=\"utf-8\") as f:\n            json.dump(data, f)\n        os.replace(tmp, _FILE)  # atomic — an interrupted save can't corrupt it\n    except Exception:\n        pass  # never let saving break the app\n\n\ndef load_state():\n    try:\n        if os.path.exists(_FILE):\n            with open(_FILE, encoding=\"utf-8\") as f:\n                return json.load(f)\n    except Exception:\n        return None\n    return None\n\n\ndef list_backups() -> list:\n    \"\"\"Most-recent-first list of backup files (for recovery).\"\"\"\n    try:\n        if not os.path.isdir(_BACKUP_DIR):\n            return []\n        names = [n for n in os.listdir(_BACKUP_DIR) if n.endswith(\".json\")]\n        return sorted((os.path.join(_BACKUP_DIR, n) for n in names), reverse=True)\n    except Exception:\n        return []\n","thumbnail.py":"\"\"\"Make a thumbnail image for a clip: a clean vertical frame (face-cropped, no\ncaptions) with a bold title overlaid. Saved next to the rendered short.\"\"\"\n\nimport os\nimport subprocess\n\nimport renderer  # reuse the bundled ffmpeg + the same face-crop as the render\n\nOUTPUT_DIR = renderer.OUTPUT_DIR\nFFMPEG = renderer.FFMPEG\n\n# Prefer Trebuchet MS Bold (matches the captions), then sensible fallbacks,\n# across Windows / Mac / Linux.\n_FONTS = [\n    # Windows\n    r\"C:\\Windows\\Fonts\\trebucbd.ttf\",\n    r\"C:\\Windows\\Fonts\\arialbd.ttf\",\n    r\"C:\\Windows\\Fonts\\segoeuib.ttf\",\n    # macOS — bold/brand first, then ALWAYS-present system faces, so a clean Mac\n    # (no Microsoft fonts) never falls back to PIL's tiny default font.\n    \"/System/Library/Fonts/Supplemental/Trebuchet MS Bold.ttf\",\n    \"/System/Library/Fonts/Supplemental/Arial Bold.ttf\",\n    \"/System/Library/Fonts/HelveticaNeue.ttc\",\n    \"/System/Library/Fonts/Helvetica.ttc\",\n    # Linux\n    \"/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf\",\n]\n\n\ndef _font_path():\n    for f in _FONTS:\n        if os.path.isfile(f):\n            return f\n    return None\n\n\ndef _wrap(text, font, max_w, draw):\n    \"\"\"Word-wrap `text` so each line fits within max_w pixels.\"\"\"\n    words = text.split()\n    lines, cur = [], \"\"\n    for w in words:\n        trial = (cur + \" \" + w).strip()\n        if draw.textlength(trial, font=font) <= max_w or not cur:\n            cur = trial\n        else:\n            lines.append(cur)\n            cur = w\n    if cur:\n        lines.append(cur)\n    return lines\n\n\ndef make_thumbnail(source: str, clip: dict, index: int, title: str | None = None,\n                   offset: float | None = None, layout: str = \"crop\", position: str = \"bottom\") -> str:\n    \"\"\"Grab a frame, overlay the title, save as a JPG. Returns the filename.\n\n    `layout` matches the render: \"crop\" (talking-head, face-cropped) or \"fit\"\n    (whole frame on a blurred fill — for screen recordings).\n    `position` = \"bottom\" (default) or \"top\" — keeps the title off the face.\"\"\"\n    from PIL import Image, ImageDraw, ImageFont\n\n    base = float(clip[\"start\"])\n    dur = float(clip[\"end\"]) - base\n    if offset is None:\n        offset = dur / 3.0  # a frame a third of the way in is usually a good one\n    offset = max(0.0, min(dur, float(offset)))\n    t = base + offset\n\n    tmp = os.path.join(OUTPUT_DIR, f\"_thumbframe_{index}.png\")\n    if layout == \"fit\":\n        bg = \"scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920,boxblur=18:1\"\n        fg = \"scale=1080:1920:force_original_aspect_ratio=decrease\"\n        fc = f\"[0:v]split=2[b][f];[b]{bg}[bb];[f]{fg}[ff];[bb][ff]overlay=(W-w)/2:(H-h)/2[v]\"\n        cmd = [FFMPEG, \"-ss\", f\"{t:.3f}\", \"-i\", source, \"-frames:v\", \"1\",\n               \"-filter_complex\", fc, \"-map\", \"[v]\", \"-y\", tmp]\n    else:\n        crop = renderer._fixed_crop(source, base, dur)\n        cmd = [FFMPEG, \"-ss\", f\"{t:.3f}\", \"-i\", source, \"-frames:v\", \"1\",\n               \"-vf\", f\"{crop},scale=1080:1920\", \"-y\", tmp]\n    r = subprocess.run(cmd, capture_output=True, text=True)\n    if r.returncode != 0 or not os.path.isfile(tmp):\n        raise RuntimeError((r.stderr or \"Could not grab a frame.\")[-400:])\n\n    try:\n        img = Image.open(tmp).convert(\"RGB\")\n        W, H = img.size  # 1080 x 1920\n\n        text = (title if title is not None else clip.get(\"title\") or \"\").strip()\n        if text:\n            fp = _font_path()\n            size = 120\n            font = ImageFont.truetype(fp, size) if fp else ImageFont.load_default()\n            margin = int(W * 0.07)\n            max_w = W - 2 * margin\n\n            # Shrink the font until it fits in at most 4 lines.\n            draw = ImageDraw.Draw(img)\n            lines = _wrap(text, font, max_w, draw)\n            while len(lines) > 4 and size > 60:\n                size -= 8\n                font = ImageFont.truetype(fp, size) if fp else font\n                lines = _wrap(text, font, max_w, draw)\n\n            line_h = int(size * 1.15)\n            block_h = line_h * len(lines)\n            # Keep the title clear of the face: near the bottom (default) or top.\n            if position == \"top\":\n                y0 = int(H * 0.06)\n            else:\n                y0 = H - int(H * 0.06) - block_h\n\n            # Dark gradient band behind the text for guaranteed readability.\n            band = Image.new(\"RGBA\", (W, H), (0, 0, 0, 0))\n            bd = ImageDraw.Draw(band)\n            pad = int(size * 0.5)\n            top = max(0, y0 - pad)\n            bottom = min(H, y0 + block_h + pad)\n            for y in range(top, bottom):\n                # fade in/out at the edges of the band\n                d = min(y - top, bottom - y)\n                a = int(min(150, d * 6))\n                bd.line([(0, y), (W, y)], fill=(0, 0, 0, a))\n            img = Image.alpha_composite(img.convert(\"RGBA\"), band).convert(\"RGB\")\n\n            draw = ImageDraw.Draw(img)\n            y = y0\n            for ln in lines:\n                w = draw.textlength(ln, font=font)\n                x = (W - w) / 2\n                draw.text((x, y), ln, font=font, fill=(255, 255, 255),\n                          stroke_width=max(4, size // 18), stroke_fill=(0, 0, 0))\n                y += line_h\n\n        safe = renderer._safe_name(clip.get(\"title\") or \"clip\")\n        out_name = f\"{index + 1:02d}_{safe}_thumb.jpg\"\n        img.save(os.path.join(OUTPUT_DIR, out_name), quality=90)\n    finally:\n        try:\n            os.remove(tmp)\n        except OSError:\n            pass\n\n    return out_name\n","tidy.py":"\"\"\"Tidy a WHOLE video: remove long pauses + filler words across its full length,\nproducing a cleaned full-size MP4 (no crop, no captions). Then the cleaned video\n+ a re-timed transcript become the source for making shorts.\n\"\"\"\n\nimport difflib\nimport os\nimport subprocess\nimport threading\n\nimport renderer\nimport tightener\nimport transcriber\n\nOUTPUT_DIR = renderer.OUTPUT_DIR\nFFMPEG = renderer.FFMPEG\n\n# Progress the API reads while tidying.\njob: dict = {\n    \"state\": \"idle\",       # idle | planning | rendering | done | error\n    \"progress\": 0,\n    \"message\": \"\",\n    \"error\": None,\n    \"removed_seconds\": None,\n    \"new_duration\": None,\n    \"original_duration\": None,\n}\n\n# Set when a tidy finishes: the cleaned video + its re-timed transcript.\nresult: dict | None = None\n\n_lock = threading.Lock()\n\n\ndef plan(transcript: dict, source: str | None = None, strength: str = \"medium\") -> dict:\n    \"\"\"Whole-video keep-ranges + a transcript re-timed to the cleaned video.\"\"\"\n    segs = transcript[\"segments\"]\n    total_dur = float(transcript.get(\"duration\") or 0.0)\n\n    flat = []\n    for seg in segs:\n        for w in (seg.get(\"words\") or []):\n            flat.append({\"start\": float(w[\"start\"]), \"end\": float(w[\"end\"]), \"word\": w[\"word\"]})\n    if total_dur <= 0 and flat:\n        total_dur = flat[-1][\"end\"]\n\n    removals = []\n    filler_count = 0\n    pause_count = 0\n    deleted_count = 0\n\n    # Words/sentences the user DELETED in the transcript editor -> always cut.\n    # (An unedited sentence's text is built from its words, so nothing is flagged\n    #  unless the user actually removed something.)\n    for seg in segs:\n        _kept, deleted = tightener._aligned(seg)\n        for s, e in deleted:\n            if e > s:\n                removals.append((s, e))\n                deleted_count += 1\n\n    # Auto-trim pauses + fillers too, unless the user chose \"edits only\" (off).\n    if strength != \"off\":\n        for w in flat:\n            if tightener._is_filler(w[\"word\"]):\n                removals.append((w[\"start\"], w[\"end\"]))\n                filler_count += 1\n        if source:\n            for s, e in tightener.audio_silences(source, 0.0, total_dur, strength):\n                rs = s + 0.10\n                re_ = e - 0.10\n                if re_ - rs > 0.05:\n                    removals.append((rs, re_))\n                    pause_count += 1\n\n    removals = tightener._merge([(s, e) for s, e in removals if e > s])\n    total_removed = sum(e - s for s, e in removals)\n\n    keep = []\n    cur = 0.0\n    for s, e in removals:\n        if s > cur:\n            keep.append((cur, s))\n        cur = max(cur, e)\n    if cur < total_dur:\n        keep.append((cur, total_dur))\n\n    def removed_before(x: float) -> float:\n        tot = 0.0\n        for s, e in removals:\n            if e <= x:\n                tot += e - s\n            elif s < x:\n                tot += x - s\n        return tot\n\n    def overlap(rs: float, re_: float) -> float:\n        tot = 0.0\n        for s, e in removals:\n            lo = max(rs, s)\n            hi = min(re_, e)\n            if hi > lo:\n                tot += hi - lo\n        return tot\n\n    new_segs = []\n    for seg in segs:\n        kept = []\n        for w in (seg.get(\"words\") or []):\n            ws, we = float(w[\"start\"]), float(w[\"end\"])\n            if tightener._is_filler(w[\"word\"]):\n                continue\n            if (we - ws) > 0 and overlap(ws, we) > 0.5 * (we - ws):\n                continue\n            kept.append({\"start\": ws - removed_before(ws), \"end\": we - removed_before(we), \"word\": w[\"word\"]})\n        if not kept:\n            continue\n        text = \"\".join(x[\"word\"] for x in kept).strip()\n        new_segs.append({\"start\": kept[0][\"start\"], \"end\": kept[-1][\"end\"], \"text\": text, \"words\": kept})\n\n    new_transcript = {\n        \"language\": transcript.get(\"language\"),\n        \"duration\": round(total_dur - total_removed, 1),\n        \"text\": \" \".join(s[\"text\"] for s in new_segs).strip(),\n        \"segments\": new_segs,\n    }\n\n    return {\n        \"keep_ranges\": keep,\n        \"new_transcript\": new_transcript,\n        \"removed_seconds\": round(total_removed, 1),\n        \"pause_count\": pause_count,\n        \"filler_count\": filler_count,\n        \"deleted_count\": deleted_count,\n        \"new_duration\": round(total_dur - total_removed, 1),\n        \"original_duration\": round(total_dur, 1),\n    }\n\n\n# Process this many cut-ranges per FFmpeg pass. One giant filter with hundreds\n# of ranges runs FFmpeg out of memory, so we do it in small batches and stitch.\n_BATCH = 30\n\n\ndef _render(source: str, keep_ranges: list, out_path: str, total_kept: float):\n    \"\"\"Re-encode keeping only the wanted ranges, in batches, then stitch.\"\"\"\n    batches = [keep_ranges[i:i + _BATCH] for i in range(0, len(keep_ranges), _BATCH)]\n    parts = []\n    done_kept = 0.0\n    try:\n        for bi, batch in enumerate(batches):\n            b_start = batch[0][0]\n            b_end = batch[-1][1]\n            b_dur = b_end - b_start\n            # Ranges relative to this batch's start (accurate seek makes t=0 = b_start).\n            sel = \"+\".join(f\"between(t,{a - b_start:.3f},{b - b_start:.3f})\" for a, b in batch)\n            script = (\n                f\"[0:v]select='{sel}',setpts=N/FRAME_RATE/TB[v];\\n\"\n                f\"[0:a]aselect='{sel}',asetpts=N/SR/TB[a]\"\n            )\n            sp = os.path.join(OUTPUT_DIR, f\"_tidy_b{bi}.txt\")\n            with open(sp, \"w\", encoding=\"utf-8\") as fh:\n                fh.write(script)\n            part = os.path.join(OUTPUT_DIR, f\"_tidy_part_{bi:03d}.mp4\")\n            cmd = [\n                FFMPEG, \"-ss\", f\"{b_start:.3f}\", \"-t\", f\"{b_dur:.3f}\", \"-i\", source,\n                \"-filter_complex_script\", sp, \"-map\", \"[v]\", \"-map\", \"[a]\",\n                \"-c:v\", \"libx264\", \"-preset\", \"veryfast\", \"-crf\", \"20\", \"-pix_fmt\", \"yuv420p\",\n                \"-c:a\", \"aac\", \"-b:a\", \"160k\", \"-y\", part,\n            ]\n            r = subprocess.run(cmd, capture_output=True, text=True, cwd=OUTPUT_DIR)\n            try:\n                os.remove(sp)\n            except OSError:\n                pass\n            if r.returncode != 0:\n                raise RuntimeError((r.stderr or \"ffmpeg failed\")[-600:])\n            parts.append(part)\n            done_kept += sum(e - s for s, e in batch)\n            if total_kept > 0:\n                job[\"progress\"] = min(98, int(done_kept / total_kept * 100))\n\n        # Stitch the parts together (fast — no re-encode).\n        listfile = os.path.join(OUTPUT_DIR, \"_tidy_list.txt\")\n        with open(listfile, \"w\", encoding=\"utf-8\") as fh:\n            for p in parts:\n                fh.write(f\"file '{os.path.basename(p)}'\\n\")\n        cmd2 = [FFMPEG, \"-f\", \"concat\", \"-safe\", \"0\", \"-i\", listfile,\n                \"-c\", \"copy\", \"-movflags\", \"+faststart\", \"-y\", out_path]\n        r2 = subprocess.run(cmd2, capture_output=True, text=True, cwd=OUTPUT_DIR)\n        try:\n            os.remove(listfile)\n        except OSError:\n            pass\n        if r2.returncode != 0:\n            raise RuntimeError((r2.stderr or \"concat failed\")[-600:])\n    finally:\n        for p in parts:\n            try:\n                os.remove(p)\n            except OSError:\n                pass\n\n\ndef _carry_corrections(old_transcript: dict, new_tr: dict) -> dict:\n    \"\"\"Re-apply the user's word-fixes onto the freshly re-transcribed cleaned\n    video, so corrections made on the first transcript aren't lost.\n\n    The cleaned video is re-transcribed for accurate caption timing, which\n    re-introduces Whisper's original wording. The user's edited text is the\n    reviewed 'truth' for the kept audio, so we line it up against the new word\n    timings and keep the user's spelling wherever the fresh pass differs.\n    \"\"\"\n    her_words = []\n    for seg in old_transcript.get(\"segments\", []):\n        kept, _deleted = tightener._aligned(seg)\n        for w in kept:\n            t = (w.get(\"text\") or \"\").strip()\n            if t:\n                her_words.append(t)\n    new_flat = [w for seg in new_tr.get(\"segments\", []) for w in (seg.get(\"words\") or [])]\n    if not her_words or not new_flat:\n        return new_tr\n\n    a = [tightener._norm(t) for t in her_words]\n    b = [tightener._norm(w[\"word\"]) for w in new_flat]\n    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(a=a, b=b, autojunk=False).get_opcodes():\n        # Where the fresh transcription differs from the user's reviewed wording\n        # over a like-for-like span, adopt the user's spelling (keep the timing).\n        if tag == \"replace\" and (i2 - i1) == (j2 - j1):\n            for k in range(i2 - i1):\n                w = new_flat[j1 + k]\n                lead = \" \" if str(w[\"word\"]).startswith(\" \") else \"\"\n                w[\"word\"] = lead + her_words[i1 + k]\n\n    for seg in new_tr.get(\"segments\", []):\n        seg[\"text\"] = \"\".join(w[\"word\"] for w in (seg.get(\"words\") or [])).strip()\n    new_tr[\"text\"] = \" \".join(s[\"text\"] for s in new_tr.get(\"segments\", [])).strip()\n    return new_tr\n\n\ndef _run(source: str, transcript: dict, strength: str = \"medium\"):\n    global result\n    try:\n        job.update(state=\"planning\", progress=0, message=\"Working out what to trim...\", error=None)\n        p = plan(transcript, source, strength)\n        job[\"original_duration\"] = p[\"original_duration\"]\n        job[\"removed_seconds\"] = p[\"removed_seconds\"]\n        job[\"new_duration\"] = p[\"new_duration\"]\n\n        if not p[\"keep_ranges\"] or p[\"removed_seconds\"] < 0.5:\n            result = None\n            job.update(state=\"done\", progress=100, message=\"Nothing worth trimming.\")\n            return\n\n        name = renderer._safe_name(os.path.splitext(os.path.basename(source))[0])\n        out_path = os.path.join(OUTPUT_DIR, f\"tidied_{name}.mp4\")\n        total_kept = sum(e - s for s, e in p[\"keep_ranges\"])\n\n        job.update(state=\"rendering\", message=\"Tidying the full video (this takes a while)...\")\n        _render(source, p[\"keep_ranges\"], out_path, total_kept)\n\n        # Re-transcribe the CLEANED video so captions match it exactly (the\n        # estimated re-timing drifts; measuring the real timings is accurate).\n        job.update(state=\"transcribing\", progress=0,\n                   message=\"Re-transcribing the cleaned video for perfect captions...\")\n        new_tr, _dev = transcriber.transcribe_file(out_path, on_progress=lambda pr: job.update(progress=pr))\n        # Carry the user's word-fixes from the first transcript onto the cleaned\n        # one, so they never have to redo them.\n        new_tr = _carry_corrections(transcript, new_tr)\n\n        size = os.path.getsize(out_path)\n        result = {\n            \"path\": out_path,\n            \"name\": os.path.basename(out_path),\n            \"size\": f\"{size / (1024 * 1024):.1f} MB\",\n            \"transcript\": new_tr,\n            \"new_duration\": p[\"new_duration\"],\n        }\n        job.update(state=\"done\", progress=100, message=\"Done!\")\n    except Exception as e:  # noqa: BLE001\n        job.update(state=\"error\", error=str(e), message=\"Tidy failed.\")\n\n\ndef start(source: str, transcript: dict, strength: str = \"medium\"):\n    global result\n    with _lock:\n        if job[\"state\"] in (\"planning\", \"rendering\", \"transcribing\"):\n            return False\n        result = None\n        job.update(state=\"planning\", progress=0, message=\"Starting...\", error=None)\n    threading.Thread(target=_run, args=(source, transcript, strength), daemon=True).start()\n    return True\n","tightener.py":"\"\"\"Work out what to cut from a clip and how the captions should read.\n\nTwo sources of cuts:\n1. Words the user DELETED in the editor (compare edited text vs the original\n   spoken words) — these are cut from the video so they're truly gone.\n2. Optional 'tighten': long pauses + obvious fillers (um/uh).\n\nA word that was merely CHANGED (typo fix) is kept in the audio — only its\ncaption spelling updates. Produces keep-ranges for FFmpeg and re-timed caption\nwords for the shortened timeline. Nothing is cut until the user renders.\n\"\"\"\n\nimport difflib\nimport glob\nimport os\nimport subprocess\nimport tempfile\n\nFILLERS = {\"um\", \"umm\", \"uh\", \"uhh\", \"uhm\", \"erm\", \"er\", \"ah\", \"ahh\", \"hmm\", \"mm\", \"mmm\", \"mhm\"}\n\nPAUSE_THRESHOLD = 0.6\nKEEP_GAP = 0.15\n\n# Find the bundled ffmpeg (don't import renderer — avoids a circular import).\n# Match ffmpeg.exe (Windows) or ffmpeg (Mac/Linux).\n_tools = os.path.join(os.path.dirname(os.path.dirname(__file__)), \"tools\")\n_ff = (glob.glob(os.path.join(_tools, \"**\", \"ffmpeg.exe\"), recursive=True)\n       or glob.glob(os.path.join(_tools, \"**\", \"ffmpeg\"), recursive=True))\n_FFMPEG = _ff[0] if _ff else \"ffmpeg\"\n\n# Trim strength → (how far below the recording's average loudness counts as a\n# pause, shorter = more aggressive; minimum pause length to cut).\n_STRENGTH = {\n    \"gentle\": (9.0, 0.5),\n    \"medium\": (6.0, 0.35),\n    \"strong\": (3.0, 0.25),\n}\n\n\ndef _mean_volume(source: str, start: float, duration: float) -> float:\n    \"\"\"Average loudness (dB) of the audio, so the silence cut-off can adapt to\n    each recording. Falls back to -23 dB if it can't be measured.\"\"\"\n    cmd = [_FFMPEG, \"-ss\", f\"{start:.3f}\", \"-t\", f\"{duration:.3f}\", \"-i\", source,\n           \"-af\", \"volumedetect\", \"-f\", \"null\", \"-\"]\n    try:\n        r = subprocess.run(cmd, capture_output=True, text=True)\n    except Exception:\n        return -23.0\n    for line in (r.stderr or \"\").splitlines():\n        if \"mean_volume:\" in line:\n            try:\n                return float(line.split(\"mean_volume:\")[1].strip().split()[0])\n            except (ValueError, IndexError):\n                pass\n    return -23.0\n\n\ndef audio_silences(source: str, start: float, duration: float, strength: str = \"medium\") -> list:\n    \"\"\"Detect real silent gaps in the audio (relative to `start`), via FFmpeg.\n\n    Adapts the silence cut-off to the recording's own loudness, scaled by\n    `strength` (gentle/medium/strong). \"off\" = no pause trimming. Returns\n    [(start, end), ...], 0-based.\n    \"\"\"\n    if strength == \"off\":\n        return []\n    margin, min_d = _STRENGTH.get(strength, _STRENGTH[\"medium\"])\n    mean = _mean_volume(source, start, duration)\n    noise = max(-45.0, min(-16.0, mean - margin))\n    cmd = [_FFMPEG, \"-ss\", f\"{start:.3f}\", \"-t\", f\"{duration:.3f}\", \"-i\", source,\n           \"-af\", f\"silencedetect=noise={noise:.1f}dB:d={min_d}\", \"-f\", \"null\", \"-\"]\n    try:\n        r = subprocess.run(cmd, capture_output=True, text=True)\n    except Exception:\n        return []\n    silences = []\n    cs = None\n    for line in (r.stderr or \"\").splitlines():\n        if \"silence_start:\" in line:\n            try:\n                cs = float(line.split(\"silence_start:\")[1].strip().split()[0])\n            except (ValueError, IndexError):\n                cs = None\n        elif \"silence_end:\" in line and cs is not None:\n            try:\n                ce = float(line.split(\"silence_end:\")[1].split(\"|\")[0].strip())\n                silences.append((cs, ce))\n            except (ValueError, IndexError):\n                pass\n            cs = None\n    return silences\n\n\ndef _norm(s: str) -> str:\n    return \"\".join(c for c in s.lower() if c.isalnum())\n\n\ndef _is_filler(text: str) -> bool:\n    return _norm(text) in FILLERS\n\n\ndef _merge(ranges: list) -> list:\n    if not ranges:\n        return []\n    ranges = sorted(ranges)\n    out = [list(ranges[0])]\n    for a, b in ranges[1:]:\n        if a <= out[-1][1]:\n            out[-1][1] = max(out[-1][1], b)\n        else:\n            out.append([a, b])\n    return [(a, b) for a, b in out]\n\n\ndef _aligned(seg: dict):\n    \"\"\"Align the edited text to the original spoken words.\n\n    Returns (kept_words, deleted_ranges):\n      kept_words    = [{text, start, end}] words to keep (edited spelling, original timing)\n      deleted_ranges = [(start, end)] original words removed by the user -> cut these\n    \"\"\"\n    orig = seg.get(\"words\") or []\n    edited = seg[\"text\"].split()\n\n    if not orig:\n        start, end = float(seg[\"start\"]), float(seg[\"end\"])\n        if not edited:\n            return [], []\n        span = max(0.01, end - start) / len(edited)\n        return ([{\"text\": edited[i], \"start\": start + i * span, \"end\": start + (i + 1) * span}\n                 for i in range(len(edited))], [])\n\n    o_norm = [_norm(w[\"word\"]) for w in orig]\n    e_norm = [_norm(t) for t in edited]\n    sm = difflib.SequenceMatcher(a=o_norm, b=e_norm, autojunk=False)\n\n    kept = []\n    deleted = []\n    for tag, i1, i2, j1, j2 in sm.get_opcodes():\n        if tag == \"equal\":\n            for k in range(i2 - i1):\n                ow = orig[i1 + k]\n                kept.append({\"text\": edited[j1 + k], \"start\": float(ow[\"start\"]), \"end\": float(ow[\"end\"])})\n        elif tag == \"replace\":\n            ostart = float(orig[i1][\"start\"])\n            oend = float(orig[i2 - 1][\"end\"])\n            n = j2 - j1\n            if n == 0:\n                deleted.append((ostart, oend))\n            else:\n                span = (oend - ostart) / n\n                for k in range(n):\n                    kept.append({\"text\": edited[j1 + k], \"start\": ostart + k * span, \"end\": ostart + (k + 1) * span})\n        elif tag == \"delete\":\n            deleted.append((float(orig[i1][\"start\"]), float(orig[i2 - 1][\"end\"])))\n        elif tag == \"insert\":\n            anchor = float(orig[i1][\"start\"]) if i1 < len(orig) else float(orig[-1][\"end\"])\n            for k in range(j2 - j1):\n                kept.append({\"text\": edited[j1 + k], \"start\": anchor, \"end\": anchor + 0.3})\n\n    return kept, deleted\n\n\n# Cache fresh clip transcriptions so re-rendering the same clip is instant.\n# (Audio doesn't change with text edits, so caching by time-range is safe.)\n_clip_word_cache: dict = {}\n\n\ndef _clip_words_from_audio(source: str, base: float, dur: float) -> list:\n    \"\"\"Re-transcribe JUST this clip's audio to get word timings that line up with\n    the actual sound. The whole-video transcript drifts over long recordings, so\n    cuts based on it land in the wrong place; a short clip doesn't drift.\n\n    Returns [{word, start, end}] with times relative to `base` (0 = clip start),\n    matching the render's own `-ss base` reference exactly.\n    \"\"\"\n    key = (source, round(base, 2), round(dur, 2))\n    if key in _clip_word_cache:\n        return _clip_word_cache[key]\n    import transcriber  # local import avoids any import cycle\n    tmp = os.path.join(tempfile.gettempdir(), f\"_clipaud_{int(base * 1000)}_{int(dur * 1000)}.wav\")\n    cmd = [_FFMPEG, \"-ss\", f\"{base:.3f}\", \"-t\", f\"{dur:.3f}\", \"-i\", source,\n           \"-vn\", \"-ac\", \"1\", \"-ar\", \"16000\", \"-y\", tmp]\n    try:\n        subprocess.run(cmd, capture_output=True)\n        tr, _ = transcriber.transcribe_file(tmp)\n        words = [\n            {\"word\": w[\"word\"], \"start\": float(w[\"start\"]), \"end\": float(w[\"end\"])}\n            for s in tr[\"segments\"] for w in (s.get(\"words\") or [])\n        ]\n    except Exception:\n        words = []\n    finally:\n        try:\n            os.remove(tmp)\n        except OSError:\n            pass\n    _clip_word_cache[key] = words\n    return words\n\n\ndef _align_words(orig: list, edited: list, base: float = 0.0):\n    \"\"\"Align the user's edited word list to the actually-spoken words. Same logic\n    as _aligned but for a flat word list. Returns (kept_words, deleted_ranges)\n    with absolute times (base + word time).\"\"\"\n    o_norm = [_norm(w[\"word\"]) for w in orig]\n    e_norm = [_norm(t) for t in edited]\n    sm = difflib.SequenceMatcher(a=o_norm, b=e_norm, autojunk=False)\n    kept, deleted = [], []\n    for tag, i1, i2, j1, j2 in sm.get_opcodes():\n        if tag == \"equal\":\n            for k in range(i2 - i1):\n                ow = orig[i1 + k]\n                kept.append({\"text\": edited[j1 + k], \"start\": base + ow[\"start\"], \"end\": base + ow[\"end\"]})\n        elif tag == \"replace\":\n            ostart = base + orig[i1][\"start\"]\n            oend = base + orig[i2 - 1][\"end\"]\n            n = j2 - j1\n            if n == 0:\n                deleted.append((ostart, oend))\n            else:\n                span = (oend - ostart) / n\n                for k in range(n):\n                    kept.append({\"text\": edited[j1 + k], \"start\": ostart + k * span, \"end\": ostart + (k + 1) * span})\n        elif tag == \"delete\":\n            deleted.append((base + orig[i1][\"start\"], base + orig[i2 - 1][\"end\"]))\n        elif tag == \"insert\":\n            anchor = base + (orig[i1][\"start\"] if i1 < len(orig) else orig[-1][\"end\"])\n            for k in range(j2 - j1):\n                kept.append({\"text\": edited[j1 + k], \"start\": anchor, \"end\": anchor + 0.3})\n    return kept, deleted\n\n\ndef compute(transcript: dict, clip: dict, tighten: bool = False, source: str | None = None, strength: str = \"medium\") -> dict:\n    \"\"\"Return keep-ranges, re-timed caption words, and a summary for a clip.\n\n    Deleted words are always cut. When `tighten` is on (and `source` given), real\n    silent pauses are detected from the audio and cut too.\n    \"\"\"\n    segs = transcript[\"segments\"]\n    a = clip[\"sent_start\"]\n    b = clip[\"sent_end\"]\n    base = float(clip[\"start\"])\n    clip_end = float(clip[\"end\"])\n    clip_dur = clip_end - base\n\n    all_kept = []\n    del_ranges = []  # word-deletions (+ fillers), clip-relative — get gap-expanded\n    rel_rem = []     # final removals, clip-relative (0 = clip start)\n    deleted_count = 0\n\n    # Prefer fresh, drift-free word timings from the clip's own audio. Align the\n    # user's edited text to what was actually said; words that were said but are\n    # no longer in the text were deleted -> cut them (at accurate times).\n    fresh = _clip_words_from_audio(source, base, clip_dur + 0.5) if source else None\n    if fresh:\n        edited = \" \".join(segs[si][\"text\"] for si in range(a, b + 1)).split()\n        kept, deleted = _align_words(fresh, edited, base)\n        all_kept.extend(kept)\n        for s, e in deleted:\n            if e > s:\n                del_ranges.append((s - base, e - base))\n                deleted_count += 1\n    else:\n        for si in range(a, b + 1):\n            kept, deleted = _aligned(segs[si])\n            all_kept.extend(kept)\n            for s, e in deleted:\n                if e > s:\n                    del_ranges.append((s - base, e - base))\n                    deleted_count += 1\n\n    filler_count = 0\n    pause_count = 0\n    if tighten:\n        for w in all_kept:\n            if _is_filler(w[\"text\"]):\n                del_ranges.append((w[\"start\"] - base, w[\"end\"] - base))\n                filler_count += 1\n\n    # Expand each deleted word to the neighbouring KEPT-word edges, so the whole\n    # spoken word (its onset, tail, and the micro-gaps around it) is removed and\n    # the audio joins cleanly. Whisper's word bounds are often far too tight\n    # (e.g. \"Jen?\" = 0.08s), which would leave a fragment of the word audible.\n    kept_edges = sorted((w[\"start\"] - base, w[\"end\"] - base) for w in all_kept)\n    for ds, de in del_ranges:\n        prev_end = 0.0\n        next_start = clip_dur\n        for ks, ke in kept_edges:\n            if ke <= ds + 0.02:\n                if ke > prev_end:\n                    prev_end = ke\n            if ks >= de - 0.02:\n                next_start = ks\n                break\n        rel_rem.append((prev_end, next_start))\n\n    # Real pauses, detected from the audio (the transcript misses most).\n    if tighten and source:\n        for s, e in audio_silences(source, base, clip_dur, strength):\n            rs = s + 0.10\n            re_ = e - 0.10  # leave a small natural beat\n            if re_ - rs > 0.05:\n                rel_rem.append((rs, re_))\n                pause_count += 1\n\n    rel_rem = [(max(0.0, s), min(clip_dur, e)) for s, e in rel_rem if e > s]\n    rel_rem = _merge(rel_rem)\n    total_removed = sum(e - s for s, e in rel_rem)\n\n    keep = []\n    cur = 0.0\n    for s, e in rel_rem:\n        if s > cur:\n            keep.append((cur, s))\n        cur = max(cur, e)\n    if cur < clip_dur:\n        keep.append((cur, clip_dur))\n\n    def removed_before(x: float) -> float:\n        tot = 0.0\n        for s, e in rel_rem:\n            if e <= x:\n                tot += e - s\n            elif s < x:\n                tot += x - s\n        return tot\n\n    def overlap(rs: float, re_: float) -> float:\n        tot = 0.0\n        for s, e in rel_rem:\n            lo = max(rs, s)\n            hi = min(re_, e)\n            if hi > lo:\n                tot += hi - lo\n        return tot\n\n    cap = []\n    for w in sorted(all_kept, key=lambda w: w[\"start\"]):\n        rs = max(0.0, w[\"start\"] - base)\n        re_ = min(clip_dur, w[\"end\"] - base)\n        if re_ - rs <= 0:\n            continue\n        if overlap(rs, re_) > 0.5 * (re_ - rs):\n            continue  # this word's audio was cut → drop its caption too\n        ns = rs - removed_before(rs)\n        ne = re_ - removed_before(re_)\n        if ne > ns:\n            cap.append({\"text\": w[\"text\"], \"start\": ns, \"end\": ne})\n\n    return {\n        \"keep_ranges\": keep,\n        \"caption_words\": cap,\n        \"removed_seconds\": round(total_removed, 1),\n        \"filler_count\": filler_count,\n        \"pause_count\": pause_count,\n        \"deleted_count\": deleted_count,\n        \"new_duration\": round(clip_dur - total_removed, 1),\n        \"original_duration\": round(clip_dur, 1),\n    }\n","tracker.py":"\"\"\"Find where the speaker's face is across a clip, so the vertical crop can\nfollow them instead of a fixed centre-crop. Uses OpenCV's built-in face\ndetector (no extra downloads, no torch).\n\"\"\"\n\nimport cv2\n\n_FACE = cv2.CascadeClassifier(\n    cv2.data.haarcascades + \"haarcascade_frontalface_default.xml\"\n)\n\n\ndef face_centers(source: str, start: float, duration: float, samples_per_sec: float = 4.0):\n    \"\"\"Return (src_w, src_h, points) where points = [(t_rel, face_center_x), ...].\n\n    Samples the clip a few times a second and records the horizontal centre of\n    the largest face. Falls back to the last known centre when no face is seen.\n    Returns (None, None, None) if the video can't be read.\n    \"\"\"\n    cap = cv2.VideoCapture(source)\n    if not cap.isOpened():\n        return None, None, None\n\n    src_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or 0\n    src_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or 0\n    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0\n    if src_w == 0 or src_h == 0:\n        cap.release()\n        return None, None, None\n\n    cap.set(cv2.CAP_PROP_POS_MSEC, start * 1000.0)\n    frame_step = max(1, int(round(fps / samples_per_sec)))\n\n    points = []\n    last_cx = src_w / 2.0\n    n = 0\n    while True:\n        grabbed = cap.grab()  # fast: advance without decoding\n        if not grabbed:\n            break\n        t_rel = n / fps\n        if t_rel > duration:\n            break\n        if n % frame_step == 0:\n            ok, frame = cap.retrieve()\n            if ok and frame is not None:\n                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)\n                faces = _FACE.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(80, 80))\n                if len(faces):\n                    x, _y, w, _h = max(faces, key=lambda f: f[2] * f[3])\n                    last_cx = x + w / 2.0\n                points.append((t_rel, last_cx))\n        n += 1\n\n    cap.release()\n    return src_w, src_h, points\n\n\ndef smooth(values: list, window: int = 5) -> list:\n    \"\"\"Simple moving-average smoothing to avoid jittery crops.\"\"\"\n    if not values:\n        return values\n    half = window // 2\n    out = []\n    for i in range(len(values)):\n        lo = max(0, i - half)\n        hi = min(len(values), i + half + 1)\n        out.append(sum(values[lo:hi]) / (hi - lo))\n    return out\n\n\ndef ema(values: list, alpha: float = 0.15) -> list:\n    \"\"\"Zero-phase low-pass (forward + backward EMA) — strongly smooths the\n    face trajectory so the crop drifts gently instead of jittering.\"\"\"\n    if not values:\n        return values\n    fwd = []\n    s = values[0]\n    for v in values:\n        s = alpha * v + (1 - alpha) * s\n        fwd.append(s)\n    out = [0.0] * len(fwd)\n    s = fwd[-1]\n    for i in range(len(fwd) - 1, -1, -1):\n        s = alpha * fwd[i] + (1 - alpha) * s\n        out[i] = s\n    return out\n","transcriber.py":"\"\"\"Speech-to-text for Clip Wizard, using faster-whisper.\n\nRuns locally. Uses the NVIDIA GPU if available, otherwise the normal CPU.\n\"\"\"\n\nimport os\nimport threading\n\n\ndef _ensure_cuda_dlls():\n    \"\"\"Make GPU work on Windows.\n\n    The CUDA helper files arrive via pip inside site-packages\\\\nvidia\\\\...,\n    but the engine (CTranslate2) only finds them if they sit right next to it.\n    So we copy the needed ones into the ctranslate2 folder once. This is the\n    reliable Windows fix for \"cublas64_12.dll is not found or cannot be loaded\".\n    \"\"\"\n    import importlib.util\n    import shutil\n\n    spec = importlib.util.find_spec(\"ctranslate2\")\n    if not spec or not spec.submodule_search_locations:\n        return\n    ct_dir = spec.submodule_search_locations[0]\n\n    try:\n        import nvidia  # type: ignore\n        nvidia_roots = list(getattr(nvidia, \"__path__\", []))\n    except Exception:\n        return  # No GPU libs installed — we'll fall back to CPU later.\n    if not nvidia_roots:\n        return\n\n    needed = (\"cudart64_12.dll\", \"cublas64_12.dll\", \"cublasLt64_12.dll\")\n    for name in needed:\n        dst = os.path.join(ct_dir, name)\n        if os.path.exists(dst):\n            continue\n        for nvidia_root in nvidia_roots:\n            found = False\n            for root, _dirs, files in os.walk(nvidia_root):\n                if name in files:\n                    try:\n                        shutil.copy2(os.path.join(root, name), dst)\n                    except Exception:\n                        pass\n                    found = True\n                    break\n            if found:\n                break\n\n\n# Which Whisper model to use. \"medium\" is far more accurate than \"small\"\n# (small was dropping words) and still fits the 4 GB GPU comfortably.\nMODEL_SIZE = os.environ.get(\"CLIPWIZARD_MODEL\", \"medium\")\n\n# Shared progress that the API reads while a transcription runs.\njob: dict = {\n    \"state\": \"idle\",       # idle | loading | transcribing | done | error\n    \"progress\": 0,         # 0-100\n    \"message\": \"\",\n    \"device\": None,        # \"cuda\" or \"cpu\"\n    \"error\": None,\n}\n\n# The finished transcript (or None until done).\ntranscript: dict | None = None\n\n# Optional callback the app sets, called when a transcription finishes (to save).\non_complete = None\n\n_model = None\n_model_device = None\n_lock = threading.Lock()\n\n\ndef _load_model():\n    \"\"\"Load the model once. Try the GPU first, fall back to CPU.\n\n    The heavy faster-whisper / CUDA import is deferred to here (first transcribe)\n    so the app window opens fast instead of waiting ~9s for it at startup.\n    \"\"\"\n    global _model, _model_device\n    if _model is not None:\n        return _model, _model_device\n    if os.name == \"nt\":\n        _ensure_cuda_dlls()\n    from faster_whisper import WhisperModel  # heavy import — on first use only\n    try:\n        _model = WhisperModel(MODEL_SIZE, device=\"cuda\", compute_type=\"int8_float16\")\n        _model_device = \"cuda\"\n    except Exception:\n        _model = WhisperModel(MODEL_SIZE, device=\"cpu\", compute_type=\"int8\")\n        _model_device = \"cpu\"\n    return _model, _model_device\n\n\ndef _sentence_from_words(ws: list) -> dict:\n    return {\n        \"start\": ws[0][\"start\"],\n        \"end\": ws[-1][\"end\"],\n        \"text\": \"\".join(w[\"word\"] for w in ws).strip(),\n        \"words\": ws,\n    }\n\n\ndef _group_into_sentences(words: list) -> list:\n    \"\"\"Group word-level output into whole sentences (ending in . ! or ?).\n\n    Each sentence becomes one transcript line, with clean start/end times. This\n    is the single source of truth: the editor edits these lines, and the clip\n    finder uses them too — so corrections flow everywhere.\n    \"\"\"\n    if not words:\n        return []\n    sentences = []\n    cur: list = []\n    for w in words:\n        cur.append(w)\n        token = (w.get(\"word\") or \"\").strip()\n        if token.endswith((\".\", \"!\", \"?\")):\n            sentences.append(_sentence_from_words(cur))\n            cur = []\n    if cur:\n        sentences.append(_sentence_from_words(cur))\n    return sentences\n\n\ndef transcribe_file(path: str, on_progress=None):\n    \"\"\"Transcribe a file and return (transcript_dict, device). Reusable — used by\n    the normal flow AND by the tidy step (to re-transcribe the cleaned video so\n    captions match it exactly).\"\"\"\n    model, device = _load_model()\n    segments, info = model.transcribe(path, word_timestamps=True)\n\n    total = info.duration or 0\n    raw_segments = []\n    all_words = []\n    for seg in segments:\n        seg_words = [\n            {\"start\": w.start, \"end\": w.end, \"word\": w.word}\n            for w in (seg.words or [])\n        ]\n        raw_segments.append({\n            \"start\": seg.start,\n            \"end\": seg.end,\n            \"text\": seg.text.strip(),\n            \"words\": seg_words,\n        })\n        all_words.extend(seg_words)\n        if total and seg.end and on_progress:\n            on_progress(min(99, int(seg.end / total * 100)))\n\n    sentence_segments = _group_into_sentences(all_words) or raw_segments\n    result = {\n        \"language\": info.language,\n        \"duration\": total,\n        \"text\": \" \".join(s[\"text\"] for s in sentence_segments).strip(),\n        \"segments\": sentence_segments,\n    }\n    return result, device\n\n\ndef _run(path: str):\n    \"\"\"Do the actual transcription (runs on a background thread).\"\"\"\n    global transcript\n    try:\n        job.update(state=\"loading\", progress=0, message=\"Loading the speech model...\", error=None)\n        job.update(state=\"transcribing\", message=\"Listening to your video...\")\n        transcript, device = transcribe_file(path, on_progress=lambda p: job.update(progress=p))\n        job[\"device\"] = device\n        job.update(state=\"done\", progress=100, message=\"Done!\")\n        if on_complete:\n            on_complete()\n    except Exception as e:  # noqa: BLE001\n        job.update(state=\"error\", error=str(e), message=\"Something went wrong.\")\n\n\ndef start(path: str):\n    \"\"\"Kick off transcription in the background. Returns immediately.\"\"\"\n    global transcript\n    with _lock:\n        if job[\"state\"] in (\"loading\", \"transcribing\"):\n            return False  # already running\n        transcript = None\n        job.update(state=\"loading\", progress=0, message=\"Starting...\", error=None, device=None)\n    threading.Thread(target=_run, args=(path,), daemon=True).start()\n    return True\n","youtube.py":"\"\"\"Download a video from a YouTube (or other) URL using yt-dlp.\n\nRuns in the background with progress, like the transcribe/render steps. Saves to\nthe project's downloads/ folder (on D:), then hands the file to the normal flow.\n\"\"\"\n\nimport glob\nimport os\nimport threading\n\n_BACKEND_DIR = os.path.dirname(__file__)\n_PROJECT_DIR = os.path.dirname(_BACKEND_DIR)\nDOWNLOADS_DIR = os.path.join(_PROJECT_DIR, \"downloads\")\nTOOLS_DIR = os.path.join(_PROJECT_DIR, \"tools\")\nos.makedirs(DOWNLOADS_DIR, exist_ok=True)\n\n# Progress the API reads while downloading.\njob: dict = {\"state\": \"idle\", \"progress\": 0, \"message\": \"\", \"error\": None}\nresult: dict | None = None\non_complete = None  # main.py sets this to switch the app to the downloaded video\n\n_lock = threading.Lock()\n\n\ndef _ffmpeg_dir() -> str | None:\n    \"\"\"Folder of the bundled ffmpeg (yt-dlp needs it to merge video + audio).\n    Matches ffmpeg.exe (Windows) or ffmpeg (Mac/Linux).\"\"\"\n    matches = (glob.glob(os.path.join(TOOLS_DIR, \"**\", \"ffmpeg.exe\"), recursive=True)\n               or glob.glob(os.path.join(TOOLS_DIR, \"**\", \"ffmpeg\"), recursive=True))\n    return os.path.dirname(matches[0]) if matches else None\n\n\ndef _hook(d: dict):\n    status = d.get(\"status\")\n    if status == \"downloading\":\n        total = d.get(\"total_bytes\") or d.get(\"total_bytes_estimate\") or 0\n        done = d.get(\"downloaded_bytes\") or 0\n        pct = int(done / total * 100) if total else 0\n        job.update(state=\"downloading\", progress=min(99, pct), message=\"Downloading from YouTube...\")\n    elif status == \"finished\":\n        job.update(progress=99, message=\"Tidying up the file...\")\n\n\ndef _run(url: str):\n    global result\n    import yt_dlp  # local import keeps startup fast\n    try:\n        job.update(state=\"downloading\", progress=0, message=\"Looking up the video...\", error=None)\n        opts = {\n            \"format\": \"bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/b\",\n            \"merge_output_format\": \"mp4\",\n            \"outtmpl\": os.path.join(DOWNLOADS_DIR, \"%(title).80s.%(ext)s\"),\n            \"progress_hooks\": [_hook],\n            \"noplaylist\": True,\n            \"quiet\": True,\n            \"no_warnings\": True,\n            \"restrictfilenames\": True,\n        }\n        ffdir = _ffmpeg_dir()\n        if ffdir:\n            opts[\"ffmpeg_location\"] = ffdir\n\n        with yt_dlp.YoutubeDL(opts) as ydl:\n            info = ydl.extract_info(url, download=True)\n            path = ydl.prepare_filename(info)\n\n        # After a merge the extension becomes .mp4 — find the real file if needed.\n        if not os.path.isfile(path):\n            base = os.path.splitext(path)[0]\n            cands = glob.glob(base + \".*\")\n            path = cands[0] if cands else path\n        if not os.path.isfile(path):\n            raise RuntimeError(\"Download finished but the file could not be found.\")\n\n        result = {\"path\": path, \"name\": os.path.basename(path), \"size\": os.path.getsize(path)}\n        job.update(state=\"done\", progress=100, message=\"Done!\")\n        if on_complete:\n            on_complete(result)\n    except Exception as e:  # noqa: BLE001\n        msg = str(e)\n        job.update(state=\"error\", error=msg, message=\"Download failed.\")\n\n\ndef start(url: str) -> bool:\n    \"\"\"Begin downloading in the background. Returns False if already busy.\"\"\"\n    global result\n    with _lock:\n        if job[\"state\"] == \"downloading\":\n            return False\n        result = None\n        job.update(state=\"downloading\", progress=0, message=\"Starting...\", error=None)\n    threading.Thread(target=_run, args=(url,), daemon=True).start()\n    return True\n"}}