# CBZGenerator/generate_cbz.py

#!/usr/bin/env python3
import argparse
import io
import json
import random
import zipfile
import re
from datetime import datetime
from pathlib import Path

try:
    from PIL import Image, ImageDraw, ImageFont
except ImportError:
    raise SystemExit("Please install Pillow first: pip install pillow")

# English month names in calendar order: MONTHS[i] is month number i + 1.
# Used both to render the month in issue file names and to map a month name
# found in an existing file name back to its number.
MONTHS = [
    "January","February","March","April","May","June",
    "July","August","September","October","November","December"
]

# Latest cover year the generator will emit: start dates are chosen so a run
# fits before it, and any issue dated after it is skipped.
MAX_YEAR = 2025  # <- hard cap

# Format labels a series can be tagged with. One is drawn at random per series
# and used verbatim for the output folder name and the ComicInfo <Format>
# value; normalize_format() is only used to decide how many issues to create.
# NOTE(review): "Balck & White" and "Director'Cut" look like deliberately
# misspelled variants (normalize_format() maps "director'cut" to
# "Director's Cut") - confirm before "fixing" them.
FORMAT_OPTIONS = [
    "Main Series",
    "Limited Series",
    "One-Shot",
    "TPB",
    "Annual",
    "Preview",
    "Balck & White",
    "Black & White",
    "Director'Cut",
    "Director's Cut",
    "Graphic Novel"
]

def normalize_format(fmt: str) -> str:
    """Map a free-form format label onto its canonical spelling.

    Matching ignores case and surrounding whitespace. A label with no known
    alias is returned exactly as it was passed in (it is not stripped).
    """
    canonical_by_alias = {
        "main series": "Main Series",
        "limited series": "Limited Series",
        "limited": "Limited Series",
        "one-shot": "One-Shot",
        "oneshot": "One-Shot",
        "tpb": "TPB",
        "trade": "TPB",
        "trade paperback": "TPB",
        "annual": "Annual",
        "director's cut": "Director's Cut",
        "director'cut": "Director's Cut",
        "directors cut": "Director's Cut",
    }
    return canonical_by_alias.get(fmt.strip().lower(), fmt)

def load_data(json_path: Path):
    """Load publishers, writers, characters and title words from a JSON file.

    Expected layout::

        {"publishers": [{"name": "...", "writers": [...], "characters": [...]}],
         "works": [...]}

    Args:
        json_path: Path of the UTF-8 encoded JSON data file.

    Returns:
        A tuple ``(publishers, writers_by_pub, chars_by_pub, works)`` where
        ``publishers`` is a list of publisher names, the two dicts map each
        publisher name to its list of writer / character names, and ``works``
        is the list of title words (a built-in list when the file has none).

    Raises:
        ValueError: If the top-level value is not an object, ``publishers``
            is missing / empty / not a list, or no entry has a usable name.
        OSError, json.JSONDecodeError: Propagated from reading and parsing.
    """
    def clean_names(value):
        # A bare string is a single name; list("Jo Doe") would otherwise
        # explode it into one-character "names".
        if isinstance(value, str):
            value = [value]
        if not isinstance(value, (list, tuple)):
            return []
        # Names are later sliced, escaped and used in paths, so only
        # non-blank strings are usable.
        return [item for item in value if isinstance(item, str) and item.strip()]

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, dict):
        raise ValueError("Top-level JSON value must be an object.")

    pubs_raw = data.get("publishers") or []
    if not pubs_raw or not isinstance(pubs_raw, list):
        raise ValueError("`publishers` must be a list of objects.")

    publishers, writers_by_pub, chars_by_pub = [], {}, {}
    for p in pubs_raw:
        if not isinstance(p, dict):
            continue
        name = p.get("name")
        if not isinstance(name, str) or not name.strip():
            continue
        publishers.append(name)
        writers_by_pub[name] = clean_names(p.get("writers"))
        chars_by_pub[name] = clean_names(p.get("characters"))

    if not publishers:
        raise ValueError("No valid publishers with names found.")

    works = clean_names(data.get("works"))
    if not works:
        works = [
            "Odyssey","Legacy","Eclipse","Frontier","Spectrum","Monolith","Harbinger",
            "Chronicle","Vanguard","Paradox","Catalyst","Requiem","Arcadia","Equinox",
            "Ironclad","Apex","Arc","Vector","Nimbus","Cinder"
        ]
    return publishers, writers_by_pub, chars_by_pub, works

def slugify(text: str):
    """Reduce *text* to characters that are safe in a file or folder name.

    Keeps alphanumerics plus ``-_.()[]#,`` and spaces, drops everything else
    and trims surrounding whitespace.

    Returns:
        The cleaned name, or ``"_"`` when nothing usable is left. An empty
        string, ``"."`` or ``".."`` must never be returned because, used as a
        path component, they would collapse a directory level or point
        outside the intended folder.
    """
    keep = "-_.()[]#, "
    cleaned = "".join(c for c in text if c.isalnum() or c in keep).strip()
    # Only dots and spaces left (or nothing at all): not a usable name.
    if not cleaned.strip(". "):
        return "_"
    return cleaned

def rand_series_title(works):
    """Build a random series title from one or two entries of *works*.

    A title template is drawn at random; templates with a second slot get a
    second word that differs from the first whenever *works* offers one.
    """
    templates = [
        "{w1}","The {w1}","{w1} Chronicle","{w1}: Genesis","{w1} Rising","{w1} Reborn",
        "{w1} & {w2}","{w1} of {w2}","{w1}: {w2}",
    ]
    primary = random.choice(works)
    template = random.choice(templates)
    if "{w2}" not in template:
        return template.format(w1=primary)

    # Prefer a different second word; fall back to the full list when the
    # first word is the only one available.
    others = [word for word in works if word != primary]
    secondary = random.choice(others if others else works)
    return template.format(w1=primary, w2=secondary)

def choose_writer(publisher, writers_by_pub):
    """Pick a random writer for *publisher*.

    Falls back to a built-in list of names when the publisher has no writers
    on record.
    """
    pool = writers_by_pub.get(publisher)
    if not pool:
        pool = [
            "Alex Grant","Taylor Miller","Jordan Bishop","Morgan Reeves","Riley Carter",
            "Sam Hayes","Casey Harper","Jamie Brooks","Avery Collins","Quinn Rowe"
        ]
    return random.choice(pool)

def choose_character(publisher, chars_by_pub):
    """Pick a random character for *publisher*.

    Falls back to a built-in list of names when the publisher has no
    characters on record.
    """
    pool = chars_by_pub.get(publisher)
    if not pool:
        pool = [
            "Sentinel","Nightglass","Starflare","Iron Warden","Moonstrike","Volt Runner","Red Quill"
        ]
    return random.choice(pool)

def add_months(year: int, month: int, delta: int):
    """Return the ``(year, month)`` that lies *delta* months after the given one.

    *month* is 1-based; the returned month is 1-based as well.
    """
    # Work on a zero-based running month count, then split it back up.
    whole_years, month_zero = divmod(year * 12 + (month - 1) + delta, 12)
    return whole_years, month_zero + 1

def sub_months(year: int, month: int, delta: int):
    """Return the ``(year, month)`` that lies *delta* months before the given one.

    *month* is 1-based; the returned month is 1-based as well.
    """
    # Work on a zero-based running month count, then split it back up.
    whole_years, month_zero = divmod(year * 12 + (month - 1) - delta, 12)
    return whole_years, month_zero + 1

def rand_start_date_for_monthly(n_issues: int, year_min: int = 1960):
    """
    Pick a random (year, month) for issue #1 of a monthly run so that the last
    of n_issues monthly issues is dated no later than December of MAX_YEAR.
    When the run cannot fit, January of year_min is returned.
    """
    months_after_first = max(0, n_issues - 1)
    # Both bounds are zero-based running month counts (year * 12 + month - 1).
    earliest = year_min * 12
    latest = (MAX_YEAR * 12 + 11) - months_after_first
    if latest < earliest:
        return year_min, 1
    chosen_year, month_zero = divmod(random.randint(earliest, latest), 12)
    return chosen_year, month_zero + 1

def rand_start_date_for_annuals(n_issues: int, year_min: int = 1960):
    """
    Pick a random (year, month) for the first of n_issues yearly issues so that
    the last one is dated no later than MAX_YEAR; the month is arbitrary and
    shared by every annual. When the run cannot fit, the year is year_min.
    """
    latest_allowed = max(year_min, MAX_YEAR - max(0, n_issues - 1))
    # Draw the year before the month to keep the random sequence stable.
    first_year = random.randint(year_min, latest_allowed)
    shared_month = random.randint(1, 12)
    return first_year, shared_month

def zero_pad_page(n: int) -> str:
    """Return the in-archive image name for page *n*, e.g. ``P00007.jpg``."""
    return "P{:05d}.jpg".format(n)

def make_jpeg_bytes(text: str, width=1200, height=1800):
    """Render *text* centred on a white page and return the page as JPEG bytes.

    Each newline-separated line is centred horizontally; the lines together
    are centred vertically with a fixed gap between them. DejaVu Sans at 64pt
    is used when it can be loaded, otherwise Pillow's default font.
    """
    line_gap = 20

    page = Image.new("RGB", (width, height), color="white")
    canvas = ImageDraw.Draw(page)
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", size=64)
    except Exception:
        # Best effort: any failure to load the font falls back to the default.
        font = ImageFont.load_default()

    # Measure every line once so it can be centred in both directions.
    measured = []
    for row in text.split("\n"):
        left, top, right, bottom = canvas.textbbox((0,0), row, font=font)
        measured.append((row, right - left, bottom - top))

    block_height = sum(row_h for _, _, row_h in measured) + (len(measured) - 1) * line_gap
    cursor_y = (height - block_height) // 2
    for row, row_w, row_h in measured:
        canvas.text(((width - row_w) // 2, cursor_y), row, fill="black", font=font)
        cursor_y += row_h + line_gap

    encoded = io.BytesIO()
    page.save(encoded, format="JPEG", quality=90)
    return encoded.getvalue()

def escape_xml(s: str) -> str:
    """Escape the five XML special characters (``& < > " '``) in *s*."""
    # One pass over the text, so an ampersand produced by an escape is never
    # escaped a second time.
    entity_for = str.maketrans({
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&apos;",
    })
    return s.translate(entity_for)

def build_comicinfo_xml(series, number, title, volume_year, year, month,
                        publisher, writer, characters, fmt, page_count):
    """Return the ComicInfo.xml document for one issue as a string.

    Text fields are XML-escaped; *characters* may be a list/tuple of names or
    a single name and is written as one comma-separated value. The numeric
    fields are inserted as given.
    """
    if not isinstance(characters, (list, tuple)):
        characters = [characters]
    chars_joined = ", ".join(characters)

    # (tag, already-safe value) pairs in the order they appear in the file.
    fields = [
        ("Title", escape_xml(title)),
        ("Series", escape_xml(series)),
        ("Number", number),
        ("Volume", volume_year),
        ("Year", year),
        ("Month", month),
        ("Publisher", escape_xml(publisher)),
        ("Writer", escape_xml(writer)),
        ("Characters", escape_xml(chars_joined)),
        ("Format", escape_xml(fmt)),
        ("LanguageISO", "en"),
        ("PageCount", page_count),
        ("Summary", "Generated for application stress testing."),
    ]
    lines = ['<?xml version="1.0" encoding="utf-8"?>', "<ComicInfo>"]
    lines.extend(f"  <{tag}>{value}</{tag}>" for tag, value in fields)
    lines.append("</ComicInfo>")
    return "\n".join(lines) + "\n"

def make_filename(series, issue_no, month_name, year):
    """Return the archive name ``'<series> #NNN [<Month>, <year>].cbz'``.

    The issue number is zero-padded to at least three digits.
    """
    return "{} #{:03d} [{}, {}].cbz".format(series, issue_no, month_name, year)

def issues_for_format(fmt_norm: str) -> int:
    """Return a random issue count suited to the normalized format label.

    Formats with a known range get a count between 1 and that range's upper
    bound; every other format (including "One-Shot") gets exactly one issue
    and draws no random number.
    """
    largest_run = {
        "Main Series": 500,
        "Limited Series": 15,
        "TPB": 10,
        "Director's Cut": 5,
        "Annual": 5,
    }
    upper = largest_run.get(fmt_norm)
    if upper is None:
        return 1
    return random.randint(1, upper)

# ---------- continue existing volumes ----------
def series_target_dir(base_out: Path, publisher: str, character: str, fmt_display: str,
                      volume_year: int, series: str) -> Path:
    """Return the folder that holds one volume of a series.

    Layout: ``<base_out>/<publisher>/<character>/<format>/(<volume_year>) <series>``,
    with every component passed through ``slugify``.
    """
    components = (publisher, character, fmt_display, f"({volume_year}) {series}")
    return base_out.joinpath(*(slugify(part) for part in components))

def scan_existing_issue_info(target_dir: Path, series: str):
    """
    Inspect target_dir for archives named 'Series Name #NNN [Month, Year].cbz'.

    Returns (highest_issue_no, first_issue_year, first_issue_month). The
    highest number is 0 when the folder is missing or holds no matching file;
    year and month are None unless an issue #1 with a known month name exists.
    """
    highest = 0
    origin_year = origin_month = None
    if not target_dir.exists():
        return highest, origin_year, origin_month

    stem_re = re.compile(rf"^{re.escape(series)} #(\d{{3}}) \[([A-Za-z]+), (\d{{4}})\]$")
    for archive in target_dir.glob("*.cbz"):
        hit = stem_re.match(archive.stem)
        if hit is None:
            continue
        number_text, month_text, year_text = hit.groups()
        number = int(number_text)
        highest = max(highest, number)
        # Only issue #1 defines the date the rest of the run is counted from.
        if number == 1 and month_text in MONTHS:
            origin_month = MONTHS.index(month_text) + 1
            origin_year = int(year_text)
    return highest, origin_year, origin_month

# ---------- core generation ----------
def generate_issue_cbz(base_out: Path, publisher: str, character: str, fmt_display: str,
                       series: str, issue_no: int, writer: str,
                       volume_year: int, issue_year: int, issue_month: int, page_count: int):
    """Write one issue as a ``.cbz`` archive and return its path.

    The archive holds ``page_count`` generated JPEG pages followed by a
    ``ComicInfo.xml`` metadata file, and is placed in the series folder given
    by ``series_target_dir`` (created on demand).

    Args:
        base_out: Root output directory.
        publisher, character, fmt_display, series, writer: Metadata strings;
            the first four also determine the target folder.
        issue_no: Issue number within the volume.
        volume_year: Year of issue #1, used for the folder and ``<Volume>``.
        issue_year, issue_month: Cover date of this issue (month is 1-based).
        page_count: Number of pages to generate.

    Returns:
        The path of the written archive, or ``None`` when ``issue_year`` is
        after ``MAX_YEAR`` (nothing is written in that case).
    """
    # Enforce cap at the final gate too (paranoia)
    if issue_year > MAX_YEAR:
        return None

    month_name = MONTHS[issue_month - 1]
    title = f"{series} #{issue_no}"
    target_dir = series_target_dir(base_out, publisher, character, fmt_display, volume_year, series)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Folder names are slugified, but the file name embeds the raw series
    # title. Drop path separators so a title such as "A/B" cannot turn the
    # file name into a nested path whose parent folder does not exist.
    cbz_name = make_filename(series, issue_no, month_name, issue_year)
    cbz_name = cbz_name.replace("/", "").replace("\\", "")
    cbz_path = target_dir / cbz_name

    comicinfo_xml = build_comicinfo_xml(
        series=series, number=issue_no, title=title,
        volume_year=volume_year, year=issue_year, month=issue_month,
        publisher=publisher, writer=writer, characters=[character],
        fmt=fmt_display, page_count=page_count
    )

    with zipfile.ZipFile(cbz_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for i in range(1, page_count + 1):
            filename = zero_pad_page(i)
            # The first page shows the archive name; the others their number.
            img_bytes = make_jpeg_bytes(cbz_name if i == 1 else f"Page {i}")
            zf.writestr(filename, img_bytes)
        zf.writestr("ComicInfo.xml", comicinfo_xml)
    return cbz_path

def estimate_total_issues(series_count: int):
    """Return a rough estimate of how many issues *series_count* series yield.

    For each series a random format is drawn and its random issue count is
    added up. The real total can be lower: the year cap and already existing
    volumes may reduce what actually gets generated.
    """
    return sum(
        issues_for_format(normalize_format(random.choice(FORMAT_OPTIONS)))
        for _ in range(series_count)
    )

def generate_one_series(base_out: Path, publishers, writers_by_pub, chars_by_pub, works,
                        counter, total_issues):
    """Generate every missing issue of one randomly composed series.

    Publisher, character, format, writer and title are drawn at random. When
    the series folder already exists, numbering continues after the highest
    issue found and the date of an existing issue #1 is reused. Annuals
    advance one year per issue with a fixed month; all other formats advance
    one month per issue. Generation stops at the first issue dated after
    MAX_YEAR.

    counter is a one-element list holding the running total of issues written;
    it is updated in place and a progress line is printed every 100 issues,
    quoting total_issues as the expected total.
    """
    publisher = random.choice(publishers)
    character = choose_character(publisher, chars_by_pub)
    fmt_display = random.choice(FORMAT_OPTIONS)
    fmt_norm = normalize_format(fmt_display)
    is_annual = fmt_norm == "Annual"

    writer = choose_writer(publisher, writers_by_pub)
    series = rand_series_title(works)
    if random.random() < 0.35:
        series = f"{character}: {series}"

    n_issues = issues_for_format(fmt_norm)

    # Choose start date with MAX_YEAR cap in mind (unless continuing an existing volume)
    pick_start = rand_start_date_for_annuals if is_annual else rand_start_date_for_monthly
    start_year, start_month = pick_start(n_issues)
    volume_year = start_year  # volume = year of #1

    # If folder exists, continue numbering and keep original #1 date if found
    target_dir = series_target_dir(base_out, publisher, character, fmt_display, volume_year, series)
    existing_max, first_y, first_m = scan_existing_issue_info(target_dir, series)
    if first_y and first_m:
        start_year = volume_year = first_y
        start_month = first_m

    # An empty range means every planned issue already exists.
    for issue_no in range(max(1, existing_max + 1), n_issues + 1):
        offset = issue_no - 1
        if is_annual:
            issue_year, issue_month = start_year + offset, start_month
        else:
            issue_year, issue_month = add_months(start_year, start_month, offset)
        if issue_year > MAX_YEAR:
            break

        page_count = random.randint(5, 10)
        written = generate_issue_cbz(base_out, publisher, character, fmt_display, series,
                                     issue_no, writer, volume_year, issue_year, issue_month,
                                     page_count)
        if not written:
            continue
        counter[0] += 1
        if counter[0] % 100 == 0:
            print(f"Generated {counter[0]} issues out of {total_issues}")

def main():
    """Command-line entry point: parse arguments and generate the series.

    Loads the comic data file, creates the output directory, prints a rough
    estimate-based progress report while generating ``count`` series and a
    summary at the end. A data file that cannot be read or is invalid is
    reported as a normal command-line error instead of a traceback.
    """
    parser = argparse.ArgumentParser(description="Generate CBZ files for stress testing.")
    parser.add_argument("count", type=int, help="Number of series to generate")
    parser.add_argument("--out", type=Path, default=Path("output_cbz"), help="Output base directory")
    parser.add_argument("--data", type=Path, default=Path("./comicdata.json"),
                        help="Path to comic data JSON (default: ./comicdata.json)")
    parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducibility")
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)

    try:
        publishers, writers_by_pub, chars_by_pub, works = load_data(args.data)
    except OSError as exc:
        # Missing or unreadable file; parser.error() prints usage and exits.
        parser.error(f"cannot read data file {args.data}: {exc}")
    except ValueError as exc:
        # Covers malformed JSON (json.JSONDecodeError is a ValueError) as well
        # as the schema problems load_data reports itself.
        parser.error(f"invalid data file {args.data}: {exc}")
    args.out.mkdir(parents=True, exist_ok=True)

    total_issues_est = estimate_total_issues(args.count)  # rough
    # Re-seed so the draws made for the estimate do not shift the sequence
    # used for the actual generation.
    if args.seed is not None:
        random.seed(args.seed)

    counter = [0]
    for _ in range(args.count):
        generate_one_series(args.out, publishers, writers_by_pub, chars_by_pub, works,
                            counter, total_issues_est)

    print(f"Done. Generated {counter[0]} issues total (estimated {total_issues_est}).")

# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()