import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys

# Archive root, used to resolve relative thread links into absolute URLs.
BASE_URL = "https://arch.b4k.dev"
# Subject-search endpoint listing /vg/ threads whose subject contains "funkg".
SUBJECT_URL = "https://arch.b4k.dev/vg/search/subject/funkg/"
# Maximum number of search pages to crawl; None means "crawl until no new threads".
PAGE_LIMIT = None
# to set a limit, replace "None" with a number

# One shared session so all requests reuse connections and carry these headers.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9"
})

# Number of concurrent scraper threads used in main().
MAX_WORKERS = 10


def log(msg, level="INFO"):
    """Print a tagged status line, e.g. ``[INFO] message``."""
    print("[%s] %s" % (level, msg))


def progress(msg):
    """Overwrite the current console line with *msg* (carriage-return trick)."""
    stream = sys.stdout
    stream.write("\r" + msg)
    stream.flush()


def get_thread_links():
    """Crawl the subject-search pages and collect unique thread URLs.

    Walks page 1, /page/2/, /page/3/, ... until a page contributes no new
    thread links (the archive repeats results past the last real page) or
    PAGE_LIMIT is reached.

    Returns:
        list[str]: absolute thread URLs, each normalized to end with "/".
    """
    links = set()
    page = 1

    while True:
        # stop if page limit reached
        if PAGE_LIMIT and page > PAGE_LIMIT:
            break

        url = SUBJECT_URL if page == 1 else f"{SUBJECT_URL}page/{page}/"

        try:
            # Timeout keeps a hung connection from stalling the crawl forever;
            # raise_for_status turns HTTP error pages into a logged stop
            # instead of silently parsing an error body.
            response = session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "lxml")
        except Exception as e:
            log(f"Failed to fetch page {page}: {e}", "ERROR")
            break

        before = len(links)

        for a in soup.select('a[href*="/vg/thread/"]'):
            href = a.get("href")
            if not href:
                continue

            # Drop any #fragment and resolve to an absolute URL.
            full_url = urljoin(BASE_URL, href.split("#")[0])

            # Normalize trailing slash so duplicates collapse in the set.
            if not full_url.endswith("/"):
                full_url += "/"

            links.add(full_url)

        new_links = len(links) - before

        progress(f"Scanning pages... Page {page} | Threads: {len(links)}")

        # Stop if no more new threads
        if new_links == 0:
            break

        page += 1

    print()
    log(f"Finished collecting threads: {len(links)}")
    return list(links)


def extract_thread_text(thread_url):
    """Fetch one archived thread and return the text of its non-OP posts.

    Threads with fewer than 100 posts are skipped (returns []) to avoid
    noise from short-lived threads. Any error is logged and yields [].

    Args:
        thread_url: absolute URL of the archived thread page.

    Returns:
        list[str]: one whitespace-normalized string per post body.
    """
    try:
        # Timeout prevents one dead URL from hanging a worker thread forever;
        # raise_for_status rejects HTTP error pages instead of parsing them.
        response = session.get(thread_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        posts = soup.select("article.post")

        # skip scraping threads with too few posts
        if len(posts) < 100:
            return []

        collected_text = []
        # ignore OP post text
        for post in posts:
            if "post_is_op" in post.get("class", []):
                continue

            text_div = post.select_one("div.text")
            if not text_div:
                continue

            text = text_div.get_text(" ", strip=True)
            if text:
                collected_text.append(text)

        return collected_text

    except Exception as e:
        log(f"Thread error: {thread_url} | {e}", "ERROR")
        return []


def clean_text(text_list):
    """Reduce a list of post texts to a single lowercase, filtered word stream.

    Strips URLs, HTML-entity residue (br/amp/gt), and all non-letter
    characters, then drops single-letter words and common filler terms.

    Args:
        text_list: list of raw post-body strings.

    Returns:
        str: space-joined cleaned words, ready for the word cloud.
    """
    blob = " ".join(text_list)

    blob = re.sub(r"http\S+", "", blob)           # URLs
    blob = re.sub(r"\b(br|amp|gt)\b", "", blob)   # HTML-entity leftovers
    blob = re.sub(r"[^a-zA-Z\s]", "", blob).lower()

    filterwords = {
        "use","way","dont","know","put","theyre","youre","because","this","that","about",
        "funkg","thread","post","reply","general","chan",
        "one","two","three","four","five","six","seven","eight","nine","ten",
    }

    kept = [word for word in blob.split()
            if len(word) > 1 and word not in filterwords]

    return " ".join(kept)


def generate_wordcloud(text):
    """Build a word cloud from *text* and display it in a matplotlib window."""
    cloud = WordCloud(
        width=1600,
        height=1000,
        background_color="black",
        stopwords=STOPWORDS,
        colormap="viridis",
        max_words=300,
    )
    cloud.generate(text)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


def main():
    """Collect thread links, scrape their posts concurrently, and render a word cloud."""
    log("Getting thread links...")
    links = get_thread_links()

    if not links:
        log("No threads found.", "ERROR")
        return

    all_text = []
    total = len(links)
    completed = 0

    log(f"Scraping {total} threads...")

    # Fan out one task per thread URL; progress is printed every other
    # completion (and on the final one).
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = {
            pool.submit(extract_thread_text, url): url
            for url in links
        }

        for done in as_completed(pending):
            url = pending[done]
            completed += 1

            try:
                all_text.extend(done.result())
            except Exception as exc:
                log(f"{url} failed: {exc}", "ERROR")
            if completed % 2 == 0 or completed == total:
                progress(
                    f"Processed {completed}/{total} threads | Posts: {len(all_text)}"
                )

    print()

    # Bail out if every thread came back empty.
    if not all_text:
        log("No text collected!", "ERROR")
        return

    log(f"Total posts collected: {len(all_text)}")

    cleaned = clean_text(all_text)
    log("Generating wordcloud...")

    generate_wordcloud(cleaned)
    log("Done!")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# --- pastebin footer accidentally pasted into the source (not Python code) ---
# Edit
# Pub: 11 Apr 2026 16:56 UTC
# Views: 46