Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
6663379
set useragent to specified value
josephine-33 Feb 2, 2026
455ad8f
oops! fixed useragent value and added student id
josephine-33 Feb 2, 2026
0d2d08d
charity id
charity-fan Feb 2, 2026
a22494d
added uci id
vidhyap22 Feb 2, 2026
39e87a4
add tokenizer file
vidhyap22 Feb 2, 2026
a218eea
added my uci id
katelykvuci Feb 2, 2026
5e421ea
updates to is_valid and extract_next_linke
josephine-33 Feb 4, 2026
48ac738
debugged within_domains
josephine-33 Feb 4, 2026
f3faedf
corrected import statements and requirements doc
josephine-33 Feb 4, 2026
aad2da0
changed user agent
katelykvuci Feb 4, 2026
8ef4997
fixed syntax errors in extract_next_links and added a few debug state…
katelykvuci Feb 4, 2026
2055c3a
Merge branch 'josephine'
josephine-33 Feb 6, 2026
7a3af3f
added tokenizer
charity-fan Feb 6, 2026
f70fa09
added helper function has_sufficient_content to check for low-info / …
josephine-33 Feb 6, 2026
4650052
fixed tokenize(), update global stats
charity-fan Feb 6, 2026
9e1b83e
writing stats to file
charity-fan Feb 6, 2026
18deb2a
added check to is_valid to make sure rules in robots.txt are followed
josephine-33 Feb 6, 2026
7a8e060
filter out stopwords
charity-fan Feb 6, 2026
3d590e2
add hashing
vidhyap22 Feb 6, 2026
67706b2
Merge branch 'master' into add_url_tracking
vidhyap22 Feb 6, 2026
23661be
Merge pull request #1 from josephine-33/add_url_tracking
vidhyap22 Feb 6, 2026
55af5d3
Merge pull request #2 from josephine-33/avoiding-dead-URLs
josephine-33 Feb 6, 2026
602f808
Merge pull request #3 from josephine-33/following-robot-rules
josephine-33 Feb 6, 2026
0ac0049
Merge branch 'master' into collecting-stats
charity-fan Feb 6, 2026
af27bac
Merge pull request #4 from josephine-33/collecting-stats
charity-fan Feb 6, 2026
9c493ea
start pattern detection
vidhyap22 Feb 7, 2026
b0c552f
added check for 607 error code, content too large to scrape
josephine-33 Feb 7, 2026
79f33a7
Merge branch 'master' into avoiding-dead-URLs
josephine-33 Feb 7, 2026
2ce42e0
Merge pull request #5 from josephine-33/avoiding-dead-URLs
josephine-33 Feb 7, 2026
5a62bf4
added url pattern creation functionality
vidhyap22 Feb 7, 2026
947fea3
Merge pull request #6 from josephine-33/add_url_pattern_detection
vidhyap22 Feb 7, 2026
4ec8eb8
add url hashing
vidhyap22 Feb 7, 2026
166f2a5
add max url pattern hits
vidhyap22 Feb 7, 2026
408a1c0
Merge branch 'master' into add_url_pattern_detection
vidhyap22 Feb 7, 2026
6d85a32
Merge pull request #7 from josephine-33/add_url_pattern_detection
vidhyap22 Feb 7, 2026
033266f
fix hashing bugs
vidhyap22 Feb 7, 2026
addfc3a
fix duplicate check
vidhyap22 Feb 7, 2026
ce2b5a1
add letter stopwords
Feb 7, 2026
6c02fe8
add subdomain counts
Feb 7, 2026
8fe82fd
add url/domain stat storage
Feb 7, 2026
11b1442
add error handling
Feb 7, 2026
860c2fa
remove fragment from hashed urls
Feb 7, 2026
84b122a
adding /events back to known traps
josephine-33 Feb 7, 2026
d2f42ab
add trap check
Feb 7, 2026
dbc4900
added mpg to disallowed file extensions
charity-fan Feb 7, 2026
a0a41b8
added |bib|pov|ff|lif to file extensions list
charity-fan Feb 7, 2026
2c995e4
add some invalid extensions
Feb 8, 2026
8764552
add .c extension
Feb 8, 2026
2873e36
changed max url pattern hits threshold to 500
charity-fan Feb 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[IDENTIFICATION]
# Set your user agent string here.
USERAGENT = DEFAULT AGENT
USERAGENT = IR UW26 15082480, 61528703, 24282816, 26671659

[CONNECTION]
HOST = styx.ics.uci.edu
Expand Down
79 changes: 64 additions & 15 deletions crawler/worker.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,83 @@
from threading import Thread

import hashlib
from urllib.parse import urlparse, urlunparse
from inspect import getsource
from utils.download import download
from utils.url_pattern_detection import get_url_pattern_hash
from utils import get_logger
import scraper
import time

from collections import defaultdict
import tldextract
import json

class Worker(Thread):
    """Crawler worker thread.

    Repeatedly pulls a URL from the frontier, downloads it, hands the
    response to scraper.scraper, then filters the discovered links for
    exact duplicates (by URL hash), over-visited URL patterns, over-visited
    subdomains, and excessive path depth before re-queueing them.
    """

    def __init__(self, worker_id, config, frontier):
        self.logger = get_logger(f"Worker-{worker_id}", "Worker")
        self.config = config
        self.frontier = frontier
        # SHA-256 digests of already-queued URLs (scheme and fragment removed).
        self.seen_urls = set()
        # URL-pattern hash -> number of queued URLs matching that pattern.
        self.seen_url_patterns = defaultdict(int)
        # subdomain label -> number of queued URLs under that subdomain.
        self.subdomains_count = defaultdict(int)
        self.counts_stats_file = "count_stats.txt"
        self.MAX_URL_PATTERN_HITS = 500
        self.MAX_SUBDOMAIN_HITS = 10000
        # basic check for requests in scraper
        assert {getsource(scraper).find(req) for req in {"from requests import", "import requests"}} == {-1}, "Do not use requests in scraper.py"
        assert {getsource(scraper).find(req) for req in {"from urllib.request import", "import urllib.request"}} == {-1}, "Do not use urllib.request in scraper.py"
        super().__init__(daemon=True)

    def write_stats(self):
        """Write the running URL/subdomain counters to counts_stats_file as JSON."""
        data = {
            "num_urls": len(self.seen_urls),
            "subdomain_count": dict(self.subdomains_count)
        }
        with open(self.counts_stats_file, "w") as f:
            json.dump(data, f, indent=4)

    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            try:
                resp = download(tbd_url, self.config, self.logger)
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")
                scraped_urls = scraper.scraper(tbd_url, resp)
                for scraped_url in scraped_urls:
                    # Hash a normalized form (no scheme, no fragment) so that
                    # http/https and fragment variants dedupe to one entry.
                    parsed_url = urlparse(scraped_url)._replace(scheme='', fragment="")
                    url_str = urlunparse(parsed_url)
                    hashed_url = hashlib.sha256(url_str.encode('utf-8')).hexdigest()
                    hashed_url_pattern = get_url_pattern_hash(scraped_url)

                    if hashed_url in self.seen_urls:
                        self.logger.debug("Hashed url already seen...skipping")
                        continue

                    if self.seen_url_patterns[hashed_url_pattern] >= self.MAX_URL_PATTERN_HITS:
                        self.logger.debug(f"Hashed url pattern reached its limit: {hashed_url_pattern}")
                        continue

                    # NOTE(review): parsed_url.hostname can be None for malformed
                    # links; assumes scraper only yields absolute http(s) URLs — confirm.
                    curr_subdomain = tldextract.extract(parsed_url.hostname).subdomain
                    if self.subdomains_count[curr_subdomain] >= self.MAX_SUBDOMAIN_HITS:
                        # bug fix: original referenced the undefined name `subdomain` here
                        self.logger.debug(f"Subdomain has reached its limit: {curr_subdomain}")
                        continue

                    self.subdomains_count[curr_subdomain] += 1

                    # Skip very deep paths (6+ non-empty segments), a common trap shape.
                    depth = len([segment for segment in parsed_url.path.split('/') if segment])
                    if depth >= 6:
                        self.logger.debug("URL depth is 6 or more...skipping")
                        continue

                    self.seen_url_patterns[hashed_url_pattern] += 1
                    self.seen_urls.add(hashed_url)
                    # Persist stats every 10th newly seen URL.
                    if len(self.seen_urls) % 10 == 0:
                        self.write_stats()
                    self.frontier.add_url(scraped_url)
                self.frontier.mark_url_complete(tbd_url)
                time.sleep(self.config.time_delay)
            except Exception:
                # Best-effort: log and move on rather than killing the thread.
                # NOTE(review): the failed URL is never marked complete in the
                # frontier — confirm the frontier tolerates that.
                self.logger.exception(f"Error while processing {tbd_url}; skipping.")
                continue
53 changes: 53 additions & 0 deletions crawler_statistics.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
Longest page URL: https://www.stat.uci.edu/research (651)

50 most common words:
statistics: 56
uci: 39
professor: 39
edu: 30
department: 24
research: 24
data: 24
dbh: 24
ph: 22
graduate: 20
faculty: 19
sciences: 17
statistical: 15
news: 14
university: 14
interests: 14
directory: 13
opportunities: 12
science: 12
information: 11
computer: 11
contact: 11
current: 11
analysis: 11
bayesian: 11
chancellor: 11
donald: 10
bren: 10
school: 10
degrees: 10
course: 10
listings: 10
students: 10
chair: 9
seminars: 9
seminar: 9
assistant: 9
internships: 8
employment: 8
job: 8
archive: 8
949: 8
824: 8
student: 7
undergraduate: 7
yu: 7
inference: 7
resources: 6
interdisciplinary: 6
biostatistics: 6
5 changes: 4 additions & 1 deletion packages/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
cbor
requests
requests
beautifulsoup4
lxml
tldextract
202 changes: 197 additions & 5 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import re
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin, urldefrag
from bs4 import BeautifulSoup
from utils.tokenizer import tokenize, compute_word_frequencies
from itertools import islice

longest_page_url = None
longest_page_word_count = 0
most_common_words = {}

def scraper(url, resp):
links = extract_next_links(url, resp)
def extract_next_links(url, resp):
    """Extract valid out-links from a downloaded page.

    url  -- the URL that was requested.
    resp -- the download response; resp.status is the HTTP status code and
            resp.raw_response.content holds the page bytes.
    Returns a list of absolute, fragment-free URLs that pass is_valid.
    Side effects: updates the global crawl statistics and rewrites the
    statistics file for every page that passes the content checks.
    """
    # if we can't scrape, return empty list
    if resp.status != 200 or not is_valid(resp.url):
        return []

    # if page was marked too large by the server (custom 607 error), skip it
    if hasattr(resp, 'error') and resp.error and "607" in str(resp.error):
        return []

    # if page is low-information, skip it (this also guards against
    # resp.raw_response being None, so parsing below is safe)
    if not has_sufficient_content(resp):
        return []

    # Parse the page once and reuse the soup for both statistics and link
    # extraction (the original parsed the same content twice).
    soup = BeautifulSoup(resp.raw_response.content, 'lxml')

    # tokenize visible text and fold it into the global statistics
    tokens = tokenize(soup.get_text(separator=' '))
    update_stats(url, tokens)
    write_to_file()

    next_links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue
        # resolve relative links and strip fragments before validation
        absolute_url, _ = urldefrag(urljoin(resp.url, href))
        if is_valid(absolute_url):
            next_links.append(absolute_url)
    return next_links

def is_valid(url):
# Decide whether to crawl this url or not.
Expand All @@ -25,16 +80,153 @@ def is_valid(url):
parsed = urlparse(url)
if parsed.scheme not in set(["http", "https"]):
return False

# check if url is within the 4 specified domains
if not within_domains(url):
return False

# check if url is allowed by robots.txt
if not obeys_robots_rules(url):
return False

# check if known trap
if not is_not_known_trap(url):
return False

return not re.match(
r".*\.(css|js|bmp|gif|jpe?g|ico"
+ r"|png|tiff?|mid|mp2|mp3|mp4"
+ r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
+ r"|wav|avi|mov|mpeg|mpg|ram|m4v|mkv|ogg|ogv|pdf"
+ r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names"
+ r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
+ r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso|bib|pov|ff|lif"
+ r"|epub|dll|cnf|tgz|sha1"
+ r"|thmx|mso|arff|rtf|jar|csv"
+ r"|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower())
+ r"|c|cpp|cp|h|xml|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower())

except TypeError:
print ("TypeError for ", parsed)
raise


def within_domains(url):
    """Return True if the URL's host is one of the four allowed UCI domains
    or any subdomain of them (e.g. vision.ics.uci.edu).

    Bug fixed: the original required an exact hostname match (after stripping
    only 'www.'), which rejected every other subdomain even though the worker
    explicitly tracks per-subdomain counts.
    """
    allowed = ['ics.uci.edu', 'cs.uci.edu', 'informatics.uci.edu', 'stat.uci.edu']
    # .hostname is lowercased and excludes any port, unlike .netloc
    hostname = urlparse(url).hostname or ''
    return any(hostname == domain or hostname.endswith('.' + domain)
               for domain in allowed)

def update_stats(url, tokens):
    """Fold one page's tokens into the module-level crawl statistics."""
    global longest_page_url, longest_page_word_count, most_common_words

    # track the page with the highest visible word count
    num_words = len(tokens)
    if num_words > longest_page_word_count:
        longest_page_url = url
        longest_page_word_count = num_words

    # merge this page's token frequencies into the global table
    merge_frequencies(most_common_words, compute_word_frequencies(tokens))

def merge_frequencies(global_word_freqs, word_freqs):
    """Add each token count from word_freqs into global_word_freqs in place."""
    for token, count in word_freqs.items():
        global_word_freqs[token] = global_word_freqs.get(token, 0) + count

def write_to_file(file="crawler_statistics.txt"):
    """Overwrite `file` with the longest-page stat and the 50 most common words."""
    # rank tokens by frequency, most common first (sort is stable)
    ranked = sorted(most_common_words.items(), key=lambda item: item[1], reverse=True)

    with open(file, "w") as f:
        f.write("Longest page URL: ")
        f.write(f"{longest_page_url} ({longest_page_word_count})\n\n")

        f.write("50 most common words:\n")
        for word, freq in ranked[:50]:
            f.write(f"{word}: {freq}\n")


def has_sufficient_content(resp, min_words=100, min_ratio=0.001):
    """Return True when the fetched page carries enough visible text to be
    worth scraping (filters 404 stubs and boilerplate-only pages)."""
    raw = resp.raw_response
    if raw is None or raw.content is None:
        return False

    html = raw.content

    # tiny payloads are probably low-information;
    # 404 error pages are typically 512 bytes
    if len(html) <= 512:
        return False

    # strip elements that never contribute visible text
    soup = BeautifulSoup(html, 'lxml')
    for hidden in soup(["script", "style", "noscript"]):
        hidden.decompose()

    # tokenize only the visible text
    words = tokenize(soup.get_text(separator=' '))

    # require both an absolute word count and a text-to-markup ratio
    word_count = len(words)
    density = word_count / max(len(html), 1)  # max guards against division by 0
    return word_count >= min_words and density >= min_ratio


def is_not_known_trap(url):
    """Return False when the URL matches a known crawler-trap pattern.

    Bug fixed: the original compared every pattern against only the parsed
    path and query, so the host-based pattern ("gitlab.ics.uci.edu") and the
    full-URL patterns could never match, and the trailing "*" in the isg
    events pattern was matched literally. Patterns are now plain substrings
    checked against the whole lowercased URL.
    """
    trap_patterns = ["isg.ics.uci.edu/events/",
                     "gitlab.ics.uci.edu",
                     "fano.ics.uci.edu/ca/rules/",
                     "/calendar", "/events"]

    lowered = url.lower()
    return not any(pattern in lowered for pattern in trap_patterns)


def obeys_robots_rules(url):
    """Return False for paths disallowed by the hard-coded robots.txt rules
    of the four allowed domains, True otherwise.

    NOTE(review): these rules are a manual snapshot of each site's
    robots.txt — verify they are still current.
    """
    parsed = urlparse(url)
    host = parsed.netloc.removeprefix("www.")
    path = parsed.path

    # informatics.uci.edu: everything under /research is disallowed except
    # these subtrees, and /wp-admin/ is disallowed except admin-ajax.php
    research_exceptions = (
        "/wp-admin/admin-ajax.php",
        "/research/labs-centers/",
        "/research/areas-of-expertise/",
        "/research/example-research-projects/",
        "/research/phd-research/",
        "/research/past-dissertations/",
        "/research/masters-research/",
        "/research/undergraduate-research/",
        "/research/gifts-grants/",
    )
    if host.endswith("informatics.uci.edu"):
        if path.startswith("/research") and not path.startswith(research_exceptions):
            return False
        if path.startswith("/wp-admin/") and not path.startswith("/wp-admin/admin-ajax.php"):
            return False

    # stat.uci.edu, ics.uci.edu and cs.uci.edu all disallow /people and
    # /happening (note: "informatics.uci.edu" also ends with "ics.uci.edu",
    # so those rules apply to it as well, matching the original behavior)
    for domain in ("stat.uci.edu", "ics.uci.edu", "cs.uci.edu"):
        if host.endswith(domain) and path.startswith(("/people", "/happening")):
            return False

    return True
Loading