Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
6663379
set useragent to specified value
josephine-33 Feb 2, 2026
455ad8f
oops! fixed useragent value and added student id
josephine-33 Feb 2, 2026
0d2d08d
charity id
charity-fan Feb 2, 2026
a22494d
added uci id
vidhyap22 Feb 2, 2026
39e87a4
add tokenizer file
vidhyap22 Feb 2, 2026
a218eea
added my uci id
katelykvuci Feb 2, 2026
5e421ea
updates to is_valid and extract_next_linke
josephine-33 Feb 4, 2026
48ac738
debugged within_domains
josephine-33 Feb 4, 2026
f3faedf
corrected import statements and requirements doc
josephine-33 Feb 4, 2026
aad2da0
changed user agent
katelykvuci Feb 4, 2026
8ef4997
fixed syntax errors in extract_next_links and added a few debug state…
katelykvuci Feb 4, 2026
2055c3a
Merge branch 'josephine'
josephine-33 Feb 6, 2026
7a3af3f
added tokenizer
charity-fan Feb 6, 2026
f70fa09
added helper function has_sufficient_content to check for low-info / …
josephine-33 Feb 6, 2026
4650052
fixed tokenize(), update global stats
charity-fan Feb 6, 2026
9e1b83e
writing stats to file
charity-fan Feb 6, 2026
18deb2a
added check to is_valid to make sure rules in robots.txt are followed
josephine-33 Feb 6, 2026
7a8e060
filter out stopwords
charity-fan Feb 6, 2026
3d590e2
add hashing
vidhyap22 Feb 6, 2026
67706b2
Merge branch 'master' into add_url_tracking
vidhyap22 Feb 6, 2026
23661be
Merge pull request #1 from josephine-33/add_url_tracking
vidhyap22 Feb 6, 2026
55af5d3
Merge pull request #2 from josephine-33/avoiding-dead-URLs
josephine-33 Feb 6, 2026
602f808
Merge pull request #3 from josephine-33/following-robot-rules
josephine-33 Feb 6, 2026
0ac0049
Merge branch 'master' into collecting-stats
charity-fan Feb 6, 2026
af27bac
Merge pull request #4 from josephine-33/collecting-stats
charity-fan Feb 6, 2026
9c493ea
start pattern detection
vidhyap22 Feb 7, 2026
b0c552f
added check for 607 error code, content too large to scrape
josephine-33 Feb 7, 2026
79f33a7
Merge branch 'master' into avoiding-dead-URLs
josephine-33 Feb 7, 2026
2ce42e0
Merge pull request #5 from josephine-33/avoiding-dead-URLs
josephine-33 Feb 7, 2026
5a62bf4
added url pattern creation functionality
vidhyap22 Feb 7, 2026
947fea3
Merge pull request #6 from josephine-33/add_url_pattern_detection
vidhyap22 Feb 7, 2026
4ec8eb8
add url hashing
vidhyap22 Feb 7, 2026
166f2a5
add max url pattern hits
vidhyap22 Feb 7, 2026
408a1c0
Merge branch 'master' into add_url_pattern_detection
vidhyap22 Feb 7, 2026
6d85a32
Merge pull request #7 from josephine-33/add_url_pattern_detection
vidhyap22 Feb 7, 2026
033266f
fix hashing bugs
vidhyap22 Feb 7, 2026
addfc3a
fix duplicate check
vidhyap22 Feb 7, 2026
ce2b5a1
add letter stopwords
Feb 7, 2026
6c02fe8
add subdomain counts
Feb 7, 2026
8fe82fd
add url/domain stat storage
Feb 7, 2026
11b1442
add error handling
Feb 7, 2026
860c2fa
remove fragment from hashed urls
Feb 7, 2026
84b122a
adding /events back to known traps
josephine-33 Feb 7, 2026
d2f42ab
add trap check
Feb 7, 2026
dbc4900
added mpg to disallowed file extensions
charity-fan Feb 7, 2026
a0a41b8
added |bib|pov|ff|lif to file extensions list
charity-fan Feb 7, 2026
2c995e4
add some invalid extensions
Feb 8, 2026
8764552
add .c extension
Feb 8, 2026
2873e36
changed max url pattern hits threshold to 500
charity-fan Feb 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[IDENTIFICATION]
# Set your user agent string here.
USERAGENT = DEFAULT AGENT
USERAGENT = IR UW26 15082480, 61528703, 24282816, 26671659

[CONNECTION]
HOST = styx.ics.uci.edu
Expand Down
79 changes: 64 additions & 15 deletions crawler/worker.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,83 @@
from threading import Thread

import hashlib
from urllib.parse import urlparse, urlunparse
from inspect import getsource
from utils.download import download
from utils.url_pattern_detection import get_url_pattern_hash
from utils import get_logger
import scraper
import time

from collections import defaultdict
import tldextract
import json

class Worker(Thread):
    """Crawler worker thread.

    Repeatedly pulls a URL from the frontier, downloads it, hands the
    response to scraper.scraper, then filters the discovered links for
    exact duplicates (by URL hash), over-visited URL patterns, over-visited
    subdomains, and excessive path depth before re-queueing them.
    """

    def __init__(self, worker_id, config, frontier):
        self.logger = get_logger(f"Worker-{worker_id}", "Worker")
        self.config = config
        self.frontier = frontier
        # SHA-256 digests of already-queued URLs (scheme and fragment removed).
        self.seen_urls = set()
        # URL-pattern hash -> number of queued URLs matching that pattern.
        self.seen_url_patterns = defaultdict(int)
        # subdomain label -> number of queued URLs under that subdomain.
        self.subdomains_count = defaultdict(int)
        self.counts_stats_file = "count_stats.txt"
        self.MAX_URL_PATTERN_HITS = 500
        self.MAX_SUBDOMAIN_HITS = 10000
        # basic check for requests in scraper
        assert {getsource(scraper).find(req) for req in {"from requests import", "import requests"}} == {-1}, "Do not use requests in scraper.py"
        assert {getsource(scraper).find(req) for req in {"from urllib.request import", "import urllib.request"}} == {-1}, "Do not use urllib.request in scraper.py"
        super().__init__(daemon=True)

    def write_stats(self):
        """Write the running URL/subdomain counters to counts_stats_file as JSON."""
        data = {
            "num_urls": len(self.seen_urls),
            "subdomain_count": dict(self.subdomains_count)
        }
        with open(self.counts_stats_file, "w") as f:
            json.dump(data, f, indent=4)

    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            try:
                resp = download(tbd_url, self.config, self.logger)
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")
                scraped_urls = scraper.scraper(tbd_url, resp)
                for scraped_url in scraped_urls:
                    # Hash a normalized form (no scheme, no fragment) so that
                    # http/https and fragment variants dedupe to one entry.
                    parsed_url = urlparse(scraped_url)._replace(scheme='', fragment="")
                    url_str = urlunparse(parsed_url)
                    hashed_url = hashlib.sha256(url_str.encode('utf-8')).hexdigest()
                    hashed_url_pattern = get_url_pattern_hash(scraped_url)

                    if hashed_url in self.seen_urls:
                        self.logger.debug("Hashed url already seen...skipping")
                        continue

                    if self.seen_url_patterns[hashed_url_pattern] >= self.MAX_URL_PATTERN_HITS:
                        self.logger.debug(f"Hashed url pattern reached its limit: {hashed_url_pattern}")
                        continue

                    # NOTE(review): parsed_url.hostname can be None for malformed
                    # links; assumes scraper only yields absolute http(s) URLs — confirm.
                    curr_subdomain = tldextract.extract(parsed_url.hostname).subdomain
                    if self.subdomains_count[curr_subdomain] >= self.MAX_SUBDOMAIN_HITS:
                        # bug fix: original referenced the undefined name `subdomain` here
                        self.logger.debug(f"Subdomain has reached its limit: {curr_subdomain}")
                        continue

                    self.subdomains_count[curr_subdomain] += 1

                    # Skip very deep paths (6+ non-empty segments), a common trap shape.
                    depth = len([segment for segment in parsed_url.path.split('/') if segment])
                    if depth >= 6:
                        self.logger.debug("URL depth is 6 or more...skipping")
                        continue

                    self.seen_url_patterns[hashed_url_pattern] += 1
                    self.seen_urls.add(hashed_url)
                    # Persist stats every 10th newly seen URL.
                    if len(self.seen_urls) % 10 == 0:
                        self.write_stats()
                    self.frontier.add_url(scraped_url)
                self.frontier.mark_url_complete(tbd_url)
                time.sleep(self.config.time_delay)
            except Exception:
                # Best-effort: log and move on rather than killing the thread.
                # NOTE(review): the failed URL is never marked complete in the
                # frontier — confirm the frontier tolerates that.
                self.logger.exception(f"Error while processing {tbd_url}; skipping.")
                continue
53 changes: 53 additions & 0 deletions crawler_statistics.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
Longest page URL: https://www.stat.uci.edu/research (651)

50 most common words:
statistics: 56
uci: 39
professor: 39
edu: 30
department: 24
research: 24
data: 24
dbh: 24
ph: 22
graduate: 20
faculty: 19
sciences: 17
statistical: 15
news: 14
university: 14
interests: 14
directory: 13
opportunities: 12
science: 12
information: 11
computer: 11
contact: 11
current: 11
analysis: 11
bayesian: 11
chancellor: 11
donald: 10
bren: 10
school: 10
degrees: 10
course: 10
listings: 10
students: 10
chair: 9
seminars: 9
seminar: 9
assistant: 9
internships: 8
employment: 8
job: 8
archive: 8
949: 8
824: 8
student: 7
undergraduate: 7
yu: 7
inference: 7
resources: 6
interdisciplinary: 6
biostatistics: 6
5 changes: 4 additions & 1 deletion packages/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
cbor
requests
requests
beautifulsoup4
lxml
tldextract
202 changes: 197 additions & 5 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import re
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin, urldefrag
from bs4 import BeautifulSoup
from utils.tokenizer import tokenize, compute_word_frequencies
from itertools import islice

longest_page_url = None
longest_page_word_count = 0
most_common_words = {}

def scraper(url, resp):
links = extract_next_links(url, resp)
def extract_next_links(url, resp):
    """Extract valid out-links from a downloaded page.

    url  -- the URL that was requested.
    resp -- the download response; resp.status is the HTTP status code and
            resp.raw_response.content holds the page bytes.
    Returns a list of absolute, fragment-free URLs that pass is_valid.
    Side effects: updates the global crawl statistics and rewrites the
    statistics file for every page that passes the content checks.
    """
    # if we can't scrape, return empty list
    if resp.status != 200 or not is_valid(resp.url):
        return []

    # if page was marked too large by the server (custom 607 error), skip it
    if hasattr(resp, 'error') and resp.error and "607" in str(resp.error):
        return []

    # if page is low-information, skip it (this also guards against
    # resp.raw_response being None, so parsing below is safe)
    if not has_sufficient_content(resp):
        return []

    # Parse the page once and reuse the soup for both statistics and link
    # extraction (the original parsed the same content twice).
    soup = BeautifulSoup(resp.raw_response.content, 'lxml')

    # tokenize visible text and fold it into the global statistics
    tokens = tokenize(soup.get_text(separator=' '))
    update_stats(url, tokens)
    write_to_file()

    next_links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue
        # resolve relative links and strip fragments before validation
        absolute_url, _ = urldefrag(urljoin(resp.url, href))
        if is_valid(absolute_url):
            next_links.append(absolute_url)
    return next_links

def is_valid(url):
# Decide whether to crawl this url or not.
Expand All @@ -25,16 +80,153 @@ def is_valid(url):
parsed = urlparse(url)
if parsed.scheme not in set(["http", "https"]):
return False

# check if url is within the 4 specified domains
if not within_domains(url):
return False

# check if url is allowed by robots.txt
if not obeys_robots_rules(url):
return False

# check if known trap
if not is_not_known_trap(url):
return False

return not re.match(
r".*\.(css|js|bmp|gif|jpe?g|ico"
+ r"|png|tiff?|mid|mp2|mp3|mp4"
+ r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
+ r"|wav|avi|mov|mpeg|mpg|ram|m4v|mkv|ogg|ogv|pdf"
+ r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names"
+ r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
+ r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso|bib|pov|ff|lif"
+ r"|epub|dll|cnf|tgz|sha1"
+ r"|thmx|mso|arff|rtf|jar|csv"
+ r"|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower())
+ r"|c|cpp|cp|h|xml|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower())

except TypeError:
print ("TypeError for ", parsed)
raise


def within_domains(url):
    """Return True if the URL's host is one of the four allowed UCI domains
    or any subdomain of them (e.g. vision.ics.uci.edu).

    Bug fixed: the original required an exact hostname match (after stripping
    only 'www.'), which rejected every other subdomain even though the worker
    explicitly tracks per-subdomain counts.
    """
    allowed = ['ics.uci.edu', 'cs.uci.edu', 'informatics.uci.edu', 'stat.uci.edu']
    # .hostname is lowercased and excludes any port, unlike .netloc
    hostname = urlparse(url).hostname or ''
    return any(hostname == domain or hostname.endswith('.' + domain)
               for domain in allowed)

def update_stats(url, tokens):
    """Fold one page's tokens into the module-level crawl statistics."""
    global longest_page_url, longest_page_word_count, most_common_words

    # track the page with the highest visible word count
    num_words = len(tokens)
    if num_words > longest_page_word_count:
        longest_page_url = url
        longest_page_word_count = num_words

    # merge this page's token frequencies into the global table
    merge_frequencies(most_common_words, compute_word_frequencies(tokens))

def merge_frequencies(global_word_freqs, word_freqs):
    """Add each token count from word_freqs into global_word_freqs in place."""
    for token, count in word_freqs.items():
        global_word_freqs[token] = global_word_freqs.get(token, 0) + count

def write_to_file(file="crawler_statistics.txt"):
    """Overwrite `file` with the longest-page stat and the 50 most common words."""
    # rank tokens by frequency, most common first (sort is stable)
    ranked = sorted(most_common_words.items(), key=lambda item: item[1], reverse=True)

    with open(file, "w") as f:
        f.write("Longest page URL: ")
        f.write(f"{longest_page_url} ({longest_page_word_count})\n\n")

        f.write("50 most common words:\n")
        for word, freq in ranked[:50]:
            f.write(f"{word}: {freq}\n")


def has_sufficient_content(resp, min_words=100, min_ratio=0.001):
    """Return True when the fetched page carries enough visible text to be
    worth scraping (filters 404 stubs and boilerplate-only pages)."""
    raw = resp.raw_response
    if raw is None or raw.content is None:
        return False

    html = raw.content

    # tiny payloads are probably low-information;
    # 404 error pages are typically 512 bytes
    if len(html) <= 512:
        return False

    # strip elements that never contribute visible text
    soup = BeautifulSoup(html, 'lxml')
    for hidden in soup(["script", "style", "noscript"]):
        hidden.decompose()

    # tokenize only the visible text
    words = tokenize(soup.get_text(separator=' '))

    # require both an absolute word count and a text-to-markup ratio
    word_count = len(words)
    density = word_count / max(len(html), 1)  # max guards against division by 0
    return word_count >= min_words and density >= min_ratio


def is_not_known_trap(url):
    """Return False when the URL matches a known crawler-trap pattern.

    Bug fixed: the original compared every pattern against only the parsed
    path and query, so the host-based pattern ("gitlab.ics.uci.edu") and the
    full-URL patterns could never match, and the trailing "*" in the isg
    events pattern was matched literally. Patterns are now plain substrings
    checked against the whole lowercased URL.
    """
    trap_patterns = ["isg.ics.uci.edu/events/",
                     "gitlab.ics.uci.edu",
                     "fano.ics.uci.edu/ca/rules/",
                     "/calendar", "/events"]

    lowered = url.lower()
    return not any(pattern in lowered for pattern in trap_patterns)


def obeys_robots_rules(url):
    """Return False for paths disallowed by the hard-coded robots.txt rules
    of the four allowed domains, True otherwise.

    NOTE(review): these rules are a manual snapshot of each site's
    robots.txt — verify they are still current.
    """
    parsed = urlparse(url)
    host = parsed.netloc.removeprefix("www.")
    path = parsed.path

    # informatics.uci.edu: everything under /research is disallowed except
    # these subtrees, and /wp-admin/ is disallowed except admin-ajax.php
    research_exceptions = (
        "/wp-admin/admin-ajax.php",
        "/research/labs-centers/",
        "/research/areas-of-expertise/",
        "/research/example-research-projects/",
        "/research/phd-research/",
        "/research/past-dissertations/",
        "/research/masters-research/",
        "/research/undergraduate-research/",
        "/research/gifts-grants/",
    )
    if host.endswith("informatics.uci.edu"):
        if path.startswith("/research") and not path.startswith(research_exceptions):
            return False
        if path.startswith("/wp-admin/") and not path.startswith("/wp-admin/admin-ajax.php"):
            return False

    # stat.uci.edu, ics.uci.edu and cs.uci.edu all disallow /people and
    # /happening (note: "informatics.uci.edu" also ends with "ics.uci.edu",
    # so those rules apply to it as well, matching the original behavior)
    for domain in ("stat.uci.edu", "ics.uci.edu", "cs.uci.edu"):
        if host.endswith(domain) and path.startswith(("/people", "/happening")):
            return False

    return True
Loading