Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 59 additions & 22 deletions grab.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,64 +39,97 @@ def get_data(self):


def strip_tags(html):
    """Strip HTML markup from *html* and return the remaining text.

    The markup is removed by feeding *html* through ``MLStripper`` and
    reading back its collected data. Any literal ``<pre>``/``<code>``
    markers still present in that text are then rewritten to
    reStructuredText (a ``code-block`` directive and backticks).
    """
    s = MLStripper()
    s.feed(html)
    stripped_html = s.get_data()

    # Handle <pre> and <code> tags manually for code snippets.
    # NOTE(review): this runs on the already-stripped text; whether any
    # <pre>/<code> markers survive MLStripper is not visible from here --
    # confirm these replacements can actually match.
    stripped_html = stripped_html.replace('<pre>', '\n.. code-block:: python\n\n ')
    stripped_html = stripped_html.replace('</pre>', '\n')
    stripped_html = stripped_html.replace('<code>', '`')
    stripped_html = stripped_html.replace('</code>', '`')

    return stripped_html


def remove_indent(text):
    """Return *text* with leading and trailing whitespace stripped from each line.

    The lines are re-joined with a single newline character, so the result
    never ends with a trailing newline.
    """
    return "\n".join(row.strip() for row in text.splitlines())


def html2rst_allign_post(text):
    """Converts HTML content into a reStructuredText ``raw:: html`` block.

    The post is split into lines, a ``<!-- TEASER_END -->`` marker is
    inserted at line index 3 (or appended when the post is shorter), the
    ``<pre>``/``<code>`` tags are rewritten, and every resulting line is
    stripped, indented and placed under a ``.. raw:: html`` directive.

    Args:
        text: HTML source of the post.

    Returns:
        The rst text as a single newline-joined string.

    Raises:
        ValueError: if *text* is empty.
    """
    if not text:
        raise ValueError("Empty post!")

    lines = text.splitlines()

    # A post delivered as one long line is broken after every "br>" and
    # "/p>" so the teaser marker can land between paragraphs.
    if len(lines) < 2:
        single = "\n".join(lines)
        lines = single.replace("br>", "br>\n").replace("/p>", "/p>\n").splitlines()

    # Insert teaser marker (list.insert appends when there are fewer than
    # three lines).
    lines.insert(3, "<!-- TEASER_END -->")

    # Convert <pre> and <code> HTML tags into rst code block formatting.
    # This is done on the text that already holds the teaser marker, so the
    # marker is not lost when the lines are rebuilt below.
    # NOTE(review): the output is emitted under ``.. raw:: html``; confirm
    # that rst markup is really wanted inside a raw HTML block.
    body = "\n".join(lines)
    body = body.replace('<pre>', '\n.. code-block:: python\n\n ')
    body = body.replace('</pre>', '\n')
    body = body.replace('<code>', '`')
    body = body.replace('</code>', '`')

    # Directive content must be indented relative to the directive line.
    indented = [" " + x.strip() for x in body.splitlines()]
    return "\n".join([".. raw:: html", ""] + indented)


def grab_student(last_date, rss_url, project, student, season):
"""Fetches blog posts of students and processes them into rst files."""
feed = feedparser.parse(rss_url)
dates = [last_date]

for item in feed['items']:
item_date = dt.datetime.fromtimestamp(time.mktime(item['published_parsed']))
# item_date.tm_zone; tm_gmtoff
# wordpress; time.struct_time

# Filter posts after the last grabbed date
if item_date > last_date:
# Handle Medium posts specifically related to GSoC
if "https://medium" in rss_url and not any('gsoc' in x.get('term').lower() for x in item.get('tags', [{'term': ''}])):
continue

print("#################### New post!")
pp.pprint(item)
dates.append(item_date)

Comment on lines -76 to +103
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably shouldn't be deleted; otherwise the function won't be able to return the maximum date at the end.

# Create directories for storing posts
directory = os.path.join('posts', '{:%Y}'.format(item_date), '{:%m}'.format(item_date))
if not os.path.exists(directory):
os.makedirs(directory)

filename = '{date:%Y%m%d_%H%M}_{student}.rst'.format(date=item_date, student=student)
fullcontent = ""

# Fetch content, either HTML or plain text
try:
html = "html" in item['content'][0]['type']
content = item['content'][0]['value']
except KeyError:
html = "html" in item['summary_detail']['type']
content = item['summary']
content = item['summary']

try:
# Convert the content to rst with correct handling of HTML
fullcontent = html2rst_allign_post(content) if html else strip_tags(content)
except ValueError as e:
# post contains no text
dates.pop() # removing the last added date for the empty post
# Post contains no text
dates.pop() # Removing the last added date for empty post
print("#################### Empty post!")
continue

with open(os.path.join(directory, filename), 'w') as post:
# some posts have an empty title, taking the first 30 characters.
title_post = item['title'] if item['title'] != '' else strip_tags(item['summary'])[:30]+'...'
author = item.get('author_detail', {'name': student}) # Not everyone got their author name in their blog :(
# Write post to file
with open(os.path.join(directory, filename), 'w', encoding="utf-8") as post:
title_post = item['title'] if item['title'] != '' else strip_tags(item['summary'])[:30] + '...'
author = item.get('author_detail', {'name': student}) # Fallback to student's name if author is missing
summary = remove_indent(strip_tags(item['summary'][:300]))
post.write(TEMPLATE.format(title=title_post,
date=item_date,
Expand All @@ -105,33 +138,37 @@ def grab_student(last_date, rss_url, project, student, season):
link=item['link'],
category=season,
post=fullcontent,
summary=summary,
))
return(max(dates))
summary=summary))
return max(dates)


# Load student times from YAML file
with open('gsoc_times.yml', 'r') as file_times:
    # load_all yields documents lazily, so it has to be consumed while the
    # file is still open; only the last document in the stream is kept.
    levels = yaml.load_all(file_times, Loader=yaml.BaseLoader)
    for level in levels:
        students_times = level

# Load student RSS feed details from YAML file
with open('gsoc.yml', 'r') as stream:
    list_seasons = yaml.load(stream, Loader=yaml.BaseLoader)

# Timezone-aware replacement for the deprecated datetime.utcnow().
current_year = dt.datetime.now(dt.timezone.utc).year

for season, list_students in list_seasons.items():
    # The season key carries the year from its fifth character onwards;
    # seasons from earlier years are skipped.
    yearseason = int(season[4:])
    if yearseason < current_year:
        continue
    for student, propers in list_students.items():
        print(f"{student} : {propers['rss_feed']}")
        print(f"{student} : {propers['project']}")

        # Parse the stored timestamp string into a datetime
        students_times[student] = dt.datetime.strptime(students_times[student], '%Y-%m-%d %H:%M:%S')

        # Grab new posts; the returned date becomes the new stored time
        students_times[student] = grab_student(students_times[student],
                                               propers['rss_feed'],
                                               propers['project'],
                                               student,
                                               season)


# Save the updated student times back to YAML
with open('gsoc_times.yml', 'w') as file_times:
    file_times.write(yaml.dump(students_times, default_flow_style=False))
# write page of students with css time_ranges