lbdstreaming/letterboxd_streaming.py at main · Bontrey/lbdstreaming · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
#!/usr/bin/env python3
"""
Script to scrape top 12 popular films from Letterboxd and check streaming availability
directly from Letterboxd's "Where to watch" section on each film's page.

Uses Selenium to handle JavaScript-rendered content.
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
from tabulate import tabulate
import time
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# JSON file in which scraped film info is persisted between runs.
# The path is relative, so it resolves against the current working directory.
CACHE_FILE = 'streaming_cache.json'


def load_cache(path=None):
    """Load cached streaming info from file.

    Args:
        path: Optional location of the cache file; defaults to ``CACHE_FILE``.

    Returns:
        The cached mapping, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    cache_path = CACHE_FILE if path is None else path
    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError (corrupt contents).
        return {}

    # Callers index the cache by key, so anything but a JSON object
    # (e.g. a list or null) is treated as an unusable cache.
    return data if isinstance(data, dict) else {}


def save_cache(cache, path=None):
    """Save streaming info cache to file.

    The data is written to a sibling temporary file first and then moved
    over the real cache file, so a failed or interrupted write never leaves
    a truncated cache behind. Failures are reported as a warning; saving the
    cache is best-effort and never raises.

    Args:
        cache: JSON-serializable mapping to persist.
        path: Optional location of the cache file; defaults to ``CACHE_FILE``.
    """
    cache_path = CACHE_FILE if path is None else path
    tmp_path = f"{cache_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(cache, f, indent=2)
        # Swap the fully written file into place in a single step.
        os.replace(tmp_path, cache_path)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: the cache holds something json cannot encode.
        print(f"Warning: Could not save cache: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            # Nothing to clean up (the temp file was never created).
            pass


def setup_driver():
    """Create a headless Chrome WebDriver.

    Returns:
        The driver instance, or None (after printing installation hints)
        when Chrome could not be started.
    """
    browser_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    )

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)

    try:
        return webdriver.Chrome(options=opts)
    except Exception as exc:
        for message in (
            f"Error setting up Chrome driver: {exc}",
            "Make sure you have Chrome and chromedriver installed.",
            "You can install chromedriver via: brew install chromedriver (macOS)",
        ):
            print(message)
        return None


def _slug_from_href(href):
    """Return the path segment that follows '/film/' in *href* ('' if none)."""
    _, found, tail = href.partition('/film/')
    if not found:
        return ''
    # Keep only the slug itself, dropping trailing slashes or sub-pages.
    return tail.strip('/').split('/', 1)[0]


def _film_from_container(container):
    """Build a {'title', 'url'} dict from one poster list item, or None."""
    poster_div = container.find('div', attrs={'data-film-slug': True})
    if poster_div:
        film_slug = poster_div.get('data-film-slug')
        img = poster_div.find('img')
    else:
        # Fallback markup: take the slug from the first film link.
        anchor = container.find('a', href=True)
        if not anchor:
            return None
        film_slug = _slug_from_href(anchor['href'])
        img = container.find('img')

    if not film_slug:
        return None

    # Prefer the poster's alt text; fall back to a title derived from the
    # slug when the alt text is missing or empty.
    alt_text = (img.get('alt') or '') if img else ''
    title = alt_text.replace('Poster for ', '').strip()
    if not title:
        title = film_slug.replace('-', ' ').title()

    return {
        'title': title,
        'url': f"https://letterboxd.com/film/{film_slug}/",
    }


def scrape_letterboxd_popular(driver, limit=12):
    """Scrape the first popular films from Letterboxd with their URLs.

    Args:
        driver: Selenium WebDriver used to load the page.
        limit: Maximum number of films to return (default 12).

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts in page order; empty
        when the page did not load in time or no film could be parsed.
    """
    url = "https://letterboxd.com/films/popular/this/week/"

    print(f"Fetching popular films from {url}...")
    driver.get(url)

    # Wait for the film posters to load
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "li.poster-container, ul.poster-list li"))
        )
    except TimeoutException:
        print("Timeout waiting for films to load")
        return []

    # Give it a moment for all content to render
    time.sleep(2)

    # Get the page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Try the known container selectors in order; the first that matches wins.
    film_containers = []
    for selector in ('li.poster-container', 'ul.poster-list li', 'li.listitem'):
        film_containers = soup.select(selector)
        if film_containers:
            break

    # Count parsed films rather than raw containers, so an unusable list
    # item does not shrink the result below `limit`.
    films = []
    for container in film_containers:
        if len(films) >= limit:
            break
        film = _film_from_container(container)
        if film:
            films.append(film)

    return films


# Placeholder returned when a film page exposes no usable streaming data.
_NO_STREAMING_INFO = "No streaming info available"


def _extract_rating(soup):
    """Return the rating from the page's JSON-LD data as a float, or None."""
    for script in soup.find_all('script', type='application/ld+json'):
        content = script.string if script.string else script.get_text()
        if not content:
            continue

        # Remove CDATA comments if present
        content = content.strip()
        if content.startswith('/* <![CDATA['):
            content = content.replace('/* <![CDATA[ */', '', 1)
            content = content.replace('/* ]]> */', '', 1)

        try:
            data = json.loads(content.strip())
        except ValueError:
            continue

        # JSON-LD may be a single object or a list of objects.
        for item in data if isinstance(data, list) else [data]:
            if not isinstance(item, dict):
                continue
            aggregate = item.get('aggregateRating')
            if not isinstance(aggregate, dict):
                continue
            # Normalise to float so callers can format the value numerically.
            try:
                return float(aggregate.get('ratingValue'))
            except (TypeError, ValueError):
                return None
    return None


def _extract_streaming(soup):
    """Return a comma-separated service list or a short status message."""
    watch_div = soup.find('div', id='watch')
    if not watch_div:
        return _NO_STREAMING_INFO

    services_section = watch_div.find('section', class_='services')
    if services_section:
        services = []
        for service_p in services_section.find_all('p', class_='service'):
            # The service name is encoded as a '-name' CSS class.
            for cls in service_p.get('class', []):
                if cls.startswith('-') and cls != '-showmore':
                    services.append(cls[1:].replace('-', ' ').title())
                    break
        if services:
            # dict.fromkeys removes duplicates while preserving order.
            return ', '.join(dict.fromkeys(services))
        return _NO_STREAMING_INFO

    # No services section: fall back to whatever message the watch div shows.
    # A newline separator keeps text fragments apart so that headers can be
    # filtered out line by line.
    text = watch_div.get_text(separator='\n', strip=True)
    if text and 'trailer' not in text.lower():
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        filtered_lines = [
            line for line in lines
            if line.lower() not in ('where to watch', 'trailer')
        ]
        if filtered_lines:
            return filtered_lines[0][:100]
    return _NO_STREAMING_INFO


def scrape_film_info(driver, film_url):
    """Scrape streaming info and rating from a film's Letterboxd page.

    Args:
        driver: Selenium WebDriver used to load the page.
        film_url: URL of the film page.

    Returns:
        A dict with ``'streaming'`` (service names, a status message, or an
        ``"Error: ..."`` string) and ``'rating'`` (float or None). Never
        raises; any failure is reported through the ``'streaming'`` value.
    """
    try:
        driver.get(film_url)

        # Fixed pause after navigation before the page source is read.
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return {
            'streaming': _extract_streaming(soup),
            'rating': _extract_rating(soup)
        }

    except Exception as e:
        return {
            'streaming': f"Error: {str(e)}",
            'rating': None
        }


def scrape_film_worker(film_index, film):
    """Scrape one film's info using a dedicated driver.

    Returns a result dict carrying the film's index, title, url, streaming
    text, rating and a ``cached`` flag (always False here). When no driver
    can be created, the streaming text holds an error message instead.
    """
    browser = setup_driver()

    # Defaults describe the "driver could not be created" outcome.
    streaming_text = "Error: Could not create driver"
    rating_value = None

    if browser:
        try:
            info = scrape_film_info(browser, film['url'])
            streaming_text = info['streaming']
            rating_value = info['rating']
        finally:
            # Always release the browser, even if scraping raised.
            browser.quit()

    return {
        'index': film_index,
        'title': film['title'],
        'url': film['url'],
        'streaming': streaming_text,
        'rating': rating_value,
        'cached': False
    }


# Prefixes of `streaming` values that carry no real availability data
# (status messages, the scraper's "no info" placeholder, and errors).
_UNCACHEABLE_PREFIXES = ('Not streaming', 'No streaming info', 'Error:')


def _is_cacheable(streaming):
    """Return True when *streaming* names actual services worth caching."""
    return bool(streaming) and not streaming.startswith(_UNCACHEABLE_PREFIXES)


def _format_rating(rating):
    """Render *rating* with one decimal, or "N/A" if missing/non-numeric."""
    try:
        return f"{float(rating):.1f}"
    except (TypeError, ValueError):
        return "N/A"


def _truncate(text, limit):
    """Shorten *text* to *limit* characters, marking the cut with '...'."""
    return text[:limit] + '...' if len(text) > limit else text


def main():
    """Print a table of popular Letterboxd films with rating and streaming info.

    Films found in the cache are reported from it; the rest are scraped in
    parallel worker threads, and results that name actual streaming services
    are written back to the cache as they arrive.
    """
    max_workers = 3

    print("=" * 70)
    print("LETTERBOXD POPULAR FILMS - STREAMING AVAILABILITY")
    print("=" * 70)
    print()

    # Load cache
    cache = load_cache()
    if not isinstance(cache, dict):
        # The code below needs a mapping; ignore an unusable cache.
        cache = {}
    print(f"Loaded cache with {len(cache)} film(s)\n")

    # Set up the Selenium driver
    driver = setup_driver()
    if not driver:
        return

    # Scrape Letterboxd popular films. This driver is only needed for the
    # listing page (each worker creates its own), so release it right away.
    try:
        films = scrape_letterboxd_popular(driver)
    except Exception as e:
        print(f"Error scraping Letterboxd: {e}")
        return
    finally:
        driver.quit()

    print(f"Found {len(films)} films\n")
    if not films:
        print("No films found. The page structure may have changed.")
        print("Please check the Letterboxd website manually.")
        return

    # Separate cached and non-cached films
    cached_results = []
    films_to_fetch = []

    for film_index, film in enumerate(films, start=1):
        cached_data = cache.get(film['url'])
        # A malformed (non-dict) entry is treated as a cache miss.
        if isinstance(cached_data, dict):
            cached_results.append({
                'index': film_index,
                'title': film['title'],
                'url': film['url'],
                'streaming': cached_data.get('streaming') or 'No streaming info available',
                'rating': cached_data.get('rating'),
                'cached': True
            })
            print(f"⚡ Cached: {film['title']}")
        else:
            films_to_fetch.append((film_index, film))

    print(f"\nFetching {len(films_to_fetch)} film(s) with max {max_workers} concurrent requests...\n")

    # Fetch non-cached films in parallel
    fetched_results = []
    if films_to_fetch:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit only non-cached films
            futures = {
                executor.submit(scrape_film_worker, film_index, film): (film_index, film)
                for film_index, film in films_to_fetch
            }

            # Collect results as they complete
            for future in as_completed(futures):
                film_index, film = futures[future]
                # Guard only the worker call, so a later failure cannot add
                # a second (error) row for a film that was already recorded.
                try:
                    result = future.result()
                except Exception as e:
                    print(f"✗ Error fetching {film['title']}: {e}")
                    fetched_results.append({
                        'index': film_index,
                        'title': film['title'],
                        'url': film['url'],
                        'streaming': f"Error: {str(e)}",
                        'rating': None,
                        'cached': False
                    })
                    continue

                fetched_results.append(result)

                # Only cache if we found actual streaming services
                # (not errors or "no info").
                if _is_cacheable(result['streaming']):
                    cache[result['url']] = {
                        'title': result['title'],
                        'streaming': result['streaming'],
                        'rating': result['rating']
                    }
                    # Saved after every film so progress survives an
                    # interrupted run.
                    save_cache(cache)
                    print(f"✓ Fetched & Cached: {result['title']}")
                else:
                    print(f"✓ Fetched (not cached): {result['title']}")

    # Combine cached and fetched results, restoring the original page order
    results = cached_results + fetched_results
    results.sort(key=lambda x: x['index'])

    print("\n" + "=" * 100)
    print("RESULTS")
    print("=" * 100 + "\n")

    cached_count = sum(1 for r in results if r.get('cached', False))
    fetched_count = len(results) - cached_count
    print(f"Summary: {cached_count} from cache, {fetched_count} newly fetched\n")

    # Create table data
    table_data = []
    for result in results:
        # Clean up streaming info
        streaming = result['streaming'] or 'No streaming info available'
        if streaming.startswith('Not streaming'):
            streaming = 'Not streaming'

        table_data.append([
            _truncate(result['title'], 40),
            _format_rating(result.get('rating')),
            _truncate(streaming, 50)
        ])

    # Print table with headers
    headers = ["Film", "Rating", "Streaming Availability"]
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
    print("\n" + "=" * 100)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()