-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_scraper.py
More file actions
117 lines (92 loc) · 4.09 KB
/
Copy pathtest_scraper.py
File metadata and controls
117 lines (92 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""Test script to validate scraper with 50 reviews from Chase Mobile"""
import time
from collections import Counter
from pathlib import Path

from data_collection.config import BANKING_APPS, TEST_REVIEW_COUNT
from data_collection.scraper import PlayStoreScraper
def main():
    """Run a quick test scrape of Chase Mobile and print a quality report.

    The function waits for the user to press Enter, asks the scraper for
    ``TEST_REVIEW_COUNT`` reviews of the "Chase Mobile" entry in
    ``BANKING_APPS``, saves whatever was returned to
    ``data/raw/test_reviews_chase.csv`` and then prints:

    * how many reviews were collected and how long the scrape took,
    * the truncation rate and the average review length,
    * the star-rating distribution as a small bar chart,
    * a time estimate for the full 25,000-review dataset,
    * warnings for high truncation, short reviews or a polarised rating
      distribution, and the suggested next steps.

    When no reviews come back, only a pointer to the scraper log is printed.
    """
    banner = "=" * 60

    print(banner)
    print("GOOGLE PLAY STORE SCRAPER - TEST RUN")
    print(banner)
    print(f"\nTarget: {TEST_REVIEW_COUNT} reviews from Chase Mobile")
    print("This will help validate:")
    print(" 1. Review quality (truncation rate)")
    print(" 2. Scrape speed (estimate time for full dataset)")
    print(" 3. Schema validation")
    print(" 4. Data distribution\n")
    input("Press Enter to start scraping...")

    scraper = PlayStoreScraper()

    # The test run targets a single app only.
    test_app_name = "Chase Mobile"
    test_app_id = BANKING_APPS[test_app_name]

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments that happen while the scrape is running.
    start_time = time.perf_counter()
    reviews = scraper.scrape_app_reviews(
        app_name=test_app_name,
        app_id=test_app_id,
        target_count=TEST_REVIEW_COUNT,
        checkpoint_path=Path("data/raw/test_checkpoint.csv"),
    )
    duration = time.perf_counter() - start_time

    # The results are saved before the analysis, whether or not any reviews
    # were collected.
    output_path = Path("data/raw/test_reviews_chase.csv")
    scraper.save_reviews(reviews, output_path)

    print("\n" + banner)
    print("TEST RESULTS")
    print(banner)

    review_count = len(reviews)
    if review_count == 0:
        # Nothing to analyse; every statistic below divides by review_count.
        print("❌ No reviews collected. Check logs/scraper.log for errors")
        print(banner)
        return

    truncated_count = sum(1 for r in reviews if r.is_truncated)
    truncation_rate = truncated_count / review_count
    avg_length = sum(len(r.review_text) for r in reviews) / review_count
    rating_counts = Counter(r.star_rating for r in reviews)
    seconds_per_review = duration / review_count

    print(f"\n✓ Reviews collected: {review_count}/{TEST_REVIEW_COUNT}")
    print(
        f"✓ Time taken: {duration:.1f} seconds "
        f"({seconds_per_review:.2f}s per review)"
    )

    print("\nData Quality:")
    print(
        f" - Truncation rate: {truncation_rate:.1%} "
        f"({truncated_count}/{review_count})"
    )
    print(f" - Avg review length: {avg_length:.0f} characters")

    print("\nRating Distribution:")
    for rating in sorted(rating_counts):
        count = rating_counts[rating]
        pct = count / review_count * 100
        bar = "█" * int(pct / 2)  # one bar character per 2 percentage points
        print(f" {rating} stars: {count:3d} ({pct:5.1f}%) {bar}")

    # Extrapolate the measured per-review cost to the full dataset.
    reviews_needed = 5000 * 5  # 5 apps, 5K each
    estimated_time = seconds_per_review * reviews_needed
    print(f"\nEstimated time for full dataset ({reviews_needed:,} reviews):")
    print(f" {estimated_time / 3600:.1f} hours")

    # Quality warnings
    print("\n" + banner)
    if truncation_rate > 0.3:
        print("⚠️ WARNING: High truncation rate (>30%)")
        print(" Consider: This may limit feature extraction quality")
    if avg_length < 50:
        print("⚠️ WARNING: Short average review length (<50 chars)")
        print(" Consider: Filter or oversample longer reviews")

    # Class imbalance: more than 70% of reviews at the two extremes.
    one_star = rating_counts.get(1, 0)
    five_star = rating_counts.get(5, 0)
    if (one_star + five_star) / review_count > 0.7:
        print("⚠️ WARNING: Rating distribution skewed toward 1-star and 5-star")
        print(" Consider: Oversample 2-4 star reviews for balance")
    print(banner)

    print(f"\n✓ Test data saved to: {output_path}")
    print("\nNext steps:")
    print(" 1. Review the CSV file to inspect review quality")
    print(" 2. If quality looks good, update config.py:")
    print(" - Change TEST_REVIEW_COUNT to REVIEWS_PER_APP (5000)")
    print(" 3. Run full scrape with all 5 banking apps")
    print(" 4. Proceed to feature extraction with Phi-3/BERT")

    print(banner)
# Run the test scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()