Skip to content

Commit 65ef050

Browse files
feederbox826 and Copilot committed
[py_common/proxy] backend-based workers
Co-authored-by: Copilot <copilot@github.com>
1 parent ef3eb38 commit 65ef050

1 file changed

Lines changed: 129 additions & 71 deletions

File tree

scrapers/py_common/proxy.py

Lines changed: 129 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -21,14 +21,94 @@
2121
import requests
2222
import cloudscraper
2323

24-
def _is_blocked(res):
25-
if res.status_code not in (403, 503):
26-
return False
27-
body = res.text[:4096].lower()
28-
return any(marker in body for marker in (
29-
"cloudflare", "cf-ray", "cf-chl", "challenge-platform",
30-
"just a moment", "attention required",
31-
))
24+
class RequestBackend:
    """Base class for request backends.

    Subclasses set ``name`` and override ``request``; ``is_blocked`` is a
    shared helper for recognising block/challenge responses.
    """

    name = "backend"

    @staticmethod
    def is_blocked(res):
        """Return True when ``res`` looks like a Cloudflare block page.

        A response counts as blocked only if its status is 403 or 503 and
        one of the marker strings appears in the first 4096 characters of
        its lower-cased body.
        """
        if res.status_code in (403, 503):
            head = res.text[:4096].lower()
            for marker in (
                "cloudflare", "cf-ray", "cf-chl", "challenge-platform",
                "just a moment", "attention required",
            ):
                if marker in head:
                    return True
        return False

    def request(self, method, url, **kwargs):
        """Perform a request; must be implemented by subclasses."""
        raise NotImplementedError
39+
40+
class RequestsBackend(RequestBackend):
    """Backend that sends requests through a plain ``requests.Session``."""

    name = "requests"

    def __init__(self, proxies=None, useragent=None):
        """Create the session and apply optional proxy / User-Agent settings.

        Args:
            proxies: Mapping assigned to ``session.proxies`` when truthy.
            useragent: ``"inherit"`` to use the value from ``get_useragent()``,
                any other truthy string to use it verbatim, or a falsy value
                to leave the session's default User-Agent untouched.
        """
        self.session = requests.Session()
        if proxies:
            self.session.proxies = proxies
        if useragent == "inherit":
            ua = get_useragent()
            self.session.headers.update({"User-Agent": ua})
        elif useragent:
            self.session.headers.update({"User-Agent": useragent})

    def _apply_cache(self, url):
        """Load cached cookies / User-Agent for ``url`` into the session.

        Returns:
            True if ``cookie_cache`` had an entry for ``url``, else False.
        """
        cache_entry = cookie_cache.get(url)
        if cache_entry:
            log.debug(f"[proxy] Using cache for {url}")
            for cookie in cache_entry['cookies']:
                self.session.cookies.set(
                    cookie['name'],
                    cookie['value'],
                    domain=cookie['domain'],
                    path=cookie['path'],
                )
            # A cached User-Agent replaces whatever the session had before.
            if cache_entry['useragent']:
                self.session.headers.update({"User-Agent": cache_entry['useragent']})
        return bool(cache_entry)

    def request(self, method, url, **kwargs):
        """Send the request and return the response if it is not blocked.

        Raises:
            Exception: ``"Requests backend failed"`` when the request raises
                a ``RequestException`` (chained as ``__cause__``) or the
                response is detected as blocked.
        """
        self._apply_cache(url)
        try:
            res = self.session.request(method, url, **kwargs)
            # Kept inside the try: reading ``res.text`` may itself raise a
            # RequestException (e.g. for streamed responses).
            if not self.is_blocked(res):
                return res
        except requests.exceptions.RequestException as e:
            log.warning(f"[proxy] Request failed: {e}.")
            # Chain the cause so the caller's log/traceback shows why.
            raise Exception("Requests backend failed") from e
        log.warning(f"[proxy] requests blocked ({res.status_code}).")
        raise Exception("Requests backend failed")
73+
74+
class CloudscraperBackend(RequestBackend):
    """Backend that sends requests through a ``cloudscraper`` scraper."""

    name = "cloudscraper"

    def __init__(self, proxies=None):
        """Create the scraper and assign ``proxies`` to it when truthy."""
        self.scraper = cloudscraper.create_scraper()
        if proxies:
            self.scraper.proxies = proxies

    def request(self, method, url, **kwargs):
        """Send the request and return the response if it is not blocked.

        Raises:
            Exception: ``"Cloudscraper backend failed"`` when the request
                raises a ``RequestException`` (chained as ``__cause__``) or
                the response is detected as blocked.
        """
        try:
            res = self.scraper.request(method, url, **kwargs)
            # Kept inside the try: reading ``res.text`` may itself raise a
            # RequestException (e.g. for streamed responses).
            if not self.is_blocked(res):
                return res
        except requests.exceptions.RequestException as e:
            log.warning(f"[proxy] Cloudscraper request failed: {e}")
            # Chain the cause so the caller's log/traceback shows why.
            raise Exception("Cloudscraper backend failed") from e
        log.warning(f"[proxy] Cloudscraper blocked ({res.status_code}).")
        raise Exception("Cloudscraper backend failed")
91+
92+
class FlareSolverrBackend(RequestBackend):
    """Backend that delegates the request to a FlareSolverr instance."""

    name = "flaresolverr"

    def __init__(self, proxies=None):
        """Accept ``proxies`` for constructor parity with the other backends.

        Without this parameter, code that builds backends uniformly with
        ``cls(proxies=...)`` raises ``TypeError`` for this class. The value
        is only stored; ``request`` still passes the module-level
        ``PROXY_URL`` to ``flaresolverr_req``.
        """
        self.proxies = proxies

    def request(self, method, url, **kwargs):
        """Send the request through FlareSolverr and return its response.

        ``HEAD`` is rewritten to ``GET``. For ``post`` the payload is taken
        from the ``json`` keyword, falling back to ``data``; other keyword
        arguments are not forwarded.

        Raises:
            Exception: ``"FlareSolverr not detected"`` when
                ``check_flaresolverr`` reports no instance, or
                ``"FlareSolverr backend failed"`` (with the underlying error
                chained) when the FlareSolverr call raises.
        """
        if not check_flaresolverr(FLARESOLVERR_URL):
            raise Exception("FlareSolverr not detected")
        log.info(f"[proxy] trying FlareSolverr for {url}")
        # Normalise case so "HEAD"/"POST" are handled like "head"/"post".
        method = method.lower()
        # HEAD is not supported
        if method == "head":
            method = "get"
            log.warning("[proxy] HEAD not supported by FlareSolverr, using GET instead")
        post_data = (kwargs.get("json") or kwargs.get("data")) if method == "post" else None
        try:
            return flaresolverr_req(url, method=method, postData=post_data, proxy=PROXY_URL)
        except Exception as e:
            log.warning(f"[proxy] FlareSolverr request failed: {e}")
            raise Exception("FlareSolverr backend failed") from e
110+
111+
## END REWRITE
32112

33113
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://localhost:8191/v1")
34114

@@ -48,7 +128,7 @@ def check_flaresolverr(url):
48128

49129
@cache_to_disk(ttl=86400)
def get_useragent() -> str:
    """Fetch the published user-agent list and return its fourth entry.

    The result is wrapped by ``cache_to_disk`` with ``ttl=86400``.
    NOTE(review): index 3 is presumably the Chrome entry of the list (the
    original local was named ``chrome_ua``) -- confirm against the feed.

    Raises:
        requests.exceptions.RequestException: on timeout, connection error
            or a non-2xx response.
    """
    # A timeout keeps a stalled connection from hanging the caller forever.
    res = requests.get(
        "https://feederbox826.github.io/user-agents/user-agents.json",
        timeout=10,
    )
    # Fail clearly on an HTTP error instead of trying to parse an error page.
    res.raise_for_status()
    return res.json()[3]
53133

54134
def flaresolverr_req(url, method="get", postData=None, proxy=None) -> requests.Response:
@@ -107,75 +187,53 @@ def update(self):
107187

108188
cookie_cache = CookieCache()
109189

190+
class BackendManager:
    """Hold an ordered list of request backends and try them in turn."""

    def __init__(self, backends=None, useragent="inherit"):
        """Build the backend chain.

        Args:
            backends: Optional iterable of backend names (``"requests"``,
                ``"cloudscraper"``, ``"flaresolverr"``). Unknown names are
                logged and skipped. When empty or omitted, the default chain
                is requests, then cloudscraper, then FlareSolverr if
                ``check_flaresolverr`` reports an instance.
            useragent: Forwarded to ``RequestsBackend`` in both modes;
                ``"inherit"`` matches the previously hard-coded default.
        """
        proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else {}
        # Each backend is built with the arguments its constructor takes.
        # FlareSolverrBackend takes none, so a uniform ``cls(proxies=...)``
        # call would raise TypeError for it.
        factories = {
            "requests": lambda: RequestsBackend(proxies=proxies, useragent=useragent),
            "cloudscraper": lambda: CloudscraperBackend(proxies=proxies),
            "flaresolverr": FlareSolverrBackend,
        }
        self.backends = []
        if not backends:
            self.backends.append(factories["requests"]())
            self.backends.append(factories["cloudscraper"]())
            if check_flaresolverr(FLARESOLVERR_URL):
                self.backends.append(factories["flaresolverr"]())
            return
        # populate backends based on name
        for name in backends:
            factory = factories.get(name)
            if factory is None:
                log.warning(f"[proxy] Unknown backend specified: {name}")
                continue
            self.backends.append(factory())

    def request(self, method, url, **kwargs):
        """Return the first successful backend response.

        Backends are tried in list order; any exception from one is logged
        at debug level and the next is tried.

        Raises:
            Exception: ``"All backends failed"`` when every backend raised
                (the last error is chained) or no backend is configured.
        """
        last_error = None
        for backend in self.backends:
            try:
                return backend.request(method, url, **kwargs)
            except Exception as e:
                log.debug(f"[proxy] {backend.name} failed: {e}")
                last_error = e
        raise Exception("All backends failed") from last_error


class StashRequests:
    """Small ``get`` / ``post`` / ``head`` facade over a ``BackendManager``."""

    def __init__(self, cloudflare=False, useragent="inherit"):
        """Store the settings and build the backend chain.

        Args:
            cloudflare: Stored on the instance.
            useragent: Stored and forwarded to ``BackendManager`` so a
                caller-supplied User-Agent is actually applied.
        """
        self.proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else {}
        # NOTE(review): ``cloudflare`` is kept for compatibility but nothing
        # in this class or BackendManager reads it -- confirm whether it
        # should influence the backend order.
        self.cloudflare = cloudflare
        self.useragent = useragent
        self.manager = BackendManager(useragent=useragent)

    def get(self, url, **kwargs):
        """Issue a ``get`` request through the backend chain."""
        return self.manager.request("get", url, **kwargs)

    def post(self, url, **kwargs):
        """Issue a ``post`` request through the backend chain."""
        return self.manager.request("post", url, **kwargs)

    def head(self, url, **kwargs):
        """Issue a ``head`` request through the backend chain."""
        return self.manager.request("head", url, **kwargs)
180238

181239
stash_requests = StashRequests()

0 commit comments

Comments
 (0)