-
-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathsecurity_utils.py
More file actions
225 lines (177 loc) · 6.99 KB
/
Copy pathsecurity_utils.py
File metadata and controls
225 lines (177 loc) · 6.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""Security helpers for URLs and small API hardening concerns."""
from __future__ import annotations
import ipaddress
import os
import posixpath
import socket
from typing import Iterable, List, Optional, Set
from urllib.parse import urljoin, urlparse
import requests
class UnsafeURL(ValueError):
    """Signal that a user-supplied URL was rejected and should not be fetched.

    Subclasses ``ValueError``, so callers that already handle ``ValueError``
    also handle this exception.
    """
def env_bool(name: str, default: bool = False) -> bool:
    """Interpret the environment variable *name* as a boolean flag.

    Returns *default* when the variable is not set. Otherwise the value is
    stripped and lower-cased, and only ``1``, ``true``, ``yes`` and ``on``
    count as true; anything else (including an empty value) is false.
    """
    raw = os.environ.get(name)
    if raw is not None:
        return raw.strip().lower() in ("1", "true", "yes", "on")
    return default
def parse_csv_env(name: str, *, lower: bool = True) -> Set[str]:
    """Read a comma-separated environment variable into a set of entries.

    Each entry is stripped of surrounding whitespace and empty entries are
    dropped. Entries are lower-cased unless *lower* is false. An unset
    variable yields an empty set.
    """
    pieces = (chunk.strip() for chunk in os.getenv(name, "").split(","))
    return {piece.lower() if lower else piece for piece in pieces if piece}
def _host_matches(host: str, patterns: Iterable[str]) -> bool:
    """Return True when *host* matches at least one entry in *patterns*.

    Host and patterns are compared lower-cased with trailing dots removed.
    A pattern of the form ``*.example.org`` matches any host that ends with
    ``.example.org`` (so not the bare ``example.org``); any other pattern
    must equal the host exactly. Empty patterns are ignored.
    """
    target = host.lower().rstrip(".")

    def _accepts(raw_pattern: str) -> bool:
        rule = raw_pattern.lower().rstrip(".")
        if not rule:
            return False
        # rule[1:] keeps the leading dot, so the suffix must start at a label boundary.
        wildcard_hit = rule.startswith("*.") and target.endswith(rule[1:])
        return wildcard_hit or target == rule

    return any(_accepts(entry) for entry in patterns)
def _is_public_ip(address: str) -> bool:
    """Report whether *address* is an IP that ``ipaddress`` classifies as global.

    Raises ``ValueError`` (from ``ipaddress.ip_address``) when *address* is
    not a valid IPv4 or IPv6 literal.
    """
    return ipaddress.ip_address(address).is_global
def _resolve_host(hostname: str, port: Optional[int]) -> Set[str]:
    """Look up *hostname* and return the distinct address strings it maps to.

    The lookup uses ``socket.getaddrinfo`` restricted to stream sockets. When
    *port* is ``None`` or ``0`` the lookup is made for port 443. Errors raised
    by ``getaddrinfo`` propagate to the caller.
    """
    lookup_port = port if port else 443
    records = socket.getaddrinfo(hostname, lookup_port, type=socket.SOCK_STREAM)
    # Each record is (family, type, proto, canonname, sockaddr); sockaddr[0] is the address.
    return {record[4][0] for record in records if record[4]}
def validate_fetch_url(url: str, *, purpose: str = "fetch") -> str:
    """Validate that a user supplied URL is safe to fetch.

    By default, only public http/https URLs are allowed. Private network targets
    can be enabled for trusted local deployments with ALLOW_PRIVATE_CRAWL_URLS=1
    or per-host with CRAWL_ALLOWED_HOSTS=example.com,*.example.org.

    Args:
        url: Candidate URL; surrounding whitespace is ignored.
        purpose: Short label used as the prefix of error messages.

    Returns:
        The whitespace-stripped URL once it has passed every check.

    Raises:
        UnsafeURL: If the URL is empty, malformed, not http/https, has no
            hostname, has an invalid port, embeds credentials, cannot be
            resolved, or resolves to at least one non-public address (unless
            permitted by the environment overrides described above).
    """
    if not isinstance(url, str) or not url.strip():
        raise UnsafeURL(f"{purpose} URL is required")
    clean_url = url.strip()

    # urlparse() raises a plain ValueError for malformed netlocs (for example
    # unbalanced IPv6 brackets). Convert it so callers that only catch
    # UnsafeURL are not surprised by untrusted input.
    try:
        parsed = urlparse(clean_url)
    except ValueError as exc:
        raise UnsafeURL(f"{purpose} URL is malformed: {exc}") from exc

    if parsed.scheme not in {"http", "https"}:
        raise UnsafeURL(f"{purpose} URL must use http or https")
    if not parsed.hostname:
        raise UnsafeURL(f"{purpose} URL must include a hostname")
    if parsed.username or parsed.password:
        raise UnsafeURL(f"{purpose} URL must not include credentials")

    # Reading .port raises ValueError for non-numeric or out-of-range ports.
    # Check it here so IP-literal and allow-listed hosts are covered as well.
    try:
        port = parsed.port
    except ValueError as exc:
        raise UnsafeURL(f"{purpose} URL has an invalid port: {exc}") from exc

    hostname = parsed.hostname.lower().rstrip(".")
    if not hostname:
        # A hostname made only of dots normalises to nothing.
        raise UnsafeURL(f"{purpose} URL must include a hostname")

    # Explicit operator overrides skip the public-address requirement.
    allowed_hosts = parse_csv_env("CRAWL_ALLOWED_HOSTS")
    if _host_matches(hostname, allowed_hosts):
        return clean_url
    if env_bool("ALLOW_PRIVATE_CRAWL_URLS", default=False):
        return clean_url

    # NOTE(review): the addresses checked below come from a lookup made now;
    # whoever fetches the URL later performs its own lookup.
    try:
        addresses = {hostname} if _looks_like_ip(hostname) else _resolve_host(hostname, port)
    except Exception as exc:
        # Fail closed: any resolution problem means the URL is not fetched.
        raise UnsafeURL(f"Could not resolve {purpose} hostname '{hostname}': {exc}") from exc
    if not addresses:
        raise UnsafeURL(f"Could not resolve {purpose} hostname '{hostname}'")

    unsafe = []
    for address in addresses:
        try:
            if not _is_public_ip(address):
                unsafe.append(address)
        except ValueError:
            # An address that cannot be parsed is treated as unsafe.
            unsafe.append(address)
    if unsafe:
        raise UnsafeURL(
            f"Blocked {purpose} URL '{hostname}' because it resolves to non-public address(es): "
            f"{', '.join(sorted(unsafe))}. Set ALLOW_PRIVATE_CRAWL_URLS=1 or CRAWL_ALLOWED_HOSTS to allow it."
        )
    return clean_url
def _looks_like_ip(hostname: str) -> bool:
    """Tell whether *hostname* parses as a literal IPv4 or IPv6 address."""
    try:
        ipaddress.ip_address(hostname)
    except ValueError:
        return False
    else:
        return True
def same_hostname(url: str, other_url: str) -> bool:
    """Return True when both URLs carry the same normalized hostname.

    Hostnames are compared lower-cased with trailing dots removed. If either
    URL has no hostname the result is False.
    """
    first, second = [
        (urlparse(item).hostname or "").lower().rstrip(".")
        for item in (url, other_url)
    ]
    if not first or not second:
        return False
    return first == second
def safe_join_url(base_url: str, candidate: str, *, purpose: str = "fetch") -> str:
    """Resolve a possibly-relative URL against *base_url* and validate the result.

    Args:
        base_url: URL that *candidate* is resolved against.
        candidate: Absolute or relative URL; ``None``/empty is treated as an
            empty reference and surrounding whitespace is stripped.
        purpose: Short label used as the prefix of error messages.

    Returns:
        The joined URL as returned by ``validate_fetch_url``.

    Raises:
        UnsafeURL: If the URLs cannot be joined because one of them is
            malformed, or if the joined URL fails validation.
    """
    try:
        joined = urljoin(base_url, (candidate or "").strip())
    except ValueError as exc:
        # urljoin() parses both inputs and raises a plain ValueError for
        # malformed netlocs (e.g. unbalanced IPv6 brackets). Report that as
        # UnsafeURL so callers that only catch UnsafeURL handle it.
        raise UnsafeURL(f"{purpose} URL is malformed: {exc}") from exc
    return validate_fetch_url(joined, purpose=purpose)
def fetch_validated_url(
    url: str,
    *,
    purpose: str = "fetch",
    timeout: int = 30,
    max_redirects: int = 5,
    headers: Optional[dict] = None,
) -> requests.Response:
    """Fetch a URL while validating every redirect hop.

    ``requests`` follows redirects automatically by default, which can turn a
    public URL into a private-network fetch after the initial validation. This
    helper disables automatic redirects, validates each Location target, and
    only then continues.

    Caller-supplied ``Authorization``, ``Proxy-Authorization`` and ``Cookie``
    headers are dropped for the rest of the chain once a redirect moves to a
    different hostname, so credentials are not forwarded to another host.

    Args:
        url: URL to fetch; validated before the first request.
        purpose: Short label used as the prefix of error messages.
        timeout: Timeout passed to each ``session.get`` call.
        max_redirects: Maximum number of redirects followed before giving up.
        headers: Optional request headers sent with each hop.

    Returns:
        The first non-redirect response, with ``url`` set to the URL that was
        requested for that hop.

    Raises:
        UnsafeURL: If the initial URL or any redirect target fails validation,
            a redirect has an empty or malformed Location header, or the
            redirect limit is exceeded.

    Exceptions raised by ``requests`` itself are not caught here.
    """
    current_url = validate_fetch_url(url, purpose=purpose)
    # Work on a copy so the caller's mapping is never mutated.
    hop_headers = dict(headers) if headers else headers
    credential_headers = {"authorization", "proxy-authorization", "cookie"}

    # The context manager closes the session (and its connection pool) on
    # every exit path; the original left the session open.
    with requests.Session() as session:
        for _ in range(max_redirects + 1):
            response = session.get(
                current_url,
                timeout=timeout,
                headers=hop_headers,
                allow_redirects=False,
            )
            if not response.is_redirect:
                response.url = current_url
                return response
            location = response.headers.get("Location")
            if not location:
                raise UnsafeURL(f"{purpose} redirect did not include a Location header")
            try:
                next_url = safe_join_url(current_url, location, purpose=f"{purpose} redirect")
            except UnsafeURL:
                raise
            except ValueError as exc:
                # The Location header is untrusted; a value that cannot even
                # be parsed must surface as UnsafeURL like other rejections.
                raise UnsafeURL(f"{purpose} redirect URL is malformed: {exc}") from exc
            if hop_headers and not same_hostname(current_url, next_url):
                hop_headers = {
                    key: value
                    for key, value in hop_headers.items()
                    if str(key).lower() not in credential_headers
                }
            current_url = next_url
    raise UnsafeURL(f"{purpose} exceeded redirect limit ({max_redirects})")
def filter_safe_crawl_urls(
    urls: Iterable[str],
    *,
    source_url: Optional[str] = None,
    purpose: str = "crawl",
    allow_external_hosts: bool = False,
) -> List[str]:
    """Validate and deduplicate URLs before they are passed to the crawler.

    If ``source_url`` is provided, relative URLs are resolved against it. When
    ``allow_external_hosts`` is false, URLs whose host differs from the source
    host are skipped. Skipped URLs are intentionally silent at this layer so
    callers can decide how noisy logs should be.

    Accepted URLs are normalized before deduplication: scheme and netloc are
    lower-cased, the path is run through ``posixpath.normpath`` (keeping a
    trailing slash if the original had one) and the fragment is removed.

    Args:
        urls: Candidate URLs; falsy entries are ignored.
        source_url: Optional base URL for resolving relative candidates and
            for the same-host restriction.
        purpose: Short label passed to the validators for error messages.
        allow_external_hosts: Keep URLs on hosts other than ``source_url``'s.

    Returns:
        Normalized, validated URLs in first-seen order without duplicates.
    """
    safe: List[str] = []
    seen: Set[str] = set()
    for raw in urls:
        if not raw:
            continue
        try:
            candidate = (
                safe_join_url(source_url, raw, purpose=purpose)
                if source_url
                else validate_fetch_url(str(raw), purpose=purpose)
            )
        except ValueError:
            # UnsafeURL is a ValueError subclass. Catching the base class also
            # covers the plain ValueError urllib.parse raises for malformed
            # links, so one bad link is skipped instead of aborting the batch.
            continue

        parsed = urlparse(candidate)
        clean_path = posixpath.normpath(parsed.path or "/")
        # normpath drops a trailing slash; restore it when the original had one.
        if parsed.path.endswith("/") and not clean_path.endswith("/"):
            clean_path += "/"
        normalized = parsed._replace(
            scheme=parsed.scheme.lower(),
            netloc=parsed.netloc.lower(),
            path=clean_path,
            fragment="",
        ).geturl()

        if source_url and not allow_external_hosts and not same_hostname(source_url, normalized):
            continue
        if normalized in seen:
            continue
        seen.add(normalized)
        safe.append(normalized)
    return safe