-
-
Notifications
You must be signed in to change notification settings - Fork 352
Expand file tree
/
Copy path.env.example
More file actions
152 lines (126 loc) · 5.24 KB
/
Copy path.env.example
File metadata and controls
152 lines (126 loc) · 5.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# --- General service settings ---
# NOTE(review): per-variable descriptions added in this file are inferred from the
# variable names and example values; confirm against the configuration loader.
ANYCRAWL_NAME=AnyCrawl
# The port in ANYCRAWL_DOMAIN matches ANYCRAWL_API_PORT; presumably they must be
# changed together — confirm.
ANYCRAWL_DOMAIN=http://localhost:8080
ANYCRAWL_API_PORT=8080
ANYCRAWL_HEADLESS=true
# --- Proxy settings (URLs are empty in this example) ---
ANYCRAWL_PROXY_URL=
ANYCRAWL_PROXY_STEALTH_URL=
ANYCRAWL_PROXY_STEALTH_CREDITS=5
# Timeout for stealth proxy mode (default: 120s)
ANYCRAWL_STEALTH_TIMEOUT_MS=120000
# Timeout for base proxy mode (default: 60s)
ANYCRAWL_BASE_TIMEOUT_MS=60000
# Cloudflare Turnstile solver (stealth proxy mode only)
# The API key is empty here; presumably the solver stays inactive until a key is
# supplied — confirm.
ANYCRAWL_2CAPTCHA_API_KEY=
ANYCRAWL_2CAPTCHA_API_BASE=https://api.2captcha.com
ANYCRAWL_2CAPTCHA_TIMEOUT_MS=60000
ANYCRAWL_2CAPTCHA_MAX_RETRIES=3
ANYCRAWL_PROXY_CONFIG=
ANYCRAWL_KEEP_ALIVE=true
# Backward-compatible alias for older deployments. Prefer ANYCRAWL_KEEP_ALIVE.
ANYCRAWL_KEEPALIVE=true
# Comma-separated list of engine names.
ANYCRAWL_AVAILABLE_ENGINES=playwright,cheerio,puppeteer
# NOTE(review): enabled in this example; presumably this skips TLS certificate
# validation for crawled sites — confirm before using in sensitive environments.
ANYCRAWL_IGNORE_SSL_ERROR=true
# API authentication and credit accounting are both switched off in this example.
ANYCRAWL_API_AUTH_ENABLED=false
ANYCRAWL_API_CREDITS_ENABLED=false
# SQLite database file under the current working directory.
# ${PWD} relies on variable expansion by whatever loads this file — TODO confirm
# that your loader expands it; otherwise use an absolute path.
ANYCRAWL_API_DB_TYPE=sqlite
ANYCRAWL_API_DB_CONNECTION=${PWD}/database.db
# The host name "redis" is presumably the Docker Compose service name; a local
# Redis would need a different host (e.g. localhost) — confirm.
ANYCRAWL_REDIS_URL=redis://redis:6379
# Concurrency and browser pool tuning.
# Do not configure unless you know what you're doing.
# Min and max concurrency are set to the same value (50) in this example.
ANYCRAWL_MAX_CONCURRENCY=50
ANYCRAWL_MIN_CONCURRENCY=50
# 3600 seconds = 1 hour.
ANYCRAWL_BROWSER_IDLE_RETIRE_SECS=3600
# NOTE(review): presumably MAX_PAGES_PER_BROWSER is a lifetime page count and
# MAX_OPEN_PAGES_PER_BROWSER a simultaneous-page limit — confirm.
ANYCRAWL_BROWSER_MAX_PAGES_PER_BROWSER=500
ANYCRAWL_BROWSER_MAX_OPEN_PAGES_PER_BROWSER=20
ANYCRAWL_BROWSER_ISOLATE_CONTEXTS=true
# Browser engine runtime for Playwright/Puppeteer uses CloakBrowser by default.
# Docker images pre-install the binary at build time. For local/self-hosted
# deployments, set a stable cache directory and disable runtime auto-updates.
# ${PWD} relies on variable expansion by whatever loads this file — TODO confirm
# that your loader expands it; otherwise use an absolute path.
CLOAKBROWSER_CACHE_DIR=${PWD}/.cache/cloakbrowser
CLOAKBROWSER_AUTO_UPDATE=false
# Optional: use a pre-installed CloakBrowser/Chromium binary and skip downloads.
# Leave empty to use the cached/downloaded binary.
CLOAKBROWSER_BINARY_PATH=
# User agent sent by the crawler. If not configured, the browser's own user agent
# is presumably used (the original comment was truncated — confirm).
ANYCRAWL_USER_AGENT=AnyCrawl/0.1 (+https://github.com/any4ai/AnyCrawl)
ANYCRAWL_NAME_KEY_VALUE_STORE=AnyCrawl
# Local storage directory under the current working directory (${PWD} must be
# expanded by the loader — TODO confirm).
ANYCRAWL_LOCAL_STORAGE_DIR=${PWD}/storage
# Optional: keep Crawlee runtime queues/session state separate from public local files.
# Use a process/container-local path when multiple workers share ANYCRAWL_LOCAL_STORAGE_DIR.
ANYCRAWL_CRAWLEE_STORAGE_DIR=
# Storage backend: set to "s3" to enable S3 storage (files/screenshots) and page cache.
# Any other value (or empty) uses local/no-op storage depending on feature.
ANYCRAWL_STORAGE=
# S3 connection settings; presumably only read when ANYCRAWL_STORAGE=s3 — confirm.
ANYCRAWL_S3_BUCKET=
ANYCRAWL_S3_REGION=us-east-1
# NOTE(review): the endpoint is presumably only needed for S3-compatible services
# other than AWS — confirm.
ANYCRAWL_S3_ENDPOINT=
# Credentials are intentionally blank in this example file.
ANYCRAWL_S3_ACCESS_KEY=
ANYCRAWL_S3_SECRET_ACCESS_KEY=
# Cache (map cache works with DB; page cache requires ANYCRAWL_STORAGE=s3)
# Set to "false" to disable all caching.
ANYCRAWL_CACHE_ENABLED=true
# Default page cache max age (ms). Default: 2 days (2 * 24 * 60 * 60 * 1000).
ANYCRAWL_CACHE_DEFAULT_MAX_AGE=172800000
# Sitemap/map cache max age (ms). Default: 7 days (7 * 24 * 60 * 60 * 1000).
ANYCRAWL_CACHE_SITEMAP_MAX_AGE=604800000
# Optional: use a separate bucket/prefix for cache objects (defaults to ANYCRAWL_S3_BUCKET).
# - Only affects page cache objects stored in S3 (map cache is stored in DB).
# - Leave empty to use ANYCRAWL_S3_BUCKET.
# - If set, ensure the bucket is reachable with the same S3 endpoint/region/credentials.
ANYCRAWL_S3_CACHE_BUCKET=
# Folder prefix inside the cache bucket. Default: "cache/"
ANYCRAWL_S3_CACHE_PREFIX=cache/
# AI features
# ANYCRAWL_AI_CONFIG_PATH=
# If ANYCRAWL_AI_CONFIG_PATH is set, the AI environment variables below are ignored.
# Default models (general LLM, extraction, embedding).
# Format: <provider>/<model>, e.g. openai/gpt-4o-mini, openrouter/openai/gpt-4o-mini, custom/glm-4.5
# NOTE(review): earlier examples mixed ":" and "/" as the provider separator; "/" is used
# here to match the stated rule and the Atlas Cloud example below — confirm against the parser.
DEFAULT_LLM_MODEL=
DEFAULT_EXTRACT_MODEL=
DEFAULT_EMBEDDING_MODEL=
# Several providers can be configured at once; the provider is chosen from the model name.
# OpenAI: provide your API key here to enable AI features.
# OPENAI_API_KEY=
# OpenRouter
# OPENROUTER_API_KEY=
# OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
# Atlas Cloud (OpenAI-compatible)
# Never commit a real key to this example file; set it in your private .env instead.
ATLASCLOUD_BASE_URL=https://api.atlascloud.ai/v1
ATLASCLOUD_API_KEY=
# Example:
# DEFAULT_LLM_MODEL=atlascloud/deepseek-v3
# DEFAULT_EXTRACT_MODEL=atlascloud/deepseek-v3
# OpenAI-compatible API, the provider name is: "custom"
# CUSTOM_BASE_URL=https://example.com/v1
# CUSTOM_API_KEY=
# OCR (OpenAI-compatible Vision/OCR provider)
# Example:
# ANYCRAWL_VL_REC_SERVER_URL=https://api.siliconflow.cn/v1
# ANYCRAWL_VL_REC_API_KEY=your-key
# Server URL and API key are blank here; presumably OCR stays disabled until both
# are provided — confirm.
ANYCRAWL_VL_REC_SERVER_URL=
ANYCRAWL_VL_REC_API_KEY=
ANYCRAWL_VL_REC_MODEL=PaddlePaddle/PaddleOCR-VL-1.5
ANYCRAWL_VL_REC_PROVIDER_NAME=vlRec
ANYCRAWL_VL_REC_MAX_OUTPUT_TOKENS=2048
# Request timeout in milliseconds (30000 ms = 30 s).
ANYCRAWL_VL_REC_TIMEOUT_MS=30000
# NOTE(review): presumably lower/upper bounds on image size in pixels sent to the
# model — confirm how out-of-range images are handled.
ANYCRAWL_VL_REC_MIN_PIXELS=112896
ANYCRAWL_VL_REC_MAX_PIXELS=1003520
# OCR concurrent image requests per page (default: 5)
ANYCRAWL_OCR_CONCURRENCY=5
# Credits charged per feature. NOTE(review): inferred from the *_CREDITS names — confirm.
ANYCRAWL_EXTRACT_JSON_CREDITS=5
ANYCRAWL_SUMMARY_CREDITS=0
# Use plain digits only: environment values are strings, and numeric separators
# such as "600_000" are not accepted by string-to-number conversion.
# Presumably milliseconds (600000 ms = 600 s, matching the handler timeout below) — confirm.
ANYCRAWL_TEMPLATE_EXECUTION_TIMEOUT=600000
# Request handler timeout in seconds.
ANYCRAWL_REQUEST_HANDLER_TIMEOUT_SECS=600
# Search engines: the default engine and the comma-separated list of enabled engines.
ANYCRAWL_SEARCH_DEFAULT_ENGINE=ac-engine
ANYCRAWL_SEARCH_ENABLED_ENGINES=ac-engine,searxng
# Engine endpoints are blank in this example.
ANYCRAWL_AC_ENGINE_URL=
ANYCRAWL_SEARXNG_URL=
# NOTE(review): 0 presumably disables template caching — confirm.
ANYCRAWL_TEMPLATE_CACHE_TTL_MS=0
# Comments are kept on their own lines: some env-file loaders treat a trailing
# "# ..." as part of the value.
ANYCRAWL_SCHEDULER_ENABLED=true
# Polling interval to detect new tasks (default: 10 seconds)
ANYCRAWL_SCHEDULER_SYNC_INTERVAL_MS=10000
# Scheduled Tasks Limits (disabled by default for open-source)
ANYCRAWL_SCHEDULED_TASKS_LIMIT_ENABLED=false
ANYCRAWL_SCHEDULED_TASKS_LIMIT_FREE=1
ANYCRAWL_SCHEDULED_TASKS_LIMIT_PAID=100
ANYCRAWL_WEBHOOKS_ENABLED=true
ANYCRAWL_WEBHOOKS_QUEUE_CONCURRENCY=10
# Set to true only for testing
ALLOW_LOCAL_WEBHOOKS=false