Skip to content

Commit 5375bf4

Browse files
committed
feat: add fastCRW search connector
1 parent 3e53931 commit 5375bf4

10 files changed

Lines changed: 431 additions & 2 deletions

File tree

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""Add CRW_API connector enum value
2+
3+
Revision ID: 160
4+
Revises: 159
5+
Create Date: 2026-06-13 00:00:00.000000
6+
7+
"""
8+
9+
from collections.abc import Sequence
10+
11+
from alembic import op
12+
13+
# revision identifiers, used by Alembic.
14+
revision: str = "160"
15+
down_revision: str | None = "159"
16+
branch_labels: str | Sequence[str] | None = None
17+
depends_on: str | Sequence[str] | None = None
18+
19+
20+
def upgrade() -> None:
    """Add the CRW_API label to the searchsourceconnectortype enum.

    The statement is idempotent: ``ADD VALUE IF NOT EXISTS`` makes PostgreSQL
    skip the change when the label is already present, so re-running the
    migration is safe. Using the native clause (instead of a manual lookup in
    ``pg_type``/``pg_enum`` by type name) lets PostgreSQL resolve the enum via
    the regular search path, so a same-named type in another schema cannot
    cause the addition to be skipped by mistake.
    """
    op.execute(
        "ALTER TYPE searchsourceconnectortype ADD VALUE IF NOT EXISTS 'CRW_API'"
    )
37+
38+
39+
def downgrade() -> None:
    """Do nothing: this migration does not undo the enum edit on downgrade."""
    return None

surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/runtime/connector_searchable_types.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,14 @@
1717

1818
# Maps SearchSourceConnectorType enum values to the searchable document/connector types
1919
# used by pre-search middleware and web_search.
20-
# Live search connectors (TAVILY_API, LINKUP_API, BAIDU_SEARCH_API) are routed to
21-
# the web_search tool; all others are considered local/indexed data.
20+
# Live search connectors (TAVILY_API, LINKUP_API, BAIDU_SEARCH_API, CRW_API) are
21+
# routed to the web_search tool; all others are considered local/indexed data.
2222
_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = {
2323
# Live search connectors (handled by web_search tool)
2424
"TAVILY_API": "TAVILY_API",
2525
"LINKUP_API": "LINKUP_API",
2626
"BAIDU_SEARCH_API": "BAIDU_SEARCH_API",
27+
"CRW_API": "CRW_API",
2728
# Local/indexed connectors (handled by KB pre-search middleware)
2829
"SLACK_CONNECTOR": "SLACK_CONNECTOR",
2930
"TEAMS_CONNECTOR": "TEAMS_CONNECTOR",

surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"TAVILY_API",
2828
"LINKUP_API",
2929
"BAIDU_SEARCH_API",
30+
"CRW_API",
3031
}
3132

3233
# Patterns that indicate the query has no meaningful search signal.
@@ -455,6 +456,7 @@ def format_documents_for_context(
455456
"TAVILY_API",
456457
"LINKUP_API",
457458
"BAIDU_SEARCH_API",
459+
"CRW_API",
458460
}
459461

460462
parts: list[str] = []

surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/web_search.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,21 @@
1616
"TAVILY_API",
1717
"LINKUP_API",
1818
"BAIDU_SEARCH_API",
19+
"CRW_API",
1920
}
2021

2122
_LIVE_CONNECTOR_SPECS: dict[str, tuple[str, bool, bool, dict[str, Any]]] = {
2223
"TAVILY_API": ("search_tavily", False, True, {}),
2324
"LINKUP_API": ("search_linkup", False, False, {"mode": "standard"}),
2425
"BAIDU_SEARCH_API": ("search_baidu", False, True, {}),
26+
"CRW_API": ("search_crw", False, True, {}),
2527
}
2628

2729
_CONNECTOR_LABELS: dict[str, str] = {
2830
"TAVILY_API": "Tavily",
2931
"LINKUP_API": "Linkup",
3032
"BAIDU_SEARCH_API": "Baidu",
33+
"CRW_API": "fastCRW",
3134
}
3235

3336

surfsense_backend/app/agents/chat/shared/tools/web_search.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,21 @@
2222
"TAVILY_API",
2323
"LINKUP_API",
2424
"BAIDU_SEARCH_API",
25+
"CRW_API",
2526
}
2627

2728
_LIVE_CONNECTOR_SPECS: dict[str, tuple[str, bool, bool, dict[str, Any]]] = {
2829
"TAVILY_API": ("search_tavily", False, True, {}),
2930
"LINKUP_API": ("search_linkup", False, False, {"mode": "standard"}),
3031
"BAIDU_SEARCH_API": ("search_baidu", False, True, {}),
32+
"CRW_API": ("search_crw", False, True, {}),
3133
}
3234

3335
_CONNECTOR_LABELS: dict[str, str] = {
3436
"TAVILY_API": "Tavily",
3537
"LINKUP_API": "Linkup",
3638
"BAIDU_SEARCH_API": "Baidu",
39+
"CRW_API": "fastCRW",
3740
}
3841

3942

surfsense_backend/app/db.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ class SearchSourceConnectorType(StrEnum):
8585
SEARXNG_API = "SEARXNG_API"
8686
LINKUP_API = "LINKUP_API"
8787
BAIDU_SEARCH_API = "BAIDU_SEARCH_API" # Baidu AI Search API for Chinese web search
88+
CRW_API = "CRW_API" # fastCRW — Firecrawl-compatible web scraper; self-host or cloud
8889
SLACK_CONNECTOR = "SLACK_CONNECTOR"
8990
TEAMS_CONNECTOR = "TEAMS_CONNECTOR"
9091
ONEDRIVE_CONNECTOR = "ONEDRIVE_CONNECTOR"

surfsense_backend/app/services/ai_file_sort_service.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
SearchSourceConnectorType.SEARXNG_API: "SearXNG Search",
6262
SearchSourceConnectorType.LINKUP_API: "Linkup Search",
6363
SearchSourceConnectorType.BAIDU_SEARCH_API: "Baidu Search",
64+
SearchSourceConnectorType.CRW_API: "fastCRW Search",
6465
SearchSourceConnectorType.SLACK_CONNECTOR: "Slack",
6566
SearchSourceConnectorType.TEAMS_CONNECTOR: "Teams",
6667
SearchSourceConnectorType.ONEDRIVE_CONNECTOR: "OneDrive",

surfsense_backend/app/services/connector_service.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -835,6 +835,186 @@ async def search_baidu(
835835

836836
return result_object, documents
837837

838+
async def search_crw(
    self,
    user_query: str,
    search_space_id: int,
    top_k: int = 20,
) -> tuple:
    """
    Search using fastCRW and return both sources and documents.

    fastCRW is a Firecrawl-compatible web scraper (single binary; self-host
    or cloud). Results come from the ``POST /v1/search`` endpoint, which
    returns a ``{success, data: [{title, url, description, markdown?}]}``
    envelope.

    Every failure mode (no connector configured, transport/HTTP error,
    undecodable or unexpectedly shaped response, API-reported failure, no
    results) is logged and reported as an empty result rather than raised,
    so a broken connector never aborts the surrounding search.

    Args:
        user_query: User's search query
        search_space_id: Search space ID
        top_k: Maximum number of results to return. Sent to the API as
            ``limit`` and, when positive, also enforced on the response.

    Returns:
        tuple: (sources_info_dict, documents_list)
    """

    def _empty_result() -> tuple:
        # Build fresh objects on every call so callers may mutate them freely.
        return {
            "id": 13,
            "name": "fastCRW Search",
            "type": "CRW_API",
            "sources": [],
        }, []

    # Get CRW connector configuration
    crw_connector = await self.get_connector_by_type(
        SearchSourceConnectorType.CRW_API, search_space_id
    )
    if not crw_connector:
        return _empty_result()

    config = crw_connector.config or {}
    api_key = config.get("CRW_API_KEY")

    # Default to the managed cloud; allow self-host override via CRW_BASE_URL.
    base_url = (config.get("CRW_BASE_URL") or "https://fastcrw.com/api").rstrip("/")
    search_endpoint = f"{base_url}/v1/search"

    # Bearer auth (self-host instances may run without auth -> key optional).
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    payload = {
        "query": user_query,
        "limit": top_k,
    }

    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            response = await client.post(
                search_endpoint,
                headers=headers,
                json=payload,
            )
            response.raise_for_status()
    except httpx.TimeoutException as exc:
        # Must precede RequestError: timeouts are a subclass of it.
        print(f"ERROR: fastCRW API request timeout after 90s: {exc!r}")
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()
    except httpx.HTTPStatusError as exc:
        print(f"ERROR: fastCRW API HTTP Status Error: {exc.response.status_code}")
        print(f"Response text: {exc.response.text[:500]}")
        print(f"Request URL: {exc.request.url}")
        return _empty_result()
    except httpx.RequestError as exc:
        print(f"ERROR: fastCRW API Request Error: {type(exc).__name__}: {exc!r}")
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()
    except Exception as exc:
        # Deliberate catch-all boundary: a failing connector must degrade to
        # "no results" instead of breaking the whole search.
        print(
            f"ERROR: Unexpected error calling fastCRW API: {type(exc).__name__}: {exc!r}"
        )
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()

    try:
        data = response.json()
    except ValueError as e:
        print(f"ERROR: Failed to decode JSON response from fastCRW: {e}")
        print(f"Response status: {response.status_code}")
        print(f"Response text: {response.text[:500]}")  # First 500 chars
        return _empty_result()

    # Valid JSON is not necessarily an object; guard before calling .get().
    if not isinstance(data, dict):
        print(
            f"ERROR: Unexpected fastCRW response shape: "
            f"expected object, got {type(data).__name__}"
        )
        return _empty_result()

    # Firecrawl-compatible envelope: failures set success=False and error.
    if data.get("success") is False:
        print(
            f"WARNING: fastCRW API returned error - "
            f"Code: {data.get('error_code')}, Message: {data.get('error')}"
        )
        return _empty_result()

    raw_results = data.get("data") or []
    if not isinstance(raw_results, list):
        print(
            f"WARNING: fastCRW API returned unexpected 'data' payload of type "
            f"{type(raw_results).__name__}"
        )
        return _empty_result()

    # Ignore malformed entries and honour top_k even if the server does not.
    crw_results = [item for item in raw_results if isinstance(item, dict)]
    if top_k > 0:
        crw_results = crw_results[:top_k]

    if not crw_results:
        return _empty_result()

    sources_list: list[dict[str, Any]] = []
    documents: list[dict[str, Any]] = []

    # The counter is shared across searches; hold the lock while assigning
    # ids so concurrent searches never hand out the same id.
    async with self.counter_lock:
        for result in crw_results:
            # `or` (not a .get default) so explicit JSON nulls fall back too.
            title = result.get("title") or "fastCRW Result"
            url = result.get("url") or ""
            description = result.get("description") or ""
            # Prefer the full markdown when present, fall back to the snippet.
            content = result.get("markdown") or description

            source = {
                "id": self.source_id_counter,
                "title": title,
                "description": description,
                "url": url,
            }
            sources_list.append(source)

            document = {
                "chunk_id": self.source_id_counter,
                "content": content,
                "score": 1.0,  # fastCRW doesn't provide relevance scores
                "document": {
                    "id": self.source_id_counter,
                    "title": title,
                    "document_type": "CRW_API",
                    "metadata": {
                        "url": url,
                        "source": "CRW_API",
                    },
                },
            }
            documents.append(document)
            self.source_id_counter += 1

    result_object = {
        "id": 13,
        "name": "fastCRW Search",
        "type": "CRW_API",
        "sources": sources_list,
    }

    return result_object, documents
1017+
8381018
async def search_slack(
8391019
self,
8401020
user_query: str,

surfsense_backend/app/utils/validators.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,15 @@ def validate_initial_urls() -> None:
513513
],
514514
"validators": {},
515515
},
516+
"CRW_API": {
517+
# Self-host instances may run without auth, so the key is optional.
518+
"required": [],
519+
"optional": [
520+
"CRW_API_KEY",
521+
"CRW_BASE_URL",
522+
],
523+
"validators": {},
524+
},
516525
# "SLACK_CONNECTOR": {
517526
# "required": [], # OAuth uses bot_token (encrypted), legacy uses SLACK_BOT_TOKEN
518527
# "optional": [

0 commit comments

Comments
 (0)