Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions surfsense_backend/alembic/versions/160_add_crw_api_enum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Add CRW_API connector enum value

Revision ID: 160
Revises: 159
Create Date: 2026-06-13 00:00:00.000000

"""

from collections.abc import Sequence

from alembic import op

# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the migration
# it revises ("159"). No branch labels or extra dependencies are declared.
revision: str = "160"
down_revision: str | None = "159"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
    """Add the ``CRW_API`` label to the ``searchsourceconnectortype`` enum.

    The ``ALTER TYPE`` statement is wrapped in a PL/pgSQL ``DO`` block that
    first looks the label up in ``pg_enum``, so the value is only added when
    it is not already present.
    """
    add_crw_api_label_sql = """
        DO $$
        BEGIN
            IF NOT EXISTS (
                SELECT 1 FROM pg_type t
                JOIN pg_enum e ON t.oid = e.enumtypid
                WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'CRW_API'
            ) THEN
                ALTER TYPE searchsourceconnectortype ADD VALUE 'CRW_API';
            END IF;
        END
        $$;
        """
    op.execute(add_crw_api_label_sql)


def downgrade() -> None:
    """Do nothing: downgrades are not supported for enum edits."""
    return None
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@

# Maps SearchSourceConnectorType enum values to the searchable document/connector types
# used by pre-search middleware and web_search.
# Live search connectors (TAVILY_API, LINKUP_API, BAIDU_SEARCH_API, CRW_API) are
# routed to the web_search tool; all others are considered local/indexed data.
_CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = {
# Live search connectors (handled by web_search tool)
"TAVILY_API": "TAVILY_API",
"LINKUP_API": "LINKUP_API",
"BAIDU_SEARCH_API": "BAIDU_SEARCH_API",
"CRW_API": "CRW_API",
# Local/indexed connectors (handled by KB pre-search middleware)
"SLACK_CONNECTOR": "SLACK_CONNECTOR",
"TEAMS_CONNECTOR": "TEAMS_CONNECTOR",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"TAVILY_API",
"LINKUP_API",
"BAIDU_SEARCH_API",
"CRW_API",
}

# Patterns that indicate the query has no meaningful search signal.
Expand Down Expand Up @@ -455,6 +456,7 @@ def format_documents_for_context(
"TAVILY_API",
"LINKUP_API",
"BAIDU_SEARCH_API",
"CRW_API",
}

parts: list[str] = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,21 @@
"TAVILY_API",
"LINKUP_API",
"BAIDU_SEARCH_API",
"CRW_API",
}

# Per-connector call specification for the live search connectors, keyed by
# connector type name. Each value is a 4-tuple whose first element is a search
# method name (e.g. "search_crw").
# NOTE(review): the two booleans and the trailing dict look like call flags and
# extra keyword arguments for that method — confirm against the code that
# consumes this table.
_LIVE_CONNECTOR_SPECS: dict[str, tuple[str, bool, bool, dict[str, Any]]] = {
    "TAVILY_API": ("search_tavily", False, True, {}),
    "LINKUP_API": ("search_linkup", False, False, {"mode": "standard"}),
    "BAIDU_SEARCH_API": ("search_baidu", False, True, {}),
    "CRW_API": ("search_crw", False, True, {}),
}

# Short provider names keyed by live search connector type name.
_CONNECTOR_LABELS: dict[str, str] = {
    "TAVILY_API": "Tavily",
    "LINKUP_API": "Linkup",
    "BAIDU_SEARCH_API": "Baidu",
    "CRW_API": "fastCRW",
}


Expand Down
3 changes: 3 additions & 0 deletions surfsense_backend/app/agents/chat/shared/tools/web_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,21 @@
"TAVILY_API",
"LINKUP_API",
"BAIDU_SEARCH_API",
"CRW_API",
}

# Per-connector call specification for the live search connectors, keyed by
# connector type name. Each value is a 4-tuple whose first element is a search
# method name (e.g. "search_crw").
# NOTE(review): the two booleans and the trailing dict look like call flags and
# extra keyword arguments for that method — confirm against the code that
# consumes this table.
_LIVE_CONNECTOR_SPECS: dict[str, tuple[str, bool, bool, dict[str, Any]]] = {
    "TAVILY_API": ("search_tavily", False, True, {}),
    "LINKUP_API": ("search_linkup", False, False, {"mode": "standard"}),
    "BAIDU_SEARCH_API": ("search_baidu", False, True, {}),
    "CRW_API": ("search_crw", False, True, {}),
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# Short provider names keyed by live search connector type name.
_CONNECTOR_LABELS: dict[str, str] = {
    "TAVILY_API": "Tavily",
    "LINKUP_API": "Linkup",
    "BAIDU_SEARCH_API": "Baidu",
    "CRW_API": "fastCRW",
}


Expand Down
1 change: 1 addition & 0 deletions surfsense_backend/app/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ class SearchSourceConnectorType(StrEnum):
SEARXNG_API = "SEARXNG_API"
LINKUP_API = "LINKUP_API"
BAIDU_SEARCH_API = "BAIDU_SEARCH_API" # Baidu AI Search API for Chinese web search
CRW_API = "CRW_API" # fastCRW — Firecrawl-compatible web scraper; self-host or cloud
SLACK_CONNECTOR = "SLACK_CONNECTOR"
TEAMS_CONNECTOR = "TEAMS_CONNECTOR"
ONEDRIVE_CONNECTOR = "ONEDRIVE_CONNECTOR"
Expand Down
1 change: 1 addition & 0 deletions surfsense_backend/app/services/ai_file_sort_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
SearchSourceConnectorType.SEARXNG_API: "SearXNG Search",
SearchSourceConnectorType.LINKUP_API: "Linkup Search",
SearchSourceConnectorType.BAIDU_SEARCH_API: "Baidu Search",
SearchSourceConnectorType.CRW_API: "fastCRW Search",
SearchSourceConnectorType.SLACK_CONNECTOR: "Slack",
SearchSourceConnectorType.TEAMS_CONNECTOR: "Teams",
SearchSourceConnectorType.ONEDRIVE_CONNECTOR: "OneDrive",
Expand Down
180 changes: 180 additions & 0 deletions surfsense_backend/app/services/connector_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,186 @@ async def search_baidu(

return result_object, documents

async def search_crw(
    self,
    user_query: str,
    search_space_id: int,
    top_k: int = 20,
) -> tuple:
    """
    Search using fastCRW and return both sources and documents.

    fastCRW is a Firecrawl-compatible web scraper (single binary; self-host
    or cloud). Results come from the ``POST /v1/search`` endpoint, which
    returns a ``{success, data: [{title, url, description, markdown?}]}``
    envelope.

    Every failure mode (no connector configured, timeout, HTTP error,
    transport error, undecodable or malformed payload, ``success: false``)
    is reported on stdout and yields an empty result instead of raising, so
    a broken provider never aborts the caller.

    Args:
        user_query: User's search query
        search_space_id: Search space ID
        top_k: Maximum number of results to return

    Returns:
        tuple: (sources_info_dict, documents_list)
    """

    def _empty_result() -> tuple:
        """Build a fresh "no results" payload shared by all early exits."""
        return {
            "id": 13,
            "name": "fastCRW Search",
            "type": "CRW_API",
            "sources": [],
        }, []

    # Get CRW connector configuration
    crw_connector = await self.get_connector_by_type(
        SearchSourceConnectorType.CRW_API, search_space_id
    )
    if not crw_connector:
        return _empty_result()

    config = crw_connector.config or {}
    api_key = config.get("CRW_API_KEY")

    # Default to the managed cloud; allow self-host override via CRW_BASE_URL.
    base_url = (config.get("CRW_BASE_URL") or "https://fastcrw.com/api").rstrip("/")
    search_endpoint = f"{base_url}/v1/search"

    # Bearer auth (self-host instances may run without auth → key optional).
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    payload = {
        "query": user_query,
        "limit": top_k,
    }

    # NOTE: TimeoutException is a RequestError subclass, so it must be caught
    # before the generic RequestError handler.
    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            response = await client.post(
                search_endpoint,
                headers=headers,
                json=payload,
            )
            response.raise_for_status()
    except httpx.TimeoutException as exc:
        print(f"ERROR: fastCRW API request timeout after 90s: {exc!r}")
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()
    except httpx.HTTPStatusError as exc:
        print(f"ERROR: fastCRW API HTTP Status Error: {exc.response.status_code}")
        print(f"Response text: {exc.response.text[:500]}")
        print(f"Request URL: {exc.request.url}")
        return _empty_result()
    except httpx.RequestError as exc:
        print(f"ERROR: fastCRW API Request Error: {type(exc).__name__}: {exc!r}")
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()
    except Exception as exc:
        # Last-resort boundary: a search provider failure must not propagate.
        print(
            f"ERROR: Unexpected error calling fastCRW API: {type(exc).__name__}: {exc!r}"
        )
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()

    try:
        data = response.json()
    except ValueError as e:
        print(f"ERROR: Failed to decode JSON response from fastCRW: {e}")
        print(f"Response status: {response.status_code}")
        print(f"Response text: {response.text[:500]}")  # First 500 chars
        return _empty_result()

    # A valid envelope is a JSON object; anything else (list, string, null)
    # would make the ``.get`` calls below raise AttributeError.
    if not isinstance(data, dict):
        print(
            f"WARNING: fastCRW API returned an unexpected payload type: "
            f"{type(data).__name__}"
        )
        return _empty_result()

    # Firecrawl-compatible envelope: failures set success=False and error.
    if data.get("success") is False:
        print(
            f"WARNING: fastCRW API returned error - "
            f"Code: {data.get('error_code')}, Message: {data.get('error')}"
        )
        return _empty_result()

    crw_results = data.get("data")
    if not crw_results:
        return _empty_result()
    if not isinstance(crw_results, list):
        # Iterating a dict or string here would yield keys/characters, not results.
        print(
            f"WARNING: fastCRW API returned non-list data: "
            f"{type(crw_results).__name__}"
        )
        return _empty_result()

    sources_list: list[dict[str, Any]] = []
    documents: list[dict[str, Any]] = []

    # The source ID counter lives on the service instance; hold the lock while
    # IDs are assigned and the counter is incremented.
    async with self.counter_lock:
        for result in crw_results:
            if not isinstance(result, dict):
                # Skip malformed entries rather than failing the whole search.
                continue

            # ``or`` also covers explicit nulls, which ``.get(key, default)``
            # would pass through as None.
            title = result.get("title") or "fastCRW Result"
            url = result.get("url") or ""
            description = result.get("description") or ""
            # Prefer the full markdown when present, fall back to the snippet.
            content = result.get("markdown") or description

            source = {
                "id": self.source_id_counter,
                "title": title,
                "description": description,
                "url": url,
            }
            sources_list.append(source)

            document = {
                "chunk_id": self.source_id_counter,
                "content": content,
                "score": 1.0,  # fastCRW doesn't provide relevance scores
                "document": {
                    "id": self.source_id_counter,
                    "title": title,
                    "document_type": "CRW_API",
                    "metadata": {
                        "url": url,
                        "source": "CRW_API",
                    },
                },
            }
            documents.append(document)
            self.source_id_counter += 1

    result_object = {
        "id": 13,
        "name": "fastCRW Search",
        "type": "CRW_API",
        "sources": sources_list,
    }

    return result_object, documents

async def search_slack(
self,
user_query: str,
Expand Down
9 changes: 9 additions & 0 deletions surfsense_backend/app/utils/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,15 @@ def validate_initial_urls() -> None:
],
"validators": {},
},
"CRW_API": {
# Self-host instances may run without auth, so the key is optional.
"required": [],
"optional": [
"CRW_API_KEY",
"CRW_BASE_URL",
],
"validators": {},
},
Comment thread
coderabbitai[bot] marked this conversation as resolved.
# "SLACK_CONNECTOR": {
# "required": [], # OAuth uses bot_token (encrypted), legacy uses SLACK_BOT_TOKEN
# "optional": [
Expand Down
Loading