@@ -835,6 +835,186 @@ async def search_baidu(
835835
836836 return result_object , documents
837837
async def search_crw(
    self,
    user_query: str,
    search_space_id: int,
    top_k: int = 20,
) -> tuple:
    """
    Search using fastCRW and return both sources and documents.

    fastCRW is a Firecrawl-compatible web scraper (single binary; self-host
    or cloud). Results come from the ``POST /v1/search`` endpoint, which
    returns a ``{success, data: [{title, url, description, markdown?}]}``
    envelope.

    This method never raises for transport, HTTP-status, JSON-decoding or
    payload-shape problems: each of those is reported with ``print`` and
    an empty result is returned instead.

    Args:
        user_query: User's search query
        search_space_id: Search space ID used to look up the CRW connector
        top_k: Maximum number of results to return. Sent to the API as
            ``limit`` and, when positive, also enforced on the response.

    Returns:
        tuple: (sources_info_dict, documents_list). ``sources_info_dict``
        always has the keys ``id``, ``name``, ``type`` and ``sources``;
        ``sources`` and ``documents_list`` are empty when no connector is
        configured, the request fails, or the API reports no results.
    """

    def _empty_result() -> tuple:
        # Build fresh objects on every call so that no two callers ever
        # share (and accidentally mutate) the same list or dict.
        return {
            "id": 13,
            "name": "fastCRW Search",
            "type": "CRW_API",
            "sources": [],
        }, []

    # Get CRW connector configuration
    crw_connector = await self.get_connector_by_type(
        SearchSourceConnectorType.CRW_API, search_space_id
    )
    if not crw_connector:
        return _empty_result()

    config = crw_connector.config or {}
    api_key = config.get("CRW_API_KEY")

    # Default to the managed cloud; allow self-host override via CRW_BASE_URL.
    base_url = (config.get("CRW_BASE_URL") or "https://fastcrw.com/api").rstrip("/")
    search_endpoint = f"{base_url}/v1/search"

    # Bearer auth (self-host instances may run without auth -> key optional).
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    payload = {
        "query": user_query,
        "limit": top_k,
    }

    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            response = await client.post(
                search_endpoint,
                headers=headers,
                json=payload,
            )
            response.raise_for_status()
    except httpx.TimeoutException as exc:
        # Must precede RequestError: TimeoutException is one of its subclasses.
        print(f"ERROR: fastCRW API request timeout after 90s: {exc!r}")
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()
    except httpx.HTTPStatusError as exc:
        print(f"ERROR: fastCRW API HTTP Status Error: {exc.response.status_code}")
        print(f"Response text: {exc.response.text[:500]}")
        print(f"Request URL: {exc.request.url}")
        return _empty_result()
    except httpx.RequestError as exc:
        print(f"ERROR: fastCRW API Request Error: {type(exc).__name__}: {exc!r}")
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()
    except Exception as exc:
        # Deliberate best-effort boundary: a failing search source must not
        # take down the caller, so anything unexpected is reported and dropped.
        print(
            f"ERROR: Unexpected error calling fastCRW API: {type(exc).__name__}: {exc!r}"
        )
        print(f"Endpoint: {search_endpoint}")
        return _empty_result()

    try:
        data = response.json()
    except ValueError as e:
        print(f"ERROR: Failed to decode JSON response from fastCRW: {e}")
        print(f"Response status: {response.status_code}")
        print(f"Response text: {response.text[:500]}")  # First 500 chars
        return _empty_result()

    # Valid JSON is not necessarily an object; without this guard a list,
    # string or null body would raise AttributeError on ``data.get``.
    if not isinstance(data, dict):
        print(
            f"WARNING: fastCRW API returned an unexpected payload type: "
            f"{type(data).__name__}"
        )
        return _empty_result()

    # Firecrawl-compatible envelope: failures set success=False and error.
    if data.get("success") is False:
        print(
            f"WARNING: fastCRW API returned error - "
            f"Code: {data.get('error_code')}, Message: {data.get('error')}"
        )
        return _empty_result()

    crw_results = data.get("data")
    if not crw_results:
        return _empty_result()

    if not isinstance(crw_results, list):
        print(
            f"WARNING: fastCRW API returned an unexpected 'data' type: "
            f"{type(crw_results).__name__}"
        )
        return _empty_result()

    # Ignore malformed entries instead of failing the whole search.
    entries = [entry for entry in crw_results if isinstance(entry, dict)]
    # The server is asked for ``limit`` results, but enforce the cap locally
    # too so the documented ``top_k`` contract holds regardless.
    if top_k > 0:
        entries = entries[:top_k]

    sources_list: list[dict[str, Any]] = []
    documents: list[dict[str, Any]] = []

    # The counter is shared instance state; hold the lock for the whole batch
    # so the ids handed out to these results are unique.
    async with self.counter_lock:
        for entry in entries:
            # ``or`` (rather than a .get default) also covers keys that are
            # present but null in the JSON response.
            title = entry.get("title") or "fastCRW Result"
            url = entry.get("url") or ""
            description = entry.get("description") or ""
            # Prefer the full markdown when present, fall back to the snippet.
            content = entry.get("markdown") or description

            sources_list.append(
                {
                    "id": self.source_id_counter,
                    "title": title,
                    "description": description,
                    "url": url,
                }
            )
            documents.append(
                {
                    "chunk_id": self.source_id_counter,
                    "content": content,
                    "score": 1.0,  # fastCRW doesn't provide relevance scores
                    "document": {
                        "id": self.source_id_counter,
                        "title": title,
                        "document_type": "CRW_API",
                        "metadata": {
                            "url": url,
                            "source": "CRW_API",
                        },
                    },
                }
            )
            self.source_id_counter += 1

    result_object = {
        "id": 13,
        "name": "fastCRW Search",
        "type": "CRW_API",
        "sources": sources_list,
    }

    return result_object, documents
1017+
8381018 async def search_slack (
8391019 self ,
8401020 user_query : str ,
0 commit comments