vasu-devs · DeryFerd · May 6, 2026
@@ -146,6 +146,7 @@ def _init_sql():
         ("learning_delta", "INTEGER DEFAULT 0"),
         ("learning_reason", "TEXT DEFAULT ''"),
         ("resume_version", "INTEGER DEFAULT 0"),
+        ("url_hash", "TEXT DEFAULT ''"),
     ]:
         try:
             c.execute(f"ALTER TABLE leads ADD COLUMN {col} {definition}")
@@ -168,7 +169,8 @@ def _init_sql():
     "signal_reason,signal_tags,outreach_reply,outreach_dm,source_meta,feedback,"
     "feedback_note,followup_due_at,last_contacted_at,outreach_email,proposal_draft,"
     "fit_bullets,followup_sequence,proof_snippet,tech_stack,location,urgency,"
-    "base_signal_score,learning_delta,learning_reason,created_at,resume_version"
+    "base_signal_score,learning_delta,learning_reason,created_at,resume_version,"
+    "url_hash"
 )
 
 
@@ -189,6 +191,42 @@ def url_exists(jid: str) -> bool:
     return r is not None
 
 
+def _url_hash(url: str) -> str:
+    """Normalized SHA-256 hash of a URL for dedup comparison.
+
+    Strips trailing slashes, query params order, fragments, and whitespace
+    so that minor URL variations resolve to the same hash.
+    """
+    import hashlib
+    from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
+
+    if not url or not url.strip():
+        return ""
+    parsed = urlparse(url.strip())
+    # Sort query params for normalization
+    sorted_qs = urlencode(sorted(parse_qs(parsed.query, keep_blank_values=True).items()), doseq=True)
+    normalized = urlunparse((
+        parsed.scheme.lower(),
+        parsed.netloc.lower(),
+        parsed.path.rstrip("/"),
+        parsed.params,
+        sorted_qs,
+        "",  # drop fragment
+    ))
+    return hashlib.sha256(normalized.encode()).hexdigest()[:32]
+
+
+def url_exists_by_url(url: str) -> bool:
+    """Check if a lead with the same URL already exists (by normalized hash)."""
+    h = _url_hash(url)
+    if not h:
+        return False
+    c = _sq.connect(sql)
+    r = c.execute("SELECT 1 FROM leads WHERE url_hash=?", (h,)).fetchone()
+    c.close()
+    return r is not None
+
+
 def save_lead(
     jid: str,
     t: str,
@@ -255,8 +293,8 @@ def save_lead(
             signal_score,signal_reason,signal_tags,outreach_reply,outreach_dm,
             outreach_email,proposal_draft,fit_bullets,followup_sequence,
             proof_snippet,tech_stack,location,urgency,base_signal_score,
-            learning_delta,learning_reason,source_meta
-        ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
+            learning_delta,learning_reason,source_meta,url_hash
+        ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
         """,
         (
             jid, t, co, u, plat, desc, lead.get("kind") or "job", lead.get("budget") or "",
@@ -273,6 +311,7 @@ def save_lead(
             int(lead.get("learning_delta") or 0),
             str(lead.get("learning_reason") or "")[:700],
             json.dumps(lead.get("source_meta") or {}, ensure_ascii=False),
+            _url_hash(u),
         ),
     )
     c.commit()
@@ -437,6 +476,7 @@ def _lead_row_dict(r) -> dict:
         "learning_reason": r[36] or "",
         "created_at": r[37] or "",
         "resume_version": r[38] or 0,
+        "url_hash": r[39] or "",
     }
 
 
@@ -1559,3 +1599,29 @@ def _add_project_vec(pid: str, title: str, stack: str, impact: str):
                 vec.create_table("projects", data=rows)
     except Exception:
         pass
+
+
+def backfill_url_hashes(limit: int = 5000) -> int:
+    """Compute and store url_hash for leads that don't have one yet.
+
+    Returns the number of leads updated.
+    """
+    c = _sq.connect(sql)
+    rows = c.execute(
+        f"SELECT job_id, url FROM leads "
+        "WHERE COALESCE(url_hash, '') = '' AND COALESCE(url, '') != '' "
+        "ORDER BY created_at DESC LIMIT ?",
+        (max(1, min(int(limit or 5000), 10000)),),
+    ).fetchall()
+
+    updated = 0
+    for row in rows:
+        jid, url = row
+        h = _url_hash(url)
+        if h:
+            c.execute("UPDATE leads SET url_hash=? WHERE job_id=?", (h, jid))
+            updated += 1
+
+    c.commit()
+    c.close()
+    return updated
@@ -1000,5 +1000,38 @@ def test_red_flag_lead_is_rejected(self):
         self.assertIn("red flags", quality["reason"])
 
 
+class TestUrlHashDedup(unittest.TestCase):
+    """Verify that _url_hash normalizes URLs so minor variations resolve
+    to the same hash, enabling dedup across different scrapers."""
+
+    def _hash(self, url: str) -> str:
+        from db.client import _url_hash
+        return _url_hash(url)
+
+    def test_trailing_slash_normalized(self):
+        self.assertEqual(self._hash("https://jobs.example.com/posting"),
+                         self._hash("https://jobs.example.com/posting/"))
+
+    def test_fragment_stripped(self):
+        self.assertEqual(self._hash("https://jobs.example.com/posting"),
+                         self._hash("https://jobs.example.com/posting#section"))
+
+    def test_case_insensitive_scheme_and_host(self):
+        self.assertEqual(self._hash("https://Jobs.Example.COM/posting"),
+                         self._hash("https://jobs.example.com/posting"))
+
+    def test_query_param_order_normalized(self):
+        self.assertEqual(self._hash("https://jobs.example.com/search?q=python&loc=remote"),
+                         self._hash("https://jobs.example.com/search?loc=remote&q=python"))
+
+    def test_empty_url_returns_empty_hash(self):
+        self.assertEqual(self._hash(""), "")
+        self.assertEqual(self._hash("   "), "")
+
+    def test_different_paths_are_different(self):
+        self.assertNotEqual(self._hash("https://jobs.example.com/posting-a"),
+                            self._hash("https://jobs.example.com/posting-b"))
+
+
 if __name__ == "__main__":
     unittest.main()