Skip to content
7 changes: 7 additions & 0 deletions app/reviews/autoreview/checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .invalid_isbn import check_invalid_isbn
from .manual_unapproval import check_manual_unapproval
from .ores_scores import check_ores_scores
from .reference_only_edit import check_reference_only_edit
from .render_errors import check_render_errors
from .superseded_additions import check_superseded_additions
from .user_block import check_user_block
Expand Down Expand Up @@ -67,6 +68,12 @@
"function": check_invalid_isbn,
"priority": 8,
},
{
"id": "reference-only-edit",
"name": "Reference-only edit",
"function": check_reference_only_edit,
"priority": 8.5,
},
{
"id": "superseded-additions",
"name": "Superseded additions",
Expand Down
140 changes: 140 additions & 0 deletions app/reviews/autoreview/checks/reference_only_edit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from __future__ import annotations

import logging

from ..base import CheckResult
from ..context import CheckContext
from ..decision import AutoreviewDecision
from ..utils.wikitext import (
extract_domain_from_url,
extract_urls_from_references,
get_parent_wikitext,
is_reference_only_edit,
)

logger = logging.getLogger(__name__)


def _build_result(
    status: str,
    message: str,
    decision: AutoreviewDecision | None = None,
    should_stop: bool = False,
) -> CheckResult:
    """Build a CheckResult for this check, filling in the constant id/title.

    Optional fields are only passed through when set, so calls without a
    decision behave exactly like a plain CheckResult(...) construction.
    """
    kwargs = {
        "check_id": "reference-only-edit",
        "check_title": "Reference-only edit detection",
        "status": status,
        "message": message,
    }
    if decision is not None:
        kwargs["decision"] = decision
    if should_stop:
        kwargs["should_stop"] = should_stop
    return CheckResult(**kwargs)


def check_reference_only_edit(context: CheckContext) -> CheckResult:
    """Check if revision only adds or modifies references.

    Decision matrix:
      * content outside references changed       -> skip (other checks apply)
      * only reference removals                  -> manual review
      * new/modified refs without external URLs  -> auto-approve
      * new/modified refs, all domains known     -> auto-approve
      * new/modified refs with unused domain(s)  -> manual review
    """
    pending_wikitext = context.revision.get_wikitext()
    parent_wikitext = get_parent_wikitext(context.revision)

    is_ref_only, has_removals, new_or_modified_refs = is_reference_only_edit(
        parent_wikitext, pending_wikitext
    )

    if not is_ref_only:
        return _build_result("skip", "Edit modifies content beyond references.")

    if has_removals and not new_or_modified_refs:
        return _build_result(
            "not_ok",
            "Edit only removes references without adding new ones.",
            decision=AutoreviewDecision(
                status="manual",
                label="Requires manual review",
                reason="Reference-only edits that only remove references require manual review.",
            ),
            should_stop=True,
        )

    if not new_or_modified_refs:
        return _build_result("skip", "No new or modified references detected.")

    urls = extract_urls_from_references(new_or_modified_refs)
    ref_count = len(new_or_modified_refs)

    if not urls:
        logger.info(
            "Auto-approving reference-only edit %s (no URLs in new references)",
            context.revision.revid,
        )
        return _build_result(
            "ok",
            f"Edit only modifies references ({ref_count} reference(s) added/modified).",
            decision=AutoreviewDecision(
                status="approve",
                label="Can be auto-approved",
                reason="Edit only adds or modifies references without external URLs.",
            ),
            should_stop=True,
        )

    # Deduplicate domains in a single pass and query the wiki's external-link
    # usage exactly once per distinct domain.
    checked_domains: set[str] = set()
    new_domains: list[str] = []
    for url in urls:
        domain = extract_domain_from_url(url)
        if not domain or domain in checked_domains:
            continue
        checked_domains.add(domain)
        if not context.client.has_domain_been_used(domain):
            new_domains.append(domain)
            logger.info(
                "Domain %s has not been used before in revision %s",
                domain,
                context.revision.revid,
            )

    if new_domains:
        # Show at most three domains in the user-facing message.
        domain_list = ", ".join(new_domains[:3])
        if len(new_domains) > 3:
            domain_list += "..."
        return _build_result(
            "not_ok",
            f"Edit adds references with new domain(s): {domain_list}",
            decision=AutoreviewDecision(
                status="manual",
                label="Requires manual review",
                reason=f"Reference-only edit contains {len(new_domains)} previously unused domain(s).",
            ),
            should_stop=True,
        )

    logger.info(
        "Auto-approving reference-only edit %s with %s known domain(s)",
        context.revision.revid,
        len(checked_domains),
    )
    return _build_result(
        "ok",
        f"Edit only modifies references ({ref_count} reference(s) with "
        f"{len(checked_domains)} known domain(s)).",
        decision=AutoreviewDecision(
            status="approve",
            label="Can be auto-approved",
            reason="Edit only adds or modifies references with known domains.",
        ),
        should_stop=True,
    )
130 changes: 127 additions & 3 deletions app/reviews/autoreview/utils/wikitext.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def extract_additions(parent_wikitext: str, pending_wikitext: str) -> list[str]:


def get_parent_wikitext(revision: PendingRevision) -> str:
"""Get parent revision wikitext from local database."""
"""Get parent revision wikitext from local database or API."""
cached_parent = getattr(revision, "parent_wikitext", None)
if isinstance(cached_parent, str) and cached_parent:
return cached_parent
Expand All @@ -67,9 +67,133 @@ def get_parent_wikitext(revision: PendingRevision) -> str:
parent_revision = PR.objects.get(page=revision.page, revid=parentid)
return parent_revision.get_wikitext()
except Exception:
logger.warning(
"Parent revision %s not found in local database for revision %s",
logger.info(
"Parent revision %s not in local database, fetching from API for revision %s",
revision.parentid,
revision.revid,
)
try:
import pywikibot

wiki = revision.page.wiki
site = pywikibot.Site(code=wiki.code, fam=wiki.family)
request = site.simple_request(
action="query",
prop="revisions",
revids=str(parentid),
rvprop="content",
rvslots="main",
formatversion=2,
)
response = request.submit()
pages = response.get("query", {}).get("pages", [])

if pages and len(pages) > 0:
revisions = pages[0].get("revisions", [])
if revisions and len(revisions) > 0:
slots = revisions[0].get("slots", {})
main_slot = slots.get("main", {})
content = main_slot.get("content", "")
if content:
logger.info("Fetched parent revision %s from API", parentid)
return content

logger.warning("Could not fetch parent revision %s from API", parentid)
return ""
except Exception as e:
logger.exception("Error fetching parent revision %s from API: %s", parentid, e)
return ""


def extract_references(text: str) -> list[str]:
    """Extract all ``<ref>`` tags (paired and self-closing) from wikitext.

    The self-closing alternative is tried first on purpose: if the
    paired-tag alternative came first, its attribute part ``[^>]*`` could
    swallow the ``/`` of a self-closing ``<ref .../>`` and then pair that
    tag with the ``</ref>`` of a *later* reference, merging two refs (and
    any text between them) into a single match.
    """
    if not text:
        return []

    # Self-closing form first, then the paired form with a lazy body.
    ref_pattern = r"<ref(?:\s+[^>]*)?/>|<ref(?:\s+[^>]*)?>.*?</ref>"

    return [
        match.group(0)
        for match in re.finditer(ref_pattern, text, re.IGNORECASE | re.DOTALL)
    ]


def strip_references(text: str) -> str:
    """Remove all ``<ref>`` tags (paired and self-closing) from wikitext.

    The self-closing alternative is tried first so a self-closing
    ``<ref .../>`` cannot be mis-paired with a later ``</ref>`` (which
    would also delete the text between the two references).
    """
    if not text:
        return ""

    # Must stay in sync with extract_references(): is_reference_only_edit()
    # diffs the outputs of both helpers.
    ref_pattern = r"<ref(?:\s+[^>]*)?/>|<ref(?:\s+[^>]*)?>.*?</ref>"

    return re.sub(ref_pattern, "", text, flags=re.IGNORECASE | re.DOTALL)


def is_reference_only_edit(
    parent_wikitext: str, pending_wikitext: str
) -> tuple[bool, bool, list[str]]:
    """Check if edit only modifies references without changing other content.

    Returns:
        tuple: (is_reference_only, has_removals, added_or_modified_refs)
        - is_reference_only: True if only references changed
        - has_removals: True if any references were removed
        - added_or_modified_refs: List of new/modified reference content
    """
    if not pending_wikitext:
        return False, False, []

    def _normalized_body(text: str) -> str:
        # Collapse whitespace so pure formatting noise does not count as a
        # content change once references are stripped out.
        return re.sub(r"\s+", " ", strip_references(text)).strip()

    # A reference-only edit must leave the non-reference content untouched.
    if _normalized_body(parent_wikitext or "") != _normalized_body(pending_wikitext):
        return False, False, []

    old_refs = set(extract_references(parent_wikitext or ""))
    new_refs = set(extract_references(pending_wikitext))

    # Neither side has any references: nothing reference-related happened.
    if not old_refs and not new_refs:
        return False, False, []

    removed = old_refs - new_refs
    added = new_refs - old_refs

    # References exist but are identical on both sides.
    if not added and not removed:
        return False, False, []

    return True, bool(removed), list(added)


def extract_urls_from_references(references: list[str]) -> list[str]:
    """Extract all URLs from reference tags."""
    # http(s) URL up to whitespace/markup delimiters, optionally followed by
    # a parenthesized segment (common in e.g. Wikipedia-style links).
    pattern = re.compile(
        r'https?://[^\s\]<>"\'\|\{\}]+(?:\([^\s\)]*\))?', re.IGNORECASE
    )

    found: list[str] = []
    for reference in references:
        # Trim trailing punctuation that typically belongs to the sentence,
        # not the URL itself.
        found.extend(
            match.group(0).rstrip(".,;:!?}")
            for match in pattern.finditer(reference)
        )
    return found


def extract_domain_from_url(url: str) -> str | None:
"""Extract domain from URL without protocol, path, or query string."""
from urllib.parse import urlparse

try:
parsed = urlparse(url)
domain = parsed.netloc.lower()

if domain.startswith("www."):
domain = domain[4:]

return domain if domain else None
except Exception:
logger.warning("Failed to parse URL: %s", url)
return None
16 changes: 16 additions & 0 deletions app/reviews/services/wiki_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,3 +357,19 @@ def fetch_review_statistics(self, days: int = 30) -> dict:
"""
stats_client = StatisticsClient(wiki=self.wiki, site=self.site)
return stats_client.fetch_all_statistics(days=days, clear_existing=True)

def has_domain_been_used(self, domain: str) -> bool:
    """Check if domain has been used in Wikipedia articles (namespace=0).

    The pending revision under review may itself already be indexed by the
    external-link table, so a single hit could be a self-reference to the
    very edit being checked. We therefore request up to two hits and only
    treat the domain as previously used when at least two exist.

    Returns:
        True when the domain appears at least twice in article space;
        False for an empty domain or when the lookup fails (the caller
        then falls back to manual review rather than auto-approval).
    """
    if not domain:
        return False

    try:
        # total=2 mitigates self-matching: one of the hits may be this
        # pending revision's own link.
        ext_url_usage = self.site.exturlusage(url=domain, namespaces=[0], total=2)
        hits = sum(1 for _ in ext_url_usage)
        return hits >= 2
    except Exception:
        logger.exception("Failed to check domain usage for: %s", domain)
        return False
Loading