Merge pull request #1303 from PyThaiNLP/copilot/add-attaparse-dependency-parser

bact · web-flow · commit fa4db9a13613 · 2026-03-03T15:59:37.000+07:00
Add Attaparse engine to dependency parser
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,6 +38,8 @@ See PR for prompt and details.
 - Reorganize noauto test suite by dependency groups
   (torch, tensorflow, onnx, cython, network) #1290
 - Add BLEU, ROUGE, WER, and CER metrics to pythainlp.benchmarks #1295
+- Add Attaparse engine to dependency parser
+  (`dependency_parsing`, engine="attaparse") #1303
 - Improved documentation; code cleanup; more tests
 
 ## Version 5.1.2 -> 5.2.0
diff --git a/pyproject.toml b/pyproject.toml
@@ -92,13 +92,16 @@ abbreviation = ["khamyo>=0.2.0"]
 
 attacut = ["attacut>=1.0.6"]
 
+attaparse = ["attaparse>=1.0.0"]
+
 benchmarks = ["numpy>=1.22", "pandas>=0.24", "PyYAML>=5.4.1"]
 
 budoux = ["budoux>=0.7.0"]
 
 coreference_resolution = ["fastcoref>=2.1.5", "spacy>=3.0"]
 
 dependency_parsing = [
+    "attaparse>=1.0.0",
     "spacy_thai>=0.7.1",
     "transformers>=4.22.1",
     "ufal.chu-liu-edmonds>=1.0.2",
@@ -237,6 +240,7 @@ noauto-network = [
 
 # Full dependencies - pinned where available
 full = [
+    "attaparse==1.0.0",
     "attacut==1.0.6",
     "bpemb>=0.3.6,<0.4",
     "budoux==0.7.0",
@@ -441,6 +445,7 @@ disallow_incomplete_defs = true
 [[tool.mypy.overrides]]
 module = [
     "attacut.*",
+    "attaparse.*",
     "bpemb.*",
     "budoux.*",
     "deepcut.*",
@@ -472,6 +477,7 @@ module = [
     "spacy.*",
     "spacy_thai.*",
     "ssg.*",
+    "stanza.*",
     "symspellpy.*",
     "thai_nner.*",
     "tltk.*",
diff --git a/pythainlp/parse/attaparse_engine.py b/pythainlp/parse/attaparse_engine.py
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""Attaparse: Thai dependency parser based on Stanza and PhayaThaiBERT.
+
+GitHub: https://github.com/nlp-chula/attaparse
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, List, Union
+
+try:
+    from attaparse import depparse, load_model
+except ImportError as err:
+    # Chain the original error so that the real cause (for example a
+    # missing or broken dependency of attaparse itself) stays visible in
+    # the traceback instead of being reported as an unrelated failure.
+    raise ImportError(
+        "Import Error; Install attaparse by pip install attaparse"
+    ) from err
+
+if TYPE_CHECKING:
+    from stanza import Pipeline
+
+
+class Parse:
+    """Callable dependency parser backed by Attaparse.
+
+    The model returned by ``load_model()`` is created once in the
+    constructor and reused for every call.
+    """
+
+    def __init__(self) -> None:
+        """Load the Attaparse model and keep it on the instance."""
+        self.nlp: Pipeline = load_model()
+
+    def __call__(
+        self, text: str, tag: str = "str"
+    ) -> Union[List[List[str]], str]:
+        """Parse ``text`` and return one ten-field row per word.
+
+        Fields follow the CoNLL-U column order: ID, FORM, LEMMA, UPOS,
+        XPOS, FEATS, HEAD, DEPREL, DEPS, MISC. Empty or missing LEMMA,
+        UPOS, XPOS, FEATS and DEPREL values are written as ``"_"``.
+
+        :param str text: text to parse
+        :param str tag: ``"list"`` returns the rows as a list of lists;
+            any other value (default ``"str"``) returns the rows joined
+            by tabs within a row and by newlines between rows
+        :return: the parsed rows, as a list or as a single string
+        :rtype: Union[List[List[str]], str]
+        """
+        parsed = depparse(text, self.nlp)
+        # NOTE(review): words of all sentences go into one flat table with
+        # no marker between sentences -- confirm this is intended when the
+        # parser splits the input into several sentences.
+        table = [
+            [
+                str(token.id),
+                token.text,
+                token.lemma or "_",
+                token.upos or "_",
+                token.xpos or "_",
+                token.feats or "_",
+                str(token.head),
+                token.deprel or "_",
+                "_",  # DEPS (enhanced dependencies, not provided)
+                "SpaceAfter=No",  # MISC: Thai text has no inter-word spaces
+            ]
+            for sentence in parsed.sentences
+            for token in sentence.words
+        ]
+        if tag == "list":
+            return table
+        return "\n".join("\t".join(fields) for fields in table)
diff --git a/pythainlp/parse/core.py b/pythainlp/parse/core.py
@@ -36,6 +36,8 @@ def dependency_parsing(
             `GitHub <https://github.com/KoichiYasuoka/>`_
         * *ud_goeswith* - POS tagging and dependency parsing \
             using `goeswith` for subwords
+        * *attaparse* - Thai dependency parser using Stanza and PhayaThaiBERT. \
+            `GitHub <https://github.com/nlp-chula/attaparse>`_
 
     **Options for model (esupar engine)**
         * *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \
@@ -121,6 +123,10 @@ def dependency_parsing(
                 if model
                 else "KoichiYasuoka/deberta-base-thai-ud-goeswith"
             )
+        elif engine == "attaparse":
+            from pythainlp.parse.attaparse_engine import Parse  # type: ignore[assignment]  # noqa: I001
+
+            _tagger = Parse()
         else:
             raise NotImplementedError("The engine doesn't support.")
 
diff --git a/tests/extra/testx_parse.py b/tests/extra/testx_parse.py
@@ -19,3 +19,7 @@ def test_dependency_parsing(self):
         # self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai", tag="list"))
         # self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="ud_goeswith"))
         # self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="ud_goeswith", tag="list"))
+        self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="attaparse"))
+        self.assertIsNotNone(
+            dependency_parsing("ผมเป็นคนดี", engine="attaparse", tag="list")
+        )