Skip to content

Commit fa4db9a

Browse files
authored
Merge pull request #1303 from PyThaiNLP/copilot/add-attaparse-dependency-parser
Add Attaparse engine to dependency parser
2 parents b008610 + 3d0ae9a commit fa4db9a

File tree

5 files changed

+68
-0
lines changed

5 files changed

+68
-0
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ See PR for prompt and details.
3838
- Reorganize noauto test suite by dependency groups
3939
(torch, tensorflow, onnx, cython, network) #1290
4040
- Add BLEU, ROUGE, WER, and CER metrics to pythainlp.benchmarks #1295
41+
- Add Attaparse engine to dependency parser
42+
(`dependency_parsing`, engine="attaparse") #1303
4143
- Improved documentation; code cleanup; more tests
4244

4345
## Version 5.1.2 -> 5.2.0

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,16 @@ abbreviation = ["khamyo>=0.2.0"]
9292

9393
attacut = ["attacut>=1.0.6"]
9494

95+
attaparse = ["attaparse>=1.0.0"]
96+
9597
benchmarks = ["numpy>=1.22", "pandas>=0.24", "PyYAML>=5.4.1"]
9698

9799
budoux = ["budoux>=0.7.0"]
98100

99101
coreference_resolution = ["fastcoref>=2.1.5", "spacy>=3.0"]
100102

101103
dependency_parsing = [
104+
"attaparse>=1.0.0",
102105
"spacy_thai>=0.7.1",
103106
"transformers>=4.22.1",
104107
"ufal.chu-liu-edmonds>=1.0.2",
@@ -237,6 +240,7 @@ noauto-network = [
237240

238241
# Full dependencies - pinned where available
239242
full = [
243+
"attaparse==1.0.0",
240244
"attacut==1.0.6",
241245
"bpemb>=0.3.6,<0.4",
242246
"budoux==0.7.0",
@@ -441,6 +445,7 @@ disallow_incomplete_defs = true
441445
[[tool.mypy.overrides]]
442446
module = [
443447
"attacut.*",
448+
"attaparse.*",
444449
"bpemb.*",
445450
"budoux.*",
446451
"deepcut.*",
@@ -472,6 +477,7 @@ module = [
472477
"spacy.*",
473478
"spacy_thai.*",
474479
"ssg.*",
480+
"stanza.*",
475481
"symspellpy.*",
476482
"thai_nner.*",
477483
"tltk.*",
pythainlp/parse/attaparse_engine.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
2+
# SPDX-FileType: SOURCE
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""Attaparse: Thai dependency parser based on Stanza and PhayaThaiBERT.
5+
6+
GitHub: https://github.com/nlp-chula/attaparse
7+
"""
8+
9+
from __future__ import annotations
10+
11+
from typing import TYPE_CHECKING, List, Union
12+
13+
# attaparse is an optional dependency: fail at import time with an
# installation hint when it is not available.
try:
    from attaparse import depparse, load_model
except ImportError as err:
    # Chain the original error so the underlying cause (for example a broken
    # transitive dependency) stays visible in the traceback.
    raise ImportError(
        "Import Error; Install attaparse by pip install attaparse"
    ) from err
19+
20+
if TYPE_CHECKING:
21+
from stanza import Pipeline
22+
23+
24+
class Parse:
    """Callable wrapper around an Attaparse model.

    The model is loaded once when the object is created and reused for
    every subsequent call.
    """

    def __init__(self) -> None:
        """Load the Attaparse model and keep it on the instance."""
        self.nlp: Pipeline = load_model()

    @staticmethod
    def _as_row(token) -> List[str]:
        """Return the ten string fields describing one parsed word.

        Field order: id, text, lemma, upos, xpos, feats, head, deprel,
        DEPS, MISC. Empty or missing optional fields become ``"_"``.
        """
        return [
            str(token.id),
            token.text,
            token.lemma or "_",
            token.upos or "_",
            token.xpos or "_",
            token.feats or "_",
            str(token.head),
            token.deprel or "_",
            "_",  # DEPS (enhanced dependencies, not provided)
            "SpaceAfter=No",  # MISC: Thai text has no inter-word spaces
        ]

    def __call__(
        self, text: str, tag: str = "str"
    ) -> Union[List[List[str]], str]:
        """Parse *text* and return one row per word.

        :param str text: text to parse
        :param str tag: ``"list"`` returns the rows as a list of lists of
            strings; any other value returns a single string with the
            fields of a row joined by tabs and rows joined by newlines
        :return: rows of all sentences in document order, in the form
            selected by *tag*
        :rtype: Union[List[List[str]], str]
        """
        parsed = depparse(text, self.nlp)
        # Rows from all sentences are collected into one flat table.
        table = [
            self._as_row(token)
            for sentence in parsed.sentences
            for token in sentence.words
        ]
        if tag != "list":
            return "\n".join("\t".join(fields) for fields in table)
        return table

pythainlp/parse/core.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ def dependency_parsing(
3636
`GitHub <https://github.com/KoichiYasuoka/>`_
3737
* *ud_goeswith* - POS tagging and dependency parsing \
3838
using `goeswith` for subwords
39+
* *attaparse* - Thai dependency parser using Stanza and PhayaThaiBERT. \
40+
`GitHub <https://github.com/nlp-chula/attaparse>`_
3941
4042
**Options for model (esupar engine)**
4143
* *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \
@@ -121,6 +123,10 @@ def dependency_parsing(
121123
if model
122124
else "KoichiYasuoka/deberta-base-thai-ud-goeswith"
123125
)
126+
elif engine == "attaparse":
127+
from pythainlp.parse.attaparse_engine import Parse # type: ignore[assignment] # noqa: I001
128+
129+
_tagger = Parse()
124130
else:
125131
raise NotImplementedError("The engine doesn't support.")
126132

tests/extra/testx_parse.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ def test_dependency_parsing(self):
1919
# self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai", tag="list"))
2020
# self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="ud_goeswith"))
2121
# self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="ud_goeswith", tag="list"))
22+
self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="attaparse"))
23+
self.assertIsNotNone(
24+
dependency_parsing("ผมเป็นคนดี", engine="attaparse", tag="list")
25+
)

0 commit comments

Comments
 (0)