Skip to content

Commit ffdaf3b

Browse files
authored
Merge pull request #1284 from PyThaiNLP/copilot/add-phupha-dataset
Replace TNC with Phupha word frequency dataset filtered by ORST words
2 parents 7b3d274 + f93cc5b commit ffdaf3b

File tree

5 files changed

+62440
-8
lines changed

5 files changed

+62440
-8
lines changed

pythainlp/corpus/phupha.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
2+
# SPDX-FileType: SOURCE
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""Phupha: Thai Word Frequency Dataset
5+
6+
Phupha is a Thai Word Frequency Dataset from Common Crawl Corpus.
7+
8+
Dataset:
9+
Phatthiyaphaibun, W. (2026). Phupha: Thai Word Frequency Dataset
10+
[Data set]. Zenodo. https://doi.org/10.5281/zenodo.18490474
11+
12+
License:
13+
Creative Commons Zero 1.0 Universal Public Domain Dedication License (CC0)
14+
"""
15+
16+
from __future__ import annotations
17+
18+
__all__: list[str] = [
19+
"word_freqs",
20+
"unigram_word_freqs",
21+
]
22+
23+
from collections import defaultdict
24+
25+
from pythainlp.corpus import get_corpus
26+
27+
# Corpus file name handed to get_corpus() by word_freqs() and
# unigram_word_freqs(); each line is parsed as tab-separated fields.
_UNIGRAM_FILENAME: str = "phupha_word_freqs.txt"
28+
29+
30+
def word_freqs() -> list[tuple[str, int]]:
    """Return the Phupha word frequencies as a list of pairs.

    Every line obtained from ``get_corpus(_UNIGRAM_FILENAME)`` is split
    on tab characters. Lines that yield fewer than two fields are
    skipped; for the remaining lines the first field is taken as the
    word and the second field is converted to ``int`` as its frequency.

    Phupha is a Thai Word Frequency Dataset from Common Crawl Corpus.

    :return: List of tuples (word, frequency)
    :rtype: list[tuple[str, int]]
    :raises ValueError: if the second field of a line is not an integer

    :Example:
    ::

        from pythainlp.corpus import phupha

        freqs = phupha.word_freqs()
        print(freqs[:5])
        # output: [('น', 1119315948), ('ร', 1066483406), ...]

    **Dataset Citation:**

    Phatthiyaphaibun, W. (2026). *Phupha: Thai Word Frequency Dataset*
    [Data set]. Zenodo. https://doi.org/10.5281/zenodo.18490474
    """
    # Lazily split each corpus line into its tab-separated columns.
    rows = (entry.split("\t") for entry in get_corpus(_UNIGRAM_FILENAME))
    # Keep only rows that carry both a word and a count column.
    return [(cols[0], int(cols[1])) for cols in rows if len(cols) > 1]
59+
60+
61+
def unigram_word_freqs() -> dict[str, int]:
    """Return the Phupha word frequencies as a word-to-count mapping.

    Every line obtained from ``get_corpus(_UNIGRAM_FILENAME)`` is
    stripped of surrounding whitespace and split on tab characters.
    Lines with fewer than two fields are skipped; otherwise the first
    field is the word and the last field, converted to ``int``, is its
    frequency. If a word occurs on several lines, the value from the
    line processed last is kept.

    The returned object is a ``collections.defaultdict(int)``, so
    indexing a missing word with ``freqs[word]`` yields ``0`` (and
    inserts that key); use ``freqs.get(word, 0)`` to avoid the insert.

    Phupha is a Thai Word Frequency Dataset from Common Crawl Corpus.

    :return: Dictionary mapping words to their frequencies
    :rtype: dict[str, int]
    :raises ValueError: if the last field of a line is not an integer

    :Example:
    ::

        from pythainlp.corpus import phupha

        freqs = phupha.unigram_word_freqs()
        print(freqs.get('ไทย', 0))
        # output: frequency count for 'ไทย'

    **Dataset Citation:**

    Phatthiyaphaibun, W. (2026). *Phupha: Thai Word Frequency Dataset*
    [Data set]. Zenodo. https://doi.org/10.5281/zenodo.18490474
    """
    table: dict[str, int] = defaultdict(int)
    for raw in get_corpus(_UNIGRAM_FILENAME):
        # str.split always returns at least one item, so ``word`` is
        # always bound; ``rest`` is empty when the line has no tab.
        word, *rest = raw.strip().split("\t")
        if not rest:
            continue
        table[word] = int(rest[-1])

    return table

0 commit comments

Comments
 (0)