127 changes: 127 additions & 0 deletions seacrowd/sea_datasets/hse_thai/hse_thai.py
@@ -0,0 +1,127 @@
import os
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@misc{HSEThaiCorpus,
title = {HSE Thai Corpus},
author = {Conneau, Alexis and Rinott, Ruty and Lample, Guillaume and Williams, Adina and Bowman, Samuel R. and Schwenk, Holger and Stoyanov, Veselin},
organization = {HSE School of Linguistics},
year = {2024},
howpublished = {\\url{http://web-corpora.net/ThaiCorpus/search}},
note = {Accessed on 2024-05-15}
}
"""

_DATASETNAME = "hse_thai"

_DESCRIPTION = """\
The HSE Thai Corpus is a corpus of modern texts written in the Thai language. The texts, comprising 50 million tokens in total,
were collected from various Thai websites (mostly news websites). To make the texts easier for non-Thai speakers to comprehend and use, the researchers separated the words in each sentence with spaces.

The data for the corpus was collected with Scrapy, and the texts were tokenized with the Pythai module. The text in this dataset is encoded in UTF-8.
This dataset contains text from two sources: Wikipedia and thaigov.go.th.

The former is licensed under a standard Wikipedia license, and the latter under an Open Government License for Thailand.
"""

_HOMEPAGE = "http://web-corpora.net/ThaiCorpus/search/"

_LANGUAGES = ["tha"]

_LICENSE = Licenses.OTHERS.value

_LOCAL = False

_URLS = "https://github.com/khelli07/hse-thai-for-seacrowd/raw/master/texts_tagged.zip"

_SUPPORTED_TASKS = [Tasks.LANGUAGE_IDENTIFICATION]
Collaborator

Another thing: if we look at the original source description, it seems the tasks could be extended beyond language identification, for example to translation and part-of-speech tagging:

This website gives access to the HSE Thai Corpus - the corpus of modern texts written in Thai language. The texts, containing in whole 50 million tokens, were collected from various Thai websites (mostly news websites). Each token was assigned it's English translation and part of speech tag.

Collaborator Author

Yes, but I think that description refers to the original source of the Kaggle dataset; the translations and POS tags themselves are not included in the Kaggle dataset.
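
One quick way to settle this would be to inspect a single token element from the actual dump. Below is a minimal inspection sketch, assuming the `texts_tagged.zip` archive from `_URLS` above has been extracted into the working directory (the `texts_tagged_200` layout is taken from the loader below); it only prints the first `<w>` element it finds:

```python
import os
import xml.etree.ElementTree as ET

# Assumed local setup: texts_tagged.zip (the _URLS archive) extracted so that
# the texts_tagged_200 directory sits in the current working directory.
base = "texts_tagged_200"

for folder in sorted(os.listdir(base)):
    folder_path = os.path.join(base, folder)
    if not os.path.isdir(folder_path):
        continue
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(".xml"):
            continue
        root = ET.parse(os.path.join(folder_path, fname)).getroot()
        w = root.find(".//se/w")
        if w is not None:
            # Print one token element verbatim: if per-token POS tags or English
            # translations survived in this dump, they should show up here as
            # attributes or child elements of <w>.
            print(folder, fname)
            print(ET.tostring(w, encoding="unicode"))
            raise SystemExit(0)
```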


_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class HSEThaiDataset(datasets.GeneratorBasedBuilder):
"""Modern Thai corpus taken from http://web-corpora.net/ThaiCorpus/search/"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
SEACROWD_SCHEMA_NAME = "text"

BUILDER_CONFIGS = [
SEACrowdConfig(
name=f"{_DATASETNAME}_source",
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema",
schema="source",
subset_id=f"{_DATASETNAME}",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=f"{_DATASETNAME}",
),
]

DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

def _info(self) -> datasets.DatasetInfo:
features = schemas.text_features(_LANGUAGES)

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(
self, dl_manager: datasets.DownloadManager
) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""
data_dir = dl_manager.download_and_extract(_URLS)
data_dir = os.path.join(data_dir, "texts_tagged_200")

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": data_dir,
},
),
]

def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""

base_path = filepath
folders = os.listdir(base_path)

id_ = 0
for folder in folders:
files = os.listdir(os.path.join(base_path, folder))
for file in files[3:]:
filepath = os.path.join(base_path, folder, file)
                if filepath.endswith(".xml"):
                    root = ET.parse(filepath).getroot()
                    # Each <se> element is one sentence; its <w> children hold the
                    # already space-separated tokens.
                    sentences = root.findall(".//se")
                    for sentence in sentences:
                        words = sentence.findall("w")
                        # Skip empty <w> elements (whose .text is None) so the
                        # join does not raise a TypeError.
                        article = " ".join(word.text for word in words if word.text)
id_ += 1
yield id_, {
"id": str(id_),
"text": article,
"label": "tha",
}
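
For reference, here is a minimal usage sketch of the new loader. The relative script path and working directory are assumptions (they mirror the file added in this PR), and the config name comes from `BUILDER_CONFIGS`:

```python
import datasets

# Assumes this is run from the repository root, where the new dataloader
# script added in this PR lives.
ds = datasets.load_dataset(
    "seacrowd/sea_datasets/hse_thai/hse_thai.py",
    name="hse_thai_seacrowd_text",
    split="train",
    trust_remote_code=True,
)

print(len(ds))
print(ds[0])  # one space-tokenized sentence per row, labelled as Thai
```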