Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
- [Corpus](./docs/README.md#bengali-corpus-class)
- Letters, vowels, punctuations, stopwords
- [Command Line Interface (CLI)](#command-line-interface)
- [Pipeline API](#pipeline-api)
- [Batch Processing](#batch-processing)
- [Async Model Loading](#async-model-loading)

## Installation

Expand Down Expand Up @@ -128,6 +131,138 @@ bnlp tokenize "আমি বাংলায় গান গাই। তুম
bnlp embedding "বাংলা" --similar --topn 5
```

## Pipeline API

Chain multiple NLP operations together using the Pipeline API.

```python
from bnlp import Pipeline, CleanText, BasicTokenizer

# Create a pipeline
pipeline = Pipeline([
    CleanText(remove_url=True, remove_punct=True),
    BasicTokenizer(),
])

# Process text through the pipeline
result = pipeline("আমি বাংলায় গান গাই।")
print(result)
# Output: ['আমি', 'বাংলায়', 'গান', 'গাই']

# Get detailed results with intermediate outputs
result = pipeline.run("আমি বাংলায় গান গাই।", return_details=True)
print(result.intermediate_results)
```

### Pre-built Pipelines

```python
from bnlp import create_tokenization_pipeline, create_ner_pipeline, create_pos_pipeline

# Tokenization pipeline
tokenizer_pipeline = create_tokenization_pipeline(clean=True, tokenizer_type="basic")
tokens = tokenizer_pipeline("আমি বাংলায় গান গাই।")

# NER pipeline
ner_pipeline = create_ner_pipeline(clean=True)
entities = ner_pipeline("সজীব ঢাকায় থাকেন।")

# POS pipeline
pos_pipeline = create_pos_pipeline(clean=True)
tags = pos_pipeline("আমি ভাত খাই।")
```

## Batch Processing

Process multiple texts efficiently using batch processing utilities.

```python
from bnlp import BasicTokenizer, tokenize_batch, tag_batch, clean_batch
from bnlp import BengaliNER, CleanText

# Batch tokenization
tokenizer = BasicTokenizer()
texts = ["আমি বাংলায় গান গাই।", "তুমি কোথায় যাও?", "সে বই পড়ে।"]
results = tokenize_batch(tokenizer.tokenize, texts)
print(results)
# Output: [['আমি', 'বাংলায়', ...], ['তুমি', 'কোথায়', ...], ['সে', 'বই', ...]]

# Batch NER tagging
ner = BengaliNER()
texts = ["সজীব ঢাকায় থাকেন।", "রবীন্দ্রনাথ ঠাকুর কলকাতায় জন্মগ্রহণ করেন।"]
results = tag_batch(ner.tag, texts)

# Batch text cleaning
cleaner = CleanText(remove_url=True, remove_email=True)
texts = ["email@example.com আমি", "https://example.com তুমি"]
results = clean_batch(cleaner, texts)
```

### Using BatchProcessor

```python
from bnlp import BatchProcessor, BasicTokenizer

tokenizer = BasicTokenizer()
batch = BatchProcessor(tokenizer.tokenize, max_workers=4)

texts = ["আমি বাংলায় গান গাই।"] * 100
results = batch.process(texts, show_progress=True)
```

## Async Model Loading

Load large models in the background without blocking your application.

```python
from bnlp import AsyncModelLoader, BengaliWord2Vec

# Create async loader with callbacks
def on_progress(progress):
    print(f"Loading: {progress.progress * 100:.0f}% - {progress.message}")

loader = AsyncModelLoader(
    BengaliWord2Vec,
    on_progress=on_progress,
    on_complete=lambda m: print("Model ready!")
)

# Start loading in background
loader.start_loading()

# Do other work while model loads...
print("Doing other work...")

# Get model when needed (blocks until ready)
model = loader.get_model()
vector = model.get_word_vector("বাংলা")
```

### Lazy Loading

```python
from bnlp import LazyModelLoader, BengaliWord2Vec

# Model not loaded yet
lazy_model = LazyModelLoader(BengaliWord2Vec)

# Model loads on first access
model = lazy_model.get()
vector = model.get_word_vector("বাংলা")
```

### Quick Async Loading

```python
from bnlp import load_model_async, BengaliWord2Vec

# One-liner to start async loading
loader = load_model_async(BengaliWord2Vec)

# Get model when ready
model = loader.get_model()
```

## Documentation
Full documentation is available [here](https://sagorbrur.github.io/bnlp/)

Expand Down
31 changes: 28 additions & 3 deletions bnlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@

__version__ = "4.0.3"
__version__ = "4.1.0"

import os
from bnlp.tokenizer.basic import BasicTokenizer
from bnlp.tokenizer.nltk import NLTKTokenizer
from bnlp.tokenizer.sentencepiece import (
SentencepieceTokenizer,
SentencepieceTokenizer,
SentencepieceTrainer,
)

Expand All @@ -16,7 +16,7 @@
from bnlp.embedding.glove import BengaliGlove

from bnlp.embedding.doc2vec import (
BengaliDoc2vec,
BengaliDoc2vec,
BengaliDoc2vecTrainer,
)

Expand All @@ -27,3 +27,28 @@
from bnlp.cleantext.clean import CleanText

from bnlp.corpus.corpus import BengaliCorpus

# Core module - Protocols, Pipeline, Exceptions, Batch Processing, Async Loading
from bnlp.core import (
    # Pipeline
    Pipeline,
    PipelineStep,
    PipelineResult,
    create_tokenization_pipeline,
    create_ner_pipeline,
    create_pos_pipeline,
    # Batch Processing
    BatchProcessor,
    tokenize_batch,
    embed_batch,
    tag_batch,
    clean_batch,
    # Async Loading
    AsyncModelLoader,
    LazyModelLoader,
    load_model_async,
    # Exceptions
    BNLPException,
    ModelNotFoundError,
    ModelLoadError,
)
98 changes: 98 additions & 0 deletions bnlp/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
BNLP Core Module

This module provides core abstractions, protocols, and utilities for BNLP.
"""

from bnlp.core.protocols import (
    TokenizerProtocol,
    BatchTokenizerProtocol,
    EmbeddingProtocol,
    BatchEmbeddingProtocol,
    SimilarityEmbeddingProtocol,
    DocumentEmbeddingProtocol,
    TaggerProtocol,
    BatchTaggerProtocol,
    TextProcessorProtocol,
    PipelineStepProtocol,
)

from bnlp.core.pipeline import (
    Pipeline,
    PipelineStep,
    PipelineResult,
    create_tokenization_pipeline,
    create_ner_pipeline,
    create_pos_pipeline,
)

from bnlp.core.exceptions import (
    BNLPException,
    ModelNotFoundError,
    ModelLoadError,
    TokenizationError,
    EmbeddingError,
    TaggingError,
    DownloadError,
    PipelineError,
    InvalidInputError,
)

from bnlp.core.batch import (
    BatchProcessor,
    tokenize_batch,
    embed_batch,
    tag_batch,
    clean_batch,
)

from bnlp.core.async_loader import (
    AsyncModelLoader,
    LazyModelLoader,
    LoadingStatus,
    LoadingProgress,
    load_model_async,
)

__all__ = [
    # Protocols
    "TokenizerProtocol",
    "BatchTokenizerProtocol",
    "EmbeddingProtocol",
    "BatchEmbeddingProtocol",
    "SimilarityEmbeddingProtocol",
    "DocumentEmbeddingProtocol",
    "TaggerProtocol",
    "BatchTaggerProtocol",
    "TextProcessorProtocol",
    "PipelineStepProtocol",
    # Pipeline
    "Pipeline",
    "PipelineStep",
    "PipelineResult",
    "create_tokenization_pipeline",
    "create_ner_pipeline",
    "create_pos_pipeline",
    # Exceptions
    "BNLPException",
    "ModelNotFoundError",
    "ModelLoadError",
    "TokenizationError",
    "EmbeddingError",
    "TaggingError",
    "DownloadError",
    "PipelineError",
    "InvalidInputError",
    # Batch Processing
    "BatchProcessor",
    "tokenize_batch",
    "embed_batch",
    "tag_batch",
    "clean_batch",
    # Async Loading
    "AsyncModelLoader",
    "LazyModelLoader",
    "LoadingStatus",
    "LoadingProgress",
    "load_model_async",
]
Loading
Loading