## BertTagger

```python
BertTagger()
```

### Parameters

#### Optional

| Parameter | Default | Type | Description |
| --------------------- | -------- | -------------- | ----------- |
| allow_standard_output | True | bool | Display info/progress messages in standard output. |
| autoadjust_batch_size | True | bool | If enabled, the batch size is automatically adjusted based on the training parameter `max_length` and available memory. |
| min_available_memory | 500 (MB) | int | Minimum available GPU memory. If free GPU memory < `min_available_memory`, the CPU is used instead of the GPU. NB! This parameter has an effect only if a GPU is available. |
| sklearn_avg_function | "macro" | string | Averaging function used when calculating sklearn metrics like precision, recall and F1-score. Allowed options = ["binary", "macro", "micro", "weighted"]. |
| use_gpu | True | bool | If enabled, uses the GPU. |
| save_pretrained | True | bool | If enabled, saves pretrained models to the local storage location specified with parameter `pretrained_models_dir`. |
| pretrained_models_dir | "" | string | Path to the location where the pretrained models are (or will be) saved. |
| logger | None | logging.Logger | Logger used for progress messages etc. |

### Example

```python
from texta_bert_tagger.tagger import BertTagger

bert_tagger = BertTagger()
```

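The optional parameters above are passed as keyword arguments to the constructor. A minimal sketch (the parameter names come from the table above; this particular combination of values is only illustrative):

```python
from texta_bert_tagger.tagger import BertTagger

# Force CPU usage and cache pretrained models in a custom directory.
bert_tagger = BertTagger(
    use_gpu=False,
    save_pretrained=True,
    pretrained_models_dir="/home/bert_models/pretrained"
)
```
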
## Training a model

```python
BertTagger().train(data_sample, **kwargs)
```

### Parameters

#### Required

| Parameter | Type | Description |
| ----------- | -------------------- | ----------- |
| data_sample | Dict[str, List[str]] | Training data as a dict where keys = labels and values = lists of examples corresponding to each label. |
| pos_label | string | Class treated as positive when calculating sklearn metrics. NB! It should be specified if the input data has *2 classes*; with 3 or more classes it has no effect and may be left unspecified. Defaults to "". |

#### Optional

| Parameter | Default | Type | Description |
| ----------- | ------------------------------ | ----- | ----------- |
| batch_size | 32 | int | Size of one batch in the data sampler. |
| bert_model | "bert-base-multilingual-cased" | str | Pre-trained BERT model to use. [List of all available models](https://huggingface.co/transformers/pretrained_models.html). |
| eps | 1e-8 | float | Epsilon value of the optimizer (added to the denominator for numerical stability). |
| lr | 2e-5 | float | Learning rate. |
| max_length | 32 | int | Maximum number of tokens used from each training example. Each example is truncated/padded accordingly. |
| n_epochs | 2 | int | Number of epochs to train. NB! 2 is usually sufficient; higher values tend to overfit the training data. |
| seed_val | 42 | int | Random seed value. |
| split_ratio | 0.8 | float | Ratio of the input data used for training. The rest is used for validation. |

### Example

```python
from texta_bert_tagger.tagger import BertTagger

data_sample = {
    "OK": ["Happy birthday!", "What a beautiful day."],
    "OFFENSIVE": ["You suck!", "Burn in hell."]
}

bert_tagger = BertTagger()

# We want to train a tagger for detecting offensive comments,
# so we set pos_label = "OFFENSIVE".
report = bert_tagger.train(data_sample, pos_label="OFFENSIVE", bert_model="bert-base-uncased")

# `report` is an instance of texta_bert_tagger.TaggingReport and contains
# evaluation scores and other information for the last training epoch.
# For more information, see chapter "Retrieving training reports".
```

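The optional training parameters can likewise be passed as keyword arguments to `train`. A sketch continuing the example above, using parameter names from the Optional table (the chosen values are illustrative):

```python
# Continuing the example above (bert_tagger and data_sample already defined).
# Allow longer inputs and fix the random seed for reproducibility.
report = bert_tagger.train(
    data_sample,
    pos_label="OFFENSIVE",
    bert_model="bert-base-uncased",
    max_length=64,
    n_epochs=2,
    split_ratio=0.8,
    seed_val=42
)
```
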
## Tagging text

```python
BertTagger().tag_text(text)
```

### Parameters

#### Required

| Parameter | Type | Description |
| --------- | ---- | ------------ |
| text | str | Text to tag. |

### Example

```python
from texta_bert_tagger.tagger import BertTagger

# NB! The model should be trained or loaded before tagging!
bert_tagger = BertTagger()

# Train a model...
data_sample = {
    "OK": ["Happy birthday!", "What a beautiful day."],
    "OFFENSIVE": ["You suck!", "Burn in hell."]
}

bert_tagger.train(data_sample, pos_label="OFFENSIVE", bert_model="bert-base-uncased")

# ...and tag text with the resulting model.
prediction = bert_tagger.tag_text("I hope you die!")
```

#### Output

```
{"prediction": "OFFENSIVE", "probability": 0.75200404}
```

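The result is a plain dict with the keys shown above, so it can be post-processed directly; for example, keeping only confident positive predictions (the 0.9 threshold is arbitrary):

```python
result = bert_tagger.tag_text("I hope you die!")

# Act on the label only if the model is sufficiently confident.
if result["prediction"] == "OFFENSIVE" and result["probability"] >= 0.9:
    print("flagged:", result)
```
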
## Tagging a document

```python
BertTagger().tag_doc(doc)
```

### Parameters

#### Required

| Parameter | Type | Description |
| --------- | ---- | --------------------- |
| doc | dict | JSON document to tag. |

### Example

```python
from texta_bert_tagger.tagger import BertTagger

# NB! The model should be trained or loaded before tagging!
bert_tagger = BertTagger()

# Train a model...
data_sample = {
    "OK": ["Happy birthday!", "What a beautiful day."],
    "OFFENSIVE": ["You suck!", "Burn in hell."]
}

bert_tagger.train(data_sample, pos_label="OFFENSIVE", bert_model="bert-base-uncased")

# ...and tag a document with the resulting model.
doc = {
    "body": "I hope this helps to emphasize, I hope this helps to clarify, I hope you die!",
    "title": "I hope you die"
}

# NB! All fields in the document are combined for predicting.
prediction = bert_tagger.tag_doc(doc)
print(prediction)
```

#### Output

```
{"prediction": "OFFENSIVE", "probability": 0.75200404}
```

## Saving a model

```python
BertTagger().save(path)
```

### Parameters

#### Required

| Parameter | Type | Description |
| --------- | ---- | ------------------------ |
| path | str | Full path to model file. |

### Example

```python
from texta_bert_tagger.tagger import BertTagger

bert_tagger = BertTagger()

# Train a model
data_sample = {
    "OK": ["Happy birthday!", "What a beautiful day."],
    "OFFENSIVE": ["You suck!", "Burn in hell."]
}

bert_tagger.train(data_sample, pos_label="OFFENSIVE", bert_model="bert-base-uncased")

# Saving the model
bert_tagger.save("/home/bert_models/en_offensive")
```

## Loading a model

```python
BertTagger().load(path)
```

### Parameters

#### Required

| Parameter | Type | Description |
| --------- | ---- | ------------------------ |
| path | str | Full path to model file. |

### Example

```python
from texta_bert_tagger.tagger import BertTagger

bert_tagger = BertTagger()

bert_tagger.load("/home/bert_models/en_offensive")
```

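Once loaded, the model can be used like a freshly trained one; for example (assuming the model saved in the previous section exists at that path):

```python
from texta_bert_tagger.tagger import BertTagger

bert_tagger = BertTagger()
bert_tagger.load("/home/bert_models/en_offensive")

# Tag text with the loaded model.
prediction = bert_tagger.tag_text("I hope you die!")
print(prediction)
```
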
## Retrieving training reports

### Example

```python
from texta_bert_tagger.tagger import BertTagger

# For retrieving the report(s), a model must be trained.
data_sample = {
    "OK": ["Happy birthday!", "What a beautiful day."],
    "OFFENSIVE": ["You suck!", "Burn in hell."]
}

bert_tagger = BertTagger()

# We want to train a tagger for detecting offensive comments,
# so we set pos_label = "OFFENSIVE".
report = bert_tagger.train(data_sample, pos_label="OFFENSIVE", bert_model="bert-base-uncased")

# Get report content:
report_as_dict = report.to_dict()

print(report_as_dict)
```

#### Output

```
{
    "f1_score": 0.85714,
    "precision": 0.83565,
    "recall": 0.87978,
    "confusion_matrix": [[783, 107], [154, 746]],
    "accuracy": 0.85419,
    "training_loss": 0.29599,
    "validation_loss": 0.33366,
    "training_time": "0:01:08",
    "validation_time": "0:00:04",
    "area_under_curve": 0.93825,
    "classes": ["OFFENSIVE", "OK"],
    "epoch": 2
}
```

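Note that the scores are consistent with the confusion matrix: for instance, accuracy = (783 + 746) / (783 + 107 + 154 + 746) ≈ 0.85419.
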
All the reports for each epoch can be retrieved as follows:

```python
reports = bert_tagger.epoch_reports

# Convert the reports to dicts
reports = [r.to_dict() for r in reports]
```

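Since each report exposes the metric fields shown in the output above, the per-epoch reports can also be compared programmatically; a sketch that picks the epoch with the highest F1-score:

```python
# Select the epoch report with the best F1-score.
best = max((r.to_dict() for r in bert_tagger.epoch_reports), key=lambda d: d["f1_score"])
print(best["epoch"], best["f1_score"])
```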