Models / Datasets
Collection
10 items
•
Updated
•
1
| Language | Dataset | Source | Download |
|---|---|---|---|
| all | Punctuation | | PUNCTUATION.txt |
| vi | Synonyms | source | VI_SYNONYMS.json |
| vi | Vocab | source | VI_VOCAB.txt |
| vi | Diacritics | | VI_DIACRITICS.txt |
| vi | Stopwords | source | VI_STOPWORDS.txt |
| en | Stopwords | nltk | EN_STOPWORDS.txt |
import requests
# Fetch the punctuation list from the repository's raw-file URL, one entry per line.
# The timeout keeps a stalled connection from blocking forever, and raise_for_status()
# turns an HTTP error into an exception instead of splitting an error page into "lines".
response = requests.get("https://huggingface.co/onelevelstudio/NLPT/raw/main/PUNCTUATION.txt", timeout=30)
response.raise_for_status()
# Decode explicitly as UTF-8 rather than relying on the charset guessed from the
# response headers; the same file is opened with encoding="utf-8" when read from disk.
response.encoding = "utf-8"
punctuation = response.text.splitlines()
from huggingface_hub import hf_hub_download as HF_Download
import json
# Load every NLPT dataset file from the "onelevelstudio/NLPT" repository.
# HF_Download(...) is passed straight to open(), i.e. it is used as the local path
# of the fetched file; every file is read as UTF-8 text.

# Punctuation marks, one per line; stored as a set (the only dataset loaded this way).
with open(HF_Download(repo_id="onelevelstudio/NLPT", filename="PUNCTUATION.txt"), mode="r", encoding="utf-8") as f:
    DATASET_punctuation = set(f.read().splitlines())

# Vietnamese diacritics, one entry per line, kept in file order.
with open(HF_Download(repo_id="onelevelstudio/NLPT", filename="VI_DIACRITICS.txt"), mode="r", encoding="utf-8") as f:
    DATASET_diacritics_vi = f.read().splitlines()

# Vietnamese vocabulary, one entry per line.
with open(HF_Download(repo_id="onelevelstudio/NLPT", filename="VI_VOCAB.txt"), mode="r", encoding="utf-8") as f:
    DATASET_vocab_vi = f.read().splitlines()

# Vietnamese stopwords, one entry per line.
with open(HF_Download(repo_id="onelevelstudio/NLPT", filename="VI_STOPWORDS.txt"), mode="r", encoding="utf-8") as f:
    DATASET_stopwords_vi = f.read().splitlines()

# English stopwords, one entry per line.
with open(HF_Download(repo_id="onelevelstudio/NLPT", filename="EN_STOPWORDS.txt"), mode="r", encoding="utf-8") as f:
    DATASET_stopwords_en = f.read().splitlines()

# Vietnamese synonyms, parsed from JSON.
# NOTE(review): the JSON structure (e.g. word -> list of synonyms) is not visible here; confirm before relying on it.
with open(HF_Download(repo_id="onelevelstudio/NLPT", filename="VI_SYNONYMS.json"), mode="r", encoding="utf-8") as f:
    DATASET_synonyms_vi = json.load(f)