Spaces:
Sleeping
Sleeping
import chardet | |
import spacy | |
from spacy.cli import download | |
# ------------------------
# CONFIGURATION
# ------------------------
# Settings handed as ``config`` to the "gliner_spacy" pipeline component.
custom_spacy_config = {
    # Identifier of the GLiNER model to use.
    "gliner_model": "urchade/gliner_multi_pii-v1",
    # Entity labels the component is asked to look for.
    "labels": [
        "person",
        "organization",
        "company",
        "country",
        "medical condition",
        "credit card brand",
    ],
    # NOTE(review): presumably the minimum confidence for keeping a
    # prediction -- confirm against the gliner-spacy documentation.
    "threshold": 0.39,
    # NOTE(review): presumably selects entity-style output (doc.ents) --
    # confirm against the gliner-spacy documentation.
    "style": "ent",
}
# Load SpaCy and add GLiNER to the pipeline
_SPACY_MODEL_NAME = "en_core_web_lg"

try:
    nlp = spacy.load(_SPACY_MODEL_NAME)
except OSError:
    # An OSError from spacy.load is treated as "model not installed":
    # download the model once, then retry the load.
    download(_SPACY_MODEL_NAME)
    nlp = spacy.load(_SPACY_MODEL_NAME)

# Append the GLiNER component, configured by custom_spacy_config.
nlp.add_pipe("gliner_spacy", config=custom_spacy_config)
def detect_encoding(file_bytes: bytes) -> str:
    """Guess the text encoding of *file_bytes* using chardet.

    Args:
        file_bytes: Raw bytes to inspect.

    Returns:
        The encoding name reported by ``chardet.detect``, or ``"utf-8"``
        when chardet reports no encoding.
    """
    result = chardet.detect(file_bytes)
    # chardet always includes the "encoding" key and sets it to None when it
    # cannot guess (e.g. empty input), so a dict.get() default would never
    # apply; fall back on any falsy value instead.
    return result.get("encoding") or "utf-8"
def extract_entities_from_file(file_path):
    """Read a file, decode it, and return the entities found in its text.

    Args:
        file_path: Path of the file to analyse; it is opened in binary mode.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one for each
        entity in ``doc.ents`` of the processed document.
    """
    with open(file_path, "rb") as f:
        file_bytes = f.read()

    # Guard against a missing guess so decode() never receives None.
    encoding = detect_encoding(file_bytes) or "utf-8"
    try:
        # errors="ignore" drops undecodable bytes instead of raising
        # (deliberate best-effort decoding).
        text = file_bytes.decode(encoding, errors="ignore")
    except LookupError:
        # The detected name has no codec in this Python build;
        # fall back to UTF-8 with the same best-effort policy.
        text = file_bytes.decode("utf-8", errors="ignore")

    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]