# gliner-test / main.py
# goberoi's picture
# Update main.py
# 34abba7 verified
import chardet
import spacy
from spacy.cli import download
# ------------------------
# CONFIGURATION
# ------------------------
# Settings handed to the "gliner_spacy" pipeline component as its config.
custom_spacy_config = {
    # Identifier of the GLiNER checkpoint to use.
    "gliner_model": "urchade/gliner_multi_pii-v1",
    # Entity labels requested from the model.
    "labels": [
        "person",
        "organization",
        "company",
        "country",
        "medical condition",
        "credit card brand",
    ],
    # NOTE(review): presumably the minimum score for a span to be kept --
    # confirm against the gliner-spacy documentation.
    "threshold": 0.39,
    # NOTE(review): presumably selects doc.ents as the output -- confirm.
    "style": "ent",
}
# Build the shared pipeline: the spaCy English model plus a GLiNER component.
def _load_spacy_model(model_name):
    """Return the loaded spaCy model, downloading it first if it is missing.

    ``spacy.load`` raising ``OSError`` is treated as "model not available":
    the model is fetched with ``spacy.cli.download`` and the load is retried
    once. A failure of the second load propagates to the caller.
    """
    try:
        return spacy.load(model_name)
    except OSError:
        download(model_name)
        return spacy.load(model_name)


nlp = _load_spacy_model("en_core_web_lg")
nlp.add_pipe("gliner_spacy", config=custom_spacy_config)
def detect_encoding(file_bytes: bytes) -> str:
    """Guess the text encoding of raw bytes using ``chardet``.

    Args:
        file_bytes: Raw content to inspect.

    Returns:
        The encoding name reported by ``chardet.detect``, or ``"utf-8"``
        when no encoding is reported.
    """
    result = chardet.detect(file_bytes)
    # The "encoding" key can be present with the value None (for example on
    # empty input). ``dict.get``'s default only applies to a *missing* key,
    # so fall back with ``or`` to cover both the missing and the None case.
    return result.get("encoding") or "utf-8"
def extract_entities_from_file(file_path):
    """Read a file, decode it, and return the entities found in its text.

    The file is read as bytes, its encoding is guessed with
    ``detect_encoding``, and the decoded text is run through the module-level
    ``nlp`` pipeline.

    Args:
        file_path: Path of the file to analyse (anything ``open`` accepts).

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry in
        ``doc.ents``, in the order the pipeline reports them.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        file_bytes = f.read()

    # Guard against a missing guess (e.g. an empty file), which would
    # otherwise make ``bytes.decode`` raise TypeError.
    encoding = detect_encoding(file_bytes) or "utf-8"

    try:
        # Undecodable bytes are dropped on purpose: extraction is best-effort.
        text = file_bytes.decode(encoding, errors="ignore")
    except LookupError:
        # The detector's label is not guaranteed to be a codec name Python
        # recognises; keep the best-effort behaviour instead of crashing.
        text = file_bytes.decode("utf-8", errors="ignore")

    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]