Spaces:
Sleeping
Sleeping
File size: 1,644 Bytes
d7c8091 26ac453 6afda5c d7c8091 6085bca d7c8091 6085bca 4a8eac5 6085bca 4a8eac5 2c439c5 4a8eac5 6085bca d7c8091 26ac453 7442c2a 1faf45c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from io import StringIO, BytesIO
import gradio as gr
from pdfminer.high_level import extract_text
from transformers import pipeline
import pandas as pd
import numpy as np
# Checkpoint identifier used for both the model weights and the tokenizer.
_NER_CHECKPOINT = "carblacac/ner-investing"

# NER pipeline built once at import time; highlight_text() calls it per upload.
nlp = pipeline("ner", model=_NER_CHECKPOINT, tokenizer=_NER_CHECKPOINT)
class Group:
    """Number runs of consecutive identical labels.

    Every call to :meth:`getgroup` compares the incoming label with the one
    seen on the previous call.  A different label starts a new run and bumps
    the counter; a repeated label reuses the current run number.
    """

    def __init__(self):
        # Number of the most recent run; stays 0 until a non-empty label arrives.
        self.id = 0
        # Label that opened the most recent run ('' before any call).
        self.text = ''

    def getgroup(self, text):
        """Return the run number that ``text`` belongs to.

        The counter is advanced only when ``text`` differs from the previously
        seen label.  Because the initial label is ``''``, an empty first label
        is reported as run 0 without starting a new run.
        """
        if text != self.text:
            self.id += 1
            self.text = text
        return self.id
# Module-level Group instance: its run counter and last-seen label persist
# across calls for the lifetime of the process.
grp_gen = Group()
def entities_to_df(entities):
    """Collapse consecutive same-label predictions into one row per run.

    Args:
        entities: Sequence of dicts, each providing the ``entity``, ``start``,
            ``end`` and ``word`` keys read below.

    Returns:
        A DataFrame indexed by ``group`` with the columns ``start`` (smallest
        start in the run), ``end`` (largest end in the run), ``entity`` (array
        of the distinct labels in the run) and ``word`` (the run's words joined
        with single spaces).  An empty input yields an empty frame with the
        same columns instead of raising ``KeyError``.
    """
    df = pd.DataFrame(entities)
    if df.empty:
        # Without predictions there is no 'entity' column to index; hand back
        # an empty frame of the usual shape so callers get zero records.
        return pd.DataFrame(
            columns=["start", "end", "entity", "word"],
            index=pd.Index([], name="group"),
        )

    # Drop the first two characters of every label
    # (presumably a "B-"/"I-" tagging prefix -- confirm against the model's labels).
    df['entity'] = df['entity'].apply(lambda label: label[2:])
    # Rows are numbered in order; a change of label opens a new group.
    # NOTE(review): runs are keyed on the label only, so same-label predictions
    # that are adjacent in this list share a group even if far apart in the text.
    df['group'] = df['entity'].apply(grp_gen.getgroup)

    return df.groupby(by='group').agg({
        # String names select the groupby reductions directly; passing the
        # builtins min/max is deprecated in recent pandas releases.
        'start': 'min',
        'end': 'max',
        # Kept array-valued: downstream code takes element [0] as the label.
        'entity': np.unique,
        'word': lambda words: " ".join(words),
    })
def transform_entity_type(entities):
    """Replace each record's ``entity`` value with its first element, in place.

    Args:
        entities: List of dicts whose ``entity`` value is indexable.

    Returns:
        The same list object that was passed in, after mutation.
    """
    for record in entities:
        labels = record['entity']
        record['entity'] = labels[0]
    return entities
def highlight_text(fileObj):
    """Extract text from an uploaded PDF and tag it with the NER pipeline.

    Args:
        fileObj: Raw bytes of the uploaded file; wrapped in a ``BytesIO`` and
            handed to pdfminer's ``extract_text``.

    Returns:
        A dict with ``"text"`` (the extracted text) and ``"entities"`` (the
        merged entity records), which the interface renders through its
        ``HighlightedText`` output.
    """
    text = extract_text(BytesIO(fileObj))
    # Token predictions -> one record per run of same-label tokens.
    merged_records = entities_to_df(nlp(text)).to_dict('records')
    return {"text": text, "entities": transform_entity_type(merged_records)}
# Sample document listed under the interface's examples.
examples = ['Beiersdorf sees slower sales this year after bumper 2022 By Reuters.pdf']

# Single-file upload delivered to highlight_text() as raw bytes.
demo = gr.Interface(
    fn=highlight_text,
    inputs=gr.inputs.File(file_count="single", type="bytes"),
    outputs=gr.HighlightedText(),
    examples=examples,
)
demo.launch()
|