# fuzzy-search / app.py
# T.Masuda
# update app.py
# f11ed3a
import gradio as gr
import numpy as np
import re
from rapidfuzz import fuzz
class Predictor:
    """Fuzzy lookup from free-text keywords to the id of the best-matching record.

    Records are held in two parallel lists: ``id_list[i]`` is the id of the
    record whose normalized description is ``text_list[i]``.
    """

    # Word separators: ASCII space, tab and the ideographic (full-width) space.
    _SEPARATORS = re.compile(r'[ \t\u3000]+')

    def __init__(self):
        self.id_list = []
        self.text_list = []

    def _split_text(self, text: str) -> list[str]:
        """Return the distinct lower-cased words of *text*, in first-seen order."""
        words = (piece.strip().lower() for piece in self._SEPARATORS.split(text))
        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # result does not depend on string hash randomisation (a set would).
        return list(dict.fromkeys(word for word in words if word))

    def _normalize_text(self, text: str) -> str:
        """Return *text* as its distinct lower-cased words joined by single spaces."""
        return ' '.join(self._split_text(text))

    def update_text_list(self, text: str) -> None:
        """Load records from *text*, one ``id,description`` record per line.

        Carriage returns are dropped and lines are split on ``\\n``. Within a
        line, the first comma-separated field is the id; all remaining fields
        are joined with spaces to form the description. When an id is already
        known, the new description is merged into the stored one. Lines with
        fewer than two fields (e.g. blank lines) are skipped.
        """
        for line in text.replace('\r', '').split('\n'):
            fields = line.split(',')
            if len(fields) < 2:
                # Skip only this line; returning here would silently discard
                # every record that follows a blank or malformed line.
                continue
            item_id = fields[0].strip()
            description = ' '.join(fields[1:])
            try:
                index = self.id_list.index(item_id)
            except ValueError:
                # First time this id is seen: add a new record.
                self.id_list.append(item_id)
                self.text_list.append(self._normalize_text(description))
            else:
                merged = f'{self.text_list[index]} {description}'
                self.text_list[index] = self._normalize_text(merged)

    def _calc_score(self, text: str, keyword: str) -> float:
        """Score how well *text* matches *keyword*.

        For each distinct keyword the best ``fuzz.ratio`` against any word of
        *text* is taken, and those best values are summed. A text without any
        words scores 0 instead of raising from ``max()`` on an empty sequence.
        """
        keywords = self._split_text(keyword)
        wordlist = self._split_text(text)
        if not wordlist:
            return 0.0
        return sum(max(fuzz.ratio(w, k) for w in wordlist) for k in keywords)

    def predict(self, keyword: str) -> str:
        """Return the id of the record that scores highest for *keyword*.

        Returns an empty string (after printing ``no data``) when no records
        are loaded. On ties the earliest record wins.
        """
        if not self.text_list:
            print('no data')
            return ''
        scores = [self._calc_score(text, keyword) for text in self.text_list]
        # argmax returns the first maximal position; int() gives a plain index.
        index = int(np.argmax(scores))
        result_id = self.id_list[index]
        result_desc = self.text_list[index]
        print(f'{result_id} {result_desc}')
        return result_id
def process_text(input_text: str, input_keyword: str) -> str:
    """Run one fuzzy search over the submitted records.

    A fresh ``Predictor`` is built for every call, loaded with *input_text*,
    and queried with *input_keyword*. If either argument is ``None`` or
    contains only whitespace, a short notice is printed and ``None`` is
    returned without building a predictor.
    """
    # Guard clauses: both fields must carry some non-whitespace content.
    if input_text is None or not input_text.strip():
        print('no input_text')
        return None
    if input_keyword is None or not input_keyword.strip():
        print('no input_keyword')
        return None

    predictor = Predictor()
    predictor.update_text_list(input_text)
    best_id = predictor.predict(input_keyword)
    return best_id
# Build the Gradio UI around process_text: the two input textboxes below are
# declared in the same order as its parameters (records text, then keywords),
# and its return value is shown in the single output textbox.
app = gr.Interface(
title='Fuzzy Search',
fn=process_text,
inputs=[
# Multi-line box (10 rows) for the records; per its label, each entry is
# comma separated text giving an id and a description.
gr.Textbox(label='text (comma separated text for id and description)', lines=10),
gr.Textbox(label='search keywords')
],
outputs=[
gr.Textbox(label='predicted id'),
],
# NOTE(review): presumably turns off Gradio's flagging feature — confirm
# against the installed Gradio version.
allow_flagging='never',
# NOTE(review): presumably caps concurrent process_text calls at 20 — confirm.
concurrency_limit=20,
)
# Launched unconditionally: there is no __main__ guard, so this line runs
# whenever the module is executed or imported.
app.launch()