# NOTE(review): removed extraction artifacts (file-size line, commit hash,
# and a dump of line numbers) that were not valid Python source.
import streamlit as st
from streamlit.logger import get_logger
import gematriapy
from timeit import default_timer as timer
import sqlite3
import pandas as pd
import ast
import pymongo
LOGGER = get_logger(__name__)
@st.cache_resource
def get_dfs() -> tuple:
    """Build the lookup DataFrames used by the search page.

    Reads the 'books', 'texts' and 'titles' tables from the local SQLite
    database 'test42.db', merges texts with their book metadata, and derives
    a human-readable Hebrew reference string ('ref_text_long') per row.

    Cached with @st.cache_resource, so the database is only read once per
    Streamlit session.

    Returns:
        (titles_df, texts_df): the titles table as-is, and the merged
        texts table with the extra 'ref_text_long' column.
    """

    def to_daf_long(i):
        # Turn a sequential Talmud page index (1, 2, 3, ...) into the Hebrew
        # daf/amud form (e.g. 'ื ืขืืื ื'). Values outside the range pass
        # through unchanged. Deliberately no isinstance check: the column may
        # hold numpy integers, and the numeric comparison handles them.
        if 0 < i < 999:
            i += 1
            if i % 2 == 0:
                return gematriapy.to_hebrew(i // 2) + ' ืขืืื ื '
            return gematriapy.to_hebrew(i // 2) + ' ืขืืื ื'
        return i

    def gematria(i):
        # Render a positive int as Hebrew numerals with a trailing space;
        # strings pass through; anything else (None, NaN, ...) becomes ''.
        if isinstance(i, int) and 0 < i < 999:
            return gematriapy.to_hebrew(i) + ' '
        return i if isinstance(i, str) else ''

    def read_table(conn, name):
        # Load an entire table into a DataFrame, taking the column names
        # from the cursor description. `name` is always one of our own
        # hard-coded table names, never user input.
        cursor = conn.execute(f"SELECT * FROM {name}")
        df = pd.DataFrame(cursor.fetchall())
        df.columns = [col[0] for col in cursor.description]
        return df

    print('hello from get_dfs..')
    conn = sqlite3.connect('test42.db')
    try:
        books = read_table(conn, 'books')
        texts = read_table(conn, 'texts')
        # 'titles' was originally imported from MongoDB because the sqlite
        # dump lacks the extra Hebrew titles.
        titles = read_table(conn, 'titles')
    finally:
        # The DataFrames are fully materialized, so the connection can be
        # closed (the original leaked it).
        conn.close()

    # 'heSectionNames' is stored as a stringified list ('["Section", ...]');
    # parse it back into a real Python list.
    books['heSectionNames'] = books['heSectionNames'].apply(
        lambda x: ast.literal_eval(x) if x is not None else [''])

    # Join each text row to its book metadata.
    merged = pd.merge(texts, books, how='inner', left_on='bid', right_on='_id')

    # For Talmud-style books (second-to-last section name is 'ืืฃ'), convert
    # the numeric page marks (1, 2, 3, ...) in 'level2' into daf/amud form.
    is_daf = merged['heSectionNames'].apply(lambda x: len(x) > 1 and x[-2] == 'ืืฃ')
    merged.loc[is_daf, 'level2'] = merged.loc[is_daf, 'level2'].map(to_daf_long)

    # Build a full reference string, for example: ืจืฉ"ื ืขื ืืจืืฉืืช ืคืจืง ื ืคืกืืง ื
    merged['ref_text_long'] = merged['heTitle'] + ' ' + \
        merged['heSectionNames'].map(lambda x: x[-4] + ' ' if len(x) > 3 else "") + merged['level4'].map(gematria) + \
        merged['heSectionNames'].map(lambda x: x[-3] + ' ' if len(x) > 2 else "") + merged['level3'].map(gematria) + \
        merged['heSectionNames'].map(lambda x: x[-2] + ' ' if len(x) > 1 else "") + merged['level2'].map(gematria)

    return titles, merged
def find_ref(titles_df, texts_df, input_text, top_k, num_of_results):
    """Fuzzy-match a free-text Hebrew query against the reference corpus.

    Args:
        titles_df: DataFrame with 'he_titles' (Hebrew) and 'title' columns.
        texts_df: DataFrame with 'title' and 'ref_text_long' columns.
        input_text: the user's query string.
        top_k: number of candidate books to narrow the search to; 0 means
            search the whole reference list directly.
        num_of_results: maximum number of results to return.

    Returns:
        A list of result dicts sorted by descending 'ref_score';
        empty for an empty query.
    """
    from rapidfuzz import process as rapidfuzz_process
    print('hello from find_ref..')
    if not input_text:
        # Was `return` (None); an empty list keeps the return type
        # consistent for callers that iterate the results.
        return []
    # Normalize daf separators the user may have typed as ':' or '.'.
    input_text = input_text.replace(':', 'ืขืืื ื').replace('.', 'ืขืืื ื')
    results = []
    books = titles_df['he_titles']
    if top_k == 0:
        # Search only the references database when the user set top_k to 0.
        refs = texts_df['ref_text_long'].unique()
        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
            results.append({'ref': ref, 'ref_score': ref_score})
    else:
        # First narrow the search to the top_k best-matching books...
        for book, book_score, _ in rapidfuzz_process.extract(input_text, books, limit=top_k):
            # ...then fuzzy-search only that book's references.
            book_title = titles_df.loc[titles_df['he_titles'] == book, 'title'].iloc[0]
            refs = texts_df.loc[texts_df['title'] == book_title, 'ref_text_long'].unique()
            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10):
                results.append({'ref': ref, 'ref_score': ref_score,
                                'book': book, 'book_score': book_score})
    # Finally, sort all references by their own score (not the book score).
    results.sort(key=lambda x: x['ref_score'], reverse=True)
    return results[:num_of_results]
def run():
    """Streamlit entry point: render the search page and show fuzzy-search results."""
    st.set_page_config(
        page_title=" ืืืคืืฉ ืืงืืจืืช",
        # NOTE(review): this icon string looks mis-encoded (likely once an
        # emoji) — kept byte-for-byte; confirm the intended character.
        page_icon="๐",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    st.write("# ืืืคืืฉ ืืงืืจืืช ืืืืฆืขืืช ืืจืืง ืืืื ืฉืืืื")
    # The original called get_dfs() twice and discarded the first result;
    # one call suffices (and it is cached via @st.cache_resource anyway).
    titles_df, texts_df = get_dfs()
    user_input = st.text_input('ืืชืื ืืช ืืืงืืจ ืืืืืงืฉ', placeholder='ืืื ืงืื ืืฃ ื ืขืืื ื')
    top_k = st.sidebar.slider('ืืื ืกืคืจืื ืืกืจืืง top_k:', 0, 20, 10)
    num_of_results = st.sidebar.slider('ืืกืคืจ ืืชืืฆืืืช ืฉืืจืฆืื ื ืืืฆืื:', 1, 25, 5)
    if user_input != "":
        time0 = timer()
        results = find_ref(titles_df, texts_df, user_input, top_k, num_of_results)
        st.write(f"finished in {1e3*(timer()-time0):.1f} ms")
        for result in results:
            st.write(result)


if __name__ == "__main__":
    run()
|