File size: 5,592 Bytes
618357a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import streamlit as st
from streamlit.logger import get_logger
import gematriapy

from timeit import default_timer as timer
import sqlite3
import pandas as pd
import ast
import pymongo



# Module-level logger obtained from Streamlit's logging helper, named after this module.
LOGGER = get_logger(__name__)
    
@st.cache_resource
def get_dfs() -> tuple:
    """Load the titles and texts tables used by the reference search.

    Reads the ``books``, ``texts`` and ``titles`` tables from the SQLite file
    ``test42.db``, inner-joins every text row to its book
    (``texts.bid == books._id``) and adds a ``ref_text_long`` column: a
    readable reference string built from the book's ``heTitle``, the trailing
    entries of its ``heSectionNames`` and the ``level4``/``level3``/``level2``
    values rendered as Hebrew numerals.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the returned objects instead of re-reading the database.

    Returns:
        A ``(titles_df, texts_df)`` tuple: the raw ``titles`` table, and the
        merged texts/books table including the ``ref_text_long`` column.
    """

    def to_daf_long(i):
        """Convert a running page index into a leaf number plus a side marker.

        Indices outside the open range (0, 999) are returned unchanged.
        """
        if not 0 < i < 999:
            return i
        i += 1
        # After the +1 shift, even values map to the first side, odd to the second.
        side = ' ืขืžื•ื“ ื ' if i % 2 == 0 else ' ืขืžื•ื“ ื‘'
        return gematriapy.to_hebrew(i // 2) + side

    def gematria(i) -> str:
        """Render an int in (0, 999) as a Hebrew numeral followed by a space.

        Strings pass through untouched (they were already converted by
        ``to_daf_long``); anything else becomes an empty string.
        """
        # bool is a subclass of int; exclude it so only true integers are converted.
        if isinstance(i, int) and not isinstance(i, bool) and 0 < i < 999:
            return gematriapy.to_hebrew(i) + ' '
        return i if isinstance(i, str) else ''

    def read_table(conn, table):
        """Return the whole of ``table`` as a DataFrame with the table's column names."""
        # ``table`` is always one of the fixed names below, never user input.
        cursor = conn.execute(f"SELECT * FROM {table}")
        rows = cursor.fetchall()
        columns = [col[0] for col in cursor.description]
        # Passing ``columns`` at construction also works when the table is empty.
        return pd.DataFrame(rows, columns=columns)

    print('hello from get_dfs..')

    # Read all three tables, then release the connection even if a query fails.
    conn = sqlite3.connect('test42.db')
    try:
        books = read_table(conn, 'books')
        texts = read_table(conn, 'texts')
        titles = read_table(conn, 'titles')
    finally:
        conn.close()

    # heSectionNames is stored as a string such as '["Section","Section"]';
    # turn it into a real list (a missing value becomes ['']).
    books['heSectionNames'] = books['heSectionNames'].apply(
        lambda raw: ast.literal_eval(raw) if raw is not None else ['']
    )

    # Attach the book columns to every text row (the titles table is kept separate).
    merged = pd.merge(texts, books, how='inner', left_on='bid', right_on='_id')

    # Rows whose second-to-last section name is the daf marker get their
    # numeric level2 replaced by a "leaf + side" string.
    is_daf_row = merged['heSectionNames'].map(
        lambda names: len(names) > 1 and names[-2] == 'ื“ืฃ'
    ).astype(bool)
    # Cast to object first so that writing strings into the column is explicit;
    # newer pandas versions warn about (and are scheduled to reject) the
    # implicit upcast of a numeric column on assignment.
    merged['level2'] = merged['level2'].astype(object)
    merged.loc[is_daf_row, 'level2'] = merged.loc[is_daf_row, 'level2'].map(to_daf_long)

    def section_label(offset):
        """Name of the section ``offset`` places from the end (plus a space), or ''."""
        return merged['heSectionNames'].map(
            lambda names: names[-offset] + ' ' if len(names) >= offset else ""
        )

    # Reference text layout: "<title> <section name> <number> <section name> <number> ..."
    merged['ref_text_long'] = (
        merged['heTitle'] + ' '
        + section_label(4) + merged['level4'].map(gematria)
        + section_label(3) + merged['level3'].map(gematria)
        + section_label(2) + merged['level2'].map(gematria)
    )

    return titles, merged
    

def find_ref(titles_df,texts_df,input_text,top_k,num_of_results):
    """Fuzzy-match a free-text citation against the known reference strings.

    Args:
        titles_df: table with a ``he_titles`` column (searchable book names)
            and a ``title`` column (the key used to find the book's texts).
        texts_df: table with ``title`` and ``ref_text_long`` columns.
        input_text: the citation typed by the user. ``:`` and ``.`` are
            expanded to the two page-side markers before matching.
        top_k: how many best-matching book names to scan. ``0`` skips the
            book stage and matches against every reference directly.
        num_of_results: maximum number of results to return.

    Returns:
        A list of at most ``num_of_results`` dicts sorted by ``ref_score``
        (best first). Each dict has ``ref`` and ``ref_score``; when
        ``top_k`` is non-zero it also has ``book`` and ``book_score``.
        An empty list is returned for empty input.
    """
    print('hello from find_ref..')
    # Nothing to search for: return before importing rapidfuzz or touching the tables.
    if not input_text:
        return []

    from rapidfuzz import process as rapidfuzz_process

    input_text = input_text.replace(':','ืขืžื•ื“ ื‘').replace('.','ืขืžื•ื“ ื')

    # search only the references database in case the user set the top_k to 0
    if top_k == 0:
        refs = texts_df['ref_text_long'].unique()
        return [
            {'ref': ref, 'ref_score': ref_score}
            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results)
        ]

    results = []
    scanned_titles = set()
    # first pick the top_k best-matching book names ...
    for book, book_score, _ in rapidfuzz_process.extract(input_text, titles_df['he_titles'], limit=top_k):
        book_title = titles_df.loc[titles_df['he_titles'] == book, 'title'].iloc[0]
        # Several names can point at the same book; scan each book once so
        # its references are not reported repeatedly.
        if book_title in scanned_titles:
            continue
        scanned_titles.add(book_title)

        # ... then match the input against that book's references only.
        refs = texts_df.loc[texts_df['title'] == book_title, 'ref_text_long'].unique()
        # Ask for as many per-book matches as the caller wants overall, so a
        # single book can fill the whole result list.
        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
            results.append({'ref': ref, 'ref_score': ref_score, 'book': book, 'book_score': book_score})

    # rank by how well the reference itself matched, not by the book score
    results.sort(key=lambda item: item['ref_score'], reverse=True)
    return results[:num_of_results]


def run():
    """Render the Streamlit page: a text box, two sidebar sliders and the search results.

    Loads the lookup tables through ``get_dfs`` and, whenever the text box is
    non-empty, runs ``find_ref`` and writes the elapsed time followed by each
    result.
    """
    st.set_page_config(
        page_title=" ื—ื™ืคื•ืฉ ืžืงื•ืจื•ืช",
        page_icon="๐Ÿ“š",
        layout="wide",
        initial_sidebar_state="expanded"    
    )
    # Load the tables once, before the page body is written.
    titles_df,texts_df = get_dfs()
    st.write("# ื—ื™ืคื•ืฉ ืžืงื•ืจื•ืช ื‘ืืžืฆืขื•ืช ืžืจื—ืง ืœื•ื™ื ืฉื˜ื™ื™ืŸ")

    user_input = st.text_input('ื›ืชื•ื‘ ืืช ื”ืžืงื•ืจ ื”ืžื‘ื•ืงืฉ', placeholder='ื‘ื‘ื ืงืžื ื“ืฃ ื‘ ืขืžื•ื“ ื‘') 
    top_k =  st.sidebar.slider('ื›ืžื” ืกืคืจื™ื ืœืกืจื•ืง top_k:',0,20,10)
    num_of_results = st.sidebar.slider('ืžืกืคืจ ื”ืชื•ืฆืื•ืช ืฉื‘ืจืฆื•ื ืš ืœื”ืฆื™ื’:',1,25,5)

    # Nothing typed yet: show only the input widgets.
    if not user_input:
        return

    started = timer()
    results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results)
    elapsed_msg = f"finished in {1e3*(timer()-started):.1f} ms"
    st.write(elapsed_msg)
    for result in results:
        st.write(result)

# Start the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    run()