MOHAMMED-N committed
Commit 33c81c1 · verified · Parent: 0f7fcd4

Update app.py

Files changed (1): app.py (+24, -27)
app.py CHANGED
@@ -1,21 +1,19 @@
 import streamlit as st
 import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-
-# --- LANGCHAIN IMPORTS ---
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_experimental.text_splitter import SemanticChunker
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 
-# 1) SET UP PAGE
+# --- 1) Page setup ---
 st.title("💬 المحادثة التفاعلية - إدارة البيانات وحماية البيانات الشخصية")
 local_file = "Policies001.pdf"
 
 index_folder = "faiss_index"
 
-# Inject custom CSS for right-to-left text
+# Add custom CSS to support right-to-left text
 st.markdown(
     """
     <style>
@@ -28,15 +26,17 @@ st.markdown(
     unsafe_allow_html=True
 )
 
-# 2) LOAD OR BUILD VECTORSTORE
+# --- 2) Load or build the FAISS database ---
 embeddings = HuggingFaceEmbeddings(
     model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix",
     model_kwargs={"trust_remote_code": True}
 )
 
 if os.path.exists(index_folder):
+    # Load the database if it already exists
     vectorstore = FAISS.load_local(index_folder, embeddings, allow_dangerous_deserialization=True)
 else:
+    # Load the PDF and split the text
     loader = PyPDFLoader(local_file)
     documents = loader.load()
 
@@ -47,68 +47,65 @@ else:
     )
     chunked_docs = text_splitter.split_documents(documents)
 
+    # Build the FAISS database
     vectorstore = FAISS.from_documents(chunked_docs, embeddings)
     vectorstore.save_local(index_folder)
 
-# 3) CREATE RETRIEVER
+# --- 3) Retriever setup ---
 retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
 
-# 4) SET UP "COMMAND-R7B-ARABIC" AS LLM
-# Authenticate and load the model
-model_name = "CohereForAI/c4ai-command-r7b-arabic-02-2025"  # Replace with the actual Hugging Face model ID
-
-# Set Hugging Face token securely
-hf_token = os.getenv("HF_TOKEN")  # Ensure you set your token as an environment variable in Hugging Face Spaces
+# --- 4) Text model setup ---
+model_name = "CohereForAI/c4ai-command-r7b-arabic-02-2025"  # model name
 
+# Make sure a Hugging Face token is available
+hf_token = os.getenv("HF_TOKEN")
 if hf_token is None:
     st.error("Hugging Face token not found. Please set the 'HF_TOKEN' environment variable.")
     st.stop()
 
-# Load tokenizer and model using the token
+# Load the model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
 model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=hf_token)
 
-# Hugging Face pipeline for text generation
+# Set up a pipeline for text generation
 qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
 
-# Memory object to store conversation
+# --- 5) Memory setup ---
 memory = ConversationBufferMemory(
-    memory_key="chat_history",  # key used internally by the chain
-    return_messages=True  # ensures we get the entire message history
+    memory_key="chat_history",
+    return_messages=True
 )
 
-# 5) MANAGE SESSION STATE FOR UI CHAT
+# --- 6) Manage the user messages ---
 if "messages" not in st.session_state:
     st.session_state["messages"] = [
         {"role": "assistant", "content": "👋 مرحبًا! اسألني أي شيء عن إدارة البيانات وحماية البيانات الشخصية!"}
     ]
 
-# Display existing messages in chat format
+# Display the current messages
 for msg in st.session_state["messages"]:
     with st.chat_message(msg["role"]):
-        # Apply the "rtl" class to style Arabic text correctly
         st.markdown(f'<div class="rtl">{msg["content"]}</div>', unsafe_allow_html=True)
 
-# 6) CHAT INPUT
+# --- 7) User input ---
 user_input = st.chat_input("اكتب سؤالك هنا")
 
-# 7) PROCESS NEW USER MESSAGE
+# --- 8) Process the user message ---
 if user_input:
-    # a) Display user message in UI
+    # Display the user message
     st.session_state["messages"].append({"role": "user", "content": user_input})
     with st.chat_message("user"):
         st.markdown(f'<div class="rtl">{user_input}</div>', unsafe_allow_html=True)
 
-    # b) Run pipeline to generate a response
-    # Combine retriever results and user input for context-aware answering
+    # Retrieve the relevant documents
     retrieved_docs = retriever.get_relevant_documents(user_input)
     context = "\n".join([doc.page_content for doc in retrieved_docs])
     full_input = f"السياق:\n{context}\n\nالسؤال:\n{user_input}"
 
-    # Generate answer using the pipeline
+    # Generate the answer using the model
     response = qa_pipeline(full_input, max_length=500, num_return_sequences=1)[0]["generated_text"]
 
-    # c) Display assistant response
+    # Display the answer
     st.session_state["messages"].append({"role": "assistant", "content": response})
     with st.chat_message("assistant"):
         st.markdown(f'<div class="rtl">{response}</div>', unsafe_allow_html=True)
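A few notes on the code as committed. First, recent transformers releases deprecate the use_auth_token argument to from_pretrained in favor of token. A minimal sketch of the equivalent loading step, assuming a current transformers version and the same model ID and environment variable used above:

    import os
    from transformers import AutoModelForCausalLM, AutoTokenizer

    hf_token = os.getenv("HF_TOKEN")
    model_name = "CohereForAI/c4ai-command-r7b-arabic-02-2025"

    # token= replaces the deprecated use_auth_token= argument
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)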
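Second, the pipeline is pinned to device=0, which raises an error on a CPU-only Space. A sketch that picks the device at runtime, assuming torch is installed (the model already requires it):

    import torch

    # device=0 is the first GPU; device=-1 tells the pipeline to run on CPU
    device = 0 if torch.cuda.is_available() else -1
    qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)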
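Third, LangChain has deprecated retriever.get_relevant_documents in favor of the runnable interface. Assuming LangChain 0.2 or later, the retrieval step would read:

    # invoke() replaces the deprecated get_relevant_documents()
    retrieved_docs = retriever.invoke(user_input)
    context = "\n".join(doc.page_content for doc in retrieved_docs)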
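Fourth, max_length=500 bounds prompt plus completion together, so a long retrieved context can leave no room for the answer, and a text-generation pipeline also echoes the prompt back by default. A sketch that caps only the generated tokens and returns just the completion (both are standard pipeline arguments):

    response = qa_pipeline(
        full_input,
        max_new_tokens=500,      # cap the generated tokens, not prompt + output
        return_full_text=False,  # return only the completion, without the prompt
        num_return_sequences=1,
    )[0]["generated_text"]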
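Finally, the ConversationBufferMemory object is created but never consulted, so each question is answered from the retrieved context alone. One way to start folding the history in, shown only as a hypothetical sketch on top of the existing objects:

    # Hypothetical wiring: record each exchange so later prompts can include it
    memory.save_context({"input": user_input}, {"output": response})
    chat_history = memory.load_memory_variables({})["chat_history"]  # list of messages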