import streamlit as st
import pdfplumber
import re
import nltk
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from gtts import gTTS
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
# Initialize necessary components
nltk.download('vader_lexicon', quiet=True)  # lexicon required by SentimentIntensityAnalyzer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
qa_pipeline = pipeline("question-answering")
sia = SentimentIntensityAnalyzer()
# Helper functions
def extract_text_from_pdf(file):
    with pdfplumber.open(file) as pdf:
        # extract_text() returns None for image-only pages; join pages with newlines
        # so the last word of one page doesn't fuse with the first word of the next
        text = '\n'.join([(page.extract_text() or '') for page in pdf.pages])
    return text
def clean_text(text):
    # Replace with a space rather than '': the pattern swallows surrounding
    # whitespace, so deleting outright would glue adjacent words together
    text = re.sub(r'\s*Page \d+\s*', ' ', text)  # Remove page numbers
    return text.strip()
def chunk_text(text, max_tokens=1024):
    words = text.split()
    chunks, current_chunk, current_token_count = [], [], 0
    for word in words:
        token_count = len(tokenizer(word)['input_ids'])
        if current_token_count + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_token_count = [], 0
        current_chunk.append(word)
        current_token_count += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
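# Illustrative usage (a sketch, assuming the Qwen tokenizer initialized above):
# splits a long string into pieces that each stay under the model's token budget.
# Note the per-chunk count is approximate, since words are tokenized one at a time.
#   chunks = chunk_text(long_text, max_tokens=512)
#   lengths = [len(tokenizer(c)['input_ids']) for c in chunks]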
def summarize_text_qwen(text, max_length=800):
    input_text = f"summarize: {text}"
    tokens = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    # For a causal LM, max_length would count the prompt tokens too;
    # max_new_tokens bounds only the generated summary
    summary_ids = model.generate(
        tokens["input_ids"], max_new_tokens=max_length, min_new_tokens=200,
        length_penalty=2.0, num_beams=4, early_stopping=True
    )
    # Decoder-only models echo the prompt, so decode only the newly generated tokens
    generated = summary_ids[0][tokens["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)
def summarize_large_document(text, max_length=800):
    chunks = chunk_text(text)
    summaries = [summarize_text_qwen(chunk, max_length=max_length) for chunk in chunks]
    return " ".join(summaries)
def answer_question_with_context(question, context, chunk_size=500):
    chunks = chunk_text(context, max_tokens=chunk_size)
    answers = []
    for chunk in chunks:
        try:
            answers.append(qa_pipeline({'question': question, 'context': chunk})['answer'])
        except Exception:
            # Skip chunks the QA pipeline can't handle instead of aborting the question
            continue
    return " ".join(answers)
# Replace Tortoise-TTS with gTTS for text-to-speech functionality
def text_to_speech(text, language="en"):
    tts = gTTS(text=text, lang=language, slow=False)
    file_name = "output_audio.mp3"
    tts.save(file_name)
    return file_name
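# Variant (sketch, hypothetical helper): deriving the file name from a hash of
# the text keeps repeated or concurrent conversions from overwriting the shared
# output_audio.mp3.
import hashlib

def text_to_speech_unique(text, language="en"):
    file_name = f"tts_{hashlib.md5(text.encode('utf-8')).hexdigest()}.mp3"
    gTTS(text=text, lang=language, slow=False).save(file_name)
    return file_name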
def extract_keywords(text, top_n=10):
    vectorizer = CountVectorizer(stop_words="english")
    word_counts = vectorizer.fit_transform([text])
    keywords = sorted(
        zip(vectorizer.get_feature_names_out(), word_counts.toarray()[0]),
        key=lambda x: x[1], reverse=True
    )[:top_n]
    return [word for word, count in keywords]
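# Variant (sketch): TfidfVectorizer down-weights terms that appear everywhere,
# which often surfaces more distinctive keywords than raw counts. Splitting the
# text into sentences gives the IDF term something to compare across.
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords_tfidf(text, top_n=10):
    vectorizer = TfidfVectorizer(stop_words="english")
    scores = vectorizer.fit_transform(text.split("."))
    totals = scores.sum(axis=0).A1  # summed tf-idf weight per term
    terms = vectorizer.get_feature_names_out()
    ranked = sorted(zip(terms, totals), key=lambda x: x[1], reverse=True)
    return [term for term, _ in ranked[:top_n]]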
def analyze_sentiment(text):
    return sia.polarity_scores(text)
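# VADER's polarity_scores returns {'neg', 'neu', 'pos', 'compound'}; the compound
# score in [-1, 1] is the usual single-number summary. The thresholds below follow
# the common VADER convention; sentiment_label is a hypothetical convenience helper.
def sentiment_label(text):
    compound = sia.polarity_scores(text)['compound']
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"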
# Streamlit App Interface
st.title("Enhanced PDF to Audiobook App")
st.markdown("### Turn documents into interactive audiobooks with advanced features.")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    with st.spinner("Extracting and cleaning PDF content..."):
        raw_text = extract_text_from_pdf(uploaded_file)
        cleaned_text = clean_text(raw_text)
    st.text_area("Extracted Text", cleaned_text[:5000], height=300, help="Displaying first 5000 characters.")
    if st.button("Summarize Document"):
        with st.spinner("Summarizing document..."):
            # Store the summary in session state: a button is only True on the rerun
            # in which it was clicked, so a button nested inside this block would never fire
            st.session_state["summary"] = summarize_large_document(cleaned_text, max_length=800)
    if "summary" in st.session_state:
        st.text_area("Summary", st.session_state["summary"], height=300)
        if st.button("Convert Summary to Audiobook"):
            with st.spinner("Generating audio..."):
                audio_file = text_to_speech(st.session_state["summary"])
            st.audio(audio_file, format="audio/mp3", start_time=0)
    st.markdown("### Ask Questions About the Document")
    question = st.text_input("Your Question:")
    if question:
        with st.spinner("Answering your question..."):
            answer = answer_question_with_context(question, cleaned_text)
        st.write(f"**Answer:** {answer}")
        if st.button("Convert Answer to Audio"):
            with st.spinner("Generating answer audio..."):
                answer_audio_file = text_to_speech(answer)
            st.audio(answer_audio_file, format="audio/mp3", start_time=0)
    st.markdown("### Document Insights")
    if st.checkbox("Extract Keywords"):
        with st.spinner("Extracting keywords..."):
            keywords = extract_keywords(cleaned_text)
        st.write("Keywords:", ", ".join(keywords))
    if st.checkbox("Analyze Sentiment"):
        with st.spinner("Analyzing sentiment..."):
            sentiment = analyze_sentiment(cleaned_text)
        st.write("Sentiment Analysis:", sentiment)