TejaCherukuri committed
Commit 3d74a95 · 1 Parent(s): 6116b5a

prospectai code initial version

.gitignore ADDED
@@ -0,0 +1,2 @@
+ *__pycache__
+ *venv
app.py ADDED
@@ -0,0 +1,68 @@
+ import streamlit as st
+ from src.resume_loader import ResumeLoaderFactory
+ from src.job_extractor import JobExtractor
+ from src.message_writer import MessageWriter
+
+ def main():
+     # Set the page layout to wide mode
+     st.set_page_config(page_title="ProSpectAI: The Smart Way to Reach Out to Recruiters", layout="wide")
+
+     # Title of the app
+     st.title("ProSpectAI: The Smart Way to Reach Out to Recruiters")
+     st.markdown("Tailored job application messages at the click of a button.")
+
+     # Resume Upload Section
+     st.subheader("Upload Your Resume")
+     uploaded_file = st.file_uploader("Upload a PDF Resume", type=["pdf"])
+
+     # Input field for the job URL
+     job_url = st.text_input(
+         "Enter the Job URL",
+         placeholder="https://amazon.jobs/en/jobs/2831138/software-development-engineer-2025-ai-ml"
+     )
+
+     # Button to trigger the flow
+     if st.button("Generate Message"):
+         if job_url:
+             st.info("Processing your request...")
+             # Run the generation pipeline
+             thought, response = generate_message_for_job(job_url, uploaded_file)
+
+             # Create two columns for displaying outputs side by side
+             col1, col2 = st.columns(2)
+
+             # Display Thought Process in the first column
+             with col1:
+                 st.subheader("DeepThink")
+                 st.text_area(" ", value=thought, height=500)
+
+             # Display Generated Message in the second column
+             with col2:
+                 st.subheader("Generated Message")
+                 st.text_area(" ", value=response, height=500)
+         else:
+             st.error("Please provide a valid job URL.")
+
+ def generate_message_for_job(job_url, uploaded_file):
+
+     # Load the resume using the appropriate method (PDF or text)
+     if uploaded_file:
+         resume_loader = ResumeLoaderFactory.create_loader("pdf")
+         resume = resume_loader.load_resume(uploaded_file)
+     else:
+         resume_loader = ResumeLoaderFactory.create_loader("text")
+         resume = resume_loader.load_resume()
+
+     # Extract the key info from the job URL
+     extractor = JobExtractor()
+     job = extractor.parse_job_from_web(job_url)
+     job = extractor.extract_jobdata(job)
+
+     # Invoke the chat model
+     writer = MessageWriter()
+     thought, message = writer.write_message(job, resume)
+
+     return thought, message
+
+ if __name__ == "__main__":
+     main()
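For reference, here is a minimal sketch of driving the same pipeline outside Streamlit, assuming the package layout above and a GROQ_API_KEY in the environment; the script name and job URL are placeholders, not part of this commit.

# run_pipeline.py (hypothetical driver script, not part of this commit)
import os

from src.resume_loader import ResumeLoaderFactory
from src.job_extractor import JobExtractor
from src.message_writer import MessageWriter

def run_once(job_url: str):
    # Fall back to the bundled resources/resume.txt instead of an uploaded PDF
    resume = ResumeLoaderFactory.create_loader("text").load_resume()

    # Scrape, clean, and structure the job posting
    extractor = JobExtractor()
    job = extractor.extract_jobdata(extractor.parse_job_from_web(job_url))

    # Generate the outreach message and the model's reasoning trace
    thought, message = MessageWriter().write_message(job, resume)
    return thought, message

if __name__ == "__main__":
    assert os.getenv("GROQ_API_KEY"), "Set GROQ_API_KEY before running"
    _, message = run_once("https://example.com/jobs/ml-engineer")  # placeholder URL
    print(message)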
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ langchain
+ langchain-community
+ langchain-groq
+ streamlit
+ bs4
+ pypdf
resources/resume.txt ADDED
@@ -0,0 +1,95 @@
+ Name: Teja Krishna Cherukuri
+ Phone: 470-662-7146
+ Address: Atlanta, GA (Open to Relocation)
+
+ Linkedin Url: https://linkedin.com/in/tejacherukuri
+ Github Url: https://github.com/tejacherukuri
+ Portfolio Url: https://tejacherukuri.github.io
+ Google Scholar Url: https://scholar.google.com/citations?user=6S9WmqwAAAAJ&hl=en
+
+ Education
+ College: Georgia State University
+ Location: Atlanta, GA
+ Degree: Master of Science in Computer Science, GPA: 4.21/4.3
+ Duration: Aug 2023 – May 2025
+ Coursework: Deep Learning, Advanced Machine Learning, Computer Vision, Natural Language Processing, Digital
+ Image Processing, Computational Intelligence, Data Science
+
+ Technical Skills
+ Languages: Python, Java, SQL
+ Frameworks: PyTorch, TensorFlow, Keras, Flask, FastAPI, LangChain, Streamlit
+ Libraries: NumPy, Pandas, Scikit-Learn, Matplotlib, OpenCV, NLTK
+ Cloud & DevOps Tools: Git, Docker, Azure ML Studio, Azure AI Services
+
+ Work Experience
+ Role: Graduate Research Assistant
+ Company: Georgia State University (TReNDS Lab)
+ Duration: Sep 2023 – Present
+ Location: Atlanta, GA
+
+ Responsibilities or duties:
+ • Published 5 IEEE papers showcasing the impact of our research methods for advancing AI in Medicine.
+ • Developed and fine-tuned Multi-modal LLMs for medical image captioning using PyTorch, integrating images
+ with diagnostic text, achieving a 13.4% higher BLEU4 over VisionGPT, with just 440M parameters, and
+ reducing inference time to 1.6 sec per image.
+ • Designed various medical image classification models for diagnosing chronic diseases such as schizophrenia, diabetic
+ retinopathy, breast cancer, and colon cancer using specific imaging modalities with 5%–7% lower false negatives.
+ • Enabled high-performance computing for training deep learning models through Slurm job scheduling,
+ optimizing resource allocation and accelerating processing times.
+
+ ---Below are more details about the work I did in the GRA role; they can be used in the context of research positions if needed---
+ More details start here
+ Note: The below is just background info about my work; use it only when there is an absolute need for detailed drafting.
+ • Medical Vision Language Transformer: Pioneered a novel approach for resource-constrained environments, integrating Abstractor & Adaptor to enhance feature focus and fusion, achieving expert-level precision in medical image captioning.
+ • Multi-Modal Medical Transformer: Devised a vision-language model integrating retinal image features & clinical keywords, achieving a 13.5% improvement in BLEU-4 over GPT-2 for accurate diagnostic report generation and improving explainability by visualizing attention to diseased regions.
+ • Guided Context Gating: Innovated a novel attention model to improve context learning in retinal images, boosting accuracy by 2.63% over advanced attention methods & 6.53% over Vision Transformer, enhancing retinopathy diagnosis.
+ • Spatial Sequence Attention Network: Formulated a unique attention mechanism to identify Schizophrenia-specific regions in brain sMRI, improving diagnosis accuracy by 6.52% and clinical interpretability with neuroanatomical insights.
+ • Multi-Modal Imaging Genomics Transformer: Designed a fusion model combining genomics with sMRI & fMRI, bettering Schizophrenia diagnosis accuracy by 2.12% and revealing associated genetic markers.
+ More details end here
+
+ Role: Data Scientist
+ Company: Tata Consultancy Services Limited
+ Duration: Nov 2020 – Aug 2023
+ Location: Hyderabad, TS
+
+ Responsibilities or duties:
+ • Built a customer attrition system based on an ensemble of SVM, Random Forest, and AdaBoost in Python using
+ scikit-learn, improving annual customer retention by 42 basis points.
+ • Led a POC for a dynamic risk-based pricing model, aligning interest rates with borrower risk profiles and market
+ conditions, which reduced underpriced loans by 18% and generated $3M in annual revenue growth.
+ • Implemented REST APIs using FastAPI to surface machine learning models for loan approval and fraud
+ detection, reducing workflow processing time by 30% and preventing potential fraud losses of $25M annually.
+ • Developed and deployed a chatbot using Azure AI Bot Service for handling customer queries in collaboration
+ with the Customer Experience & Personalization team, achieving a 96% CSI and a 3.5 FTE reduction.
+ • Achieved sub-100ms response times for high-volume inference requests by containerizing models with Docker and
+ deploying them on GPU-enabled Azure Container Instances.
+
+ Research Experience and Accomplishments
+ • Published 7 research papers in journals, with 280+ citations and a 5 H-index, pioneering Attention models,
+ Multi-modal learning, Transformers and Large Vision Language Models.
+ • Presented 5 works at reputed conferences, including ISBI 2024, ICIP 2024, ISBI 2025, and ICASSP 2025.
+
+ Projects
+ Name: RetinAI Doctor
+ Link: https://github.com/TejaCherukuri/Guided-Context-Gating
+ Demo: https://huggingface.co/spaces/tejacherukuri/Guided-Context-Gating
+ Technologies used: Python, Streamlit, TensorFlow, OpenCV, Git
+ Duration: Feb 2024 - Jun 2024
+ • Built an AI tool to process retinal scans, predict diabetic retinopathy (DR) severity, and achieved 90.13% accuracy
+ and recall using a novel Guided Context Gating (GCG) attention mechanism.
+ • Enhanced interpretability by generating attention maps that highlight areas of focus, empowering ophthalmologists
+ with insights for early and reliable DR diagnosis.
+
+
+ Additional details about my resume:
+ Job Interests: Data Scientist, Applied Scientist, Research Scientist, AI Engineer, Machine Learning Engineer, Deep Learning Engineer, Research Engineer
+ Open to any work location within the U.S. (comfortable with all modes of working - hybrid, onsite and remote)
+ I can only apply to jobs such as university grad roles, early career postings, associate-level postings, and roles requiring up to 3 years of experience.
+ I have end-to-end machine learning project experience, from requirement gathering and model development to model optimisation, model evaluation and model deployment.
+ Additional Skills:
+ Machine Learning, Linear Regression, Logistic Regression, Classification, PCA, Ensembling.
+ Deep Learning: Neural Networks, Convolutional Neural Networks, Recurrent Neural Networks, LSTMs, Transformers, LLMs, MLLMs, Vision Language Models, Generative AI, VAE.
+ Certification: Deep Learning Specialisation from DeepLearning.AI, Python for Everybody certification from University of Michigan
+
+
+
src/chat_model.py ADDED
@@ -0,0 +1,43 @@
+ from langchain_groq import ChatGroq
+ import os
+
+ class ChatModel:
+     """
+     A wrapper class around the `ChatGroq` model, allowing interaction with the Groq AI model for generating responses.
+
+     Attributes:
+     -----------
+     groq : ChatGroq
+         The instance of the `ChatGroq` class used for generating responses from the Groq model.
+         The model is initialized with specific configuration parameters like temperature, API key, and model type.
+     """
+
+     def __init__(self):
+         """
+         Initializes the ChatModel class and sets up the ChatGroq instance for communication with the Groq model.
+
+         The constructor sets up the model configuration, including:
+         - `temperature`: Controls the randomness of the model's responses. Lower values (e.g., 0) make the output more deterministic.
+         - `api_key`: The API key required to authenticate requests to the Groq model, fetched from the environment variables.
+         - `model`: The specific Groq model to use. In this case, it uses the "deepseek-r1-distill-llama-70b" model.
+
+         The API key is fetched securely from the environment variables, ensuring that sensitive information is not hardcoded.
+
+         Raises:
+         -------
+         EnvironmentError:
+             If the API key is not set in the environment variables, an exception will be raised.
+         """
+
+         api_key = os.getenv("GROQ_API_KEY")
+
+         # Raise an error if the API key is not found in the environment
+         if not api_key:
+             raise EnvironmentError("GROQ_API_KEY environment variable not set.")
+
+         # Initialize the Groq model with the given configuration
+         self.groq = ChatGroq(
+             temperature=0,
+             api_key=api_key,
+             model="deepseek-r1-distill-llama-70b"
+         )
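As a quick sanity check, the wrapper can be exercised on its own. A minimal sketch, assuming GROQ_API_KEY is exported; `invoke` is the standard LangChain runnable call and returns a message object whose `content` holds the generated text.

from src.chat_model import ChatModel

chat = ChatModel()  # raises EnvironmentError if GROQ_API_KEY is missing
reply = chat.groq.invoke("In one sentence, what makes a recruiter outreach email effective?")
print(reply.content)  # note: deepseek-r1 models may prepend reasoning wrapped in <think> tags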
src/job_extractor.py ADDED
@@ -0,0 +1,117 @@
+ from src.chat_model import ChatModel
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain_core.prompts import PromptTemplate
+ from langchain_core.output_parsers import JsonOutputParser
+ from langchain_core.exceptions import OutputParserException
+ from src.utils import clean_text
+
+ class JobExtractor:
+     """
+     A class responsible for extracting job posting details from a given job listing URL. The class uses
+     a prompt-based approach to process scraped text and extract relevant job details.
+
+     Attributes:
+     -----------
+     chat_model : ChatModel
+         An instance of the ChatModel to handle processing and extraction.
+     extract_prompt : PromptTemplate
+         The template used to instruct the model on how to process the scraped text.
+     json_parser : JsonOutputParser
+         The output parser to convert model responses into structured JSON format.
+
+     Methods:
+     --------
+     parse_job_from_web(url: str) -> str:
+         Scrapes and cleans the content from a given job listing URL.
+
+     extract_jobdata(text: str) -> dict:
+         Extracts and parses the job data from the cleaned text into a structured JSON format.
+     """
+
+     def __init__(self):
+         """
+         Initializes the JobExtractor instance with the necessary models, prompt templates,
+         and output parsers.
+         """
+         self.chat_model = ChatModel()
+
+         # Define the template to extract job data using the language model
+         self.extract_prompt = PromptTemplate.from_template(
+             """
+             ### SCRAPED TEXT FROM WEBSITE:
+             {page_data}
+             ### INSTRUCTION:
+             The scraped text is from the careers page of a website.
+             Your job is to extract the job postings and return them in JSON format containing the following keys:
+             `role`, `experience`, `skills`, `responsibilities`, `basic qualifications`,
+             `preferred qualifications`, and `description`.
+             Only return the valid JSON.
+             ### VALID JSON (NO PREAMBLE):
+             """
+         )
+
+         self.json_parser = JsonOutputParser()
+
+     def parse_job_from_web(self, url):
+         """
+         Scrapes and cleans the content from a given job listing URL.
+
+         Parameters:
+         -----------
+         url : str
+             The URL of the job listing page.
+
+         Returns:
+         --------
+         str:
+             The cleaned text content extracted from the job listing page.
+
+         Raises:
+         -------
+         ValueError: If the content could not be loaded or cleaned properly.
+         """
+         try:
+             loader = WebBaseLoader(url)
+             page_data = loader.load().pop().page_content
+             if not page_data:
+                 raise ValueError("The scraped page content is empty.")
+             cleaned_data = clean_text(page_data)
+             print(f"Scraped and cleaned data: {cleaned_data[:200]}...")  # Display a snippet of the data for debugging
+             return cleaned_data
+         except Exception as e:
+             raise ValueError(f"Error scraping or cleaning the content from the URL {url}: {e}")
+
+     def extract_jobdata(self, text):
+         """
+         Extracts and parses the job data from the cleaned text into a structured JSON format.
+
+         Parameters:
+         -----------
+         text : str
+             The cleaned text content from the job listing page.
+
+         Returns:
+         --------
+         dict:
+             A dictionary containing the extracted job information in JSON format.
+
+         Raises:
+         -------
+         OutputParserException: If the extracted response cannot be parsed as valid JSON.
+         ValueError: If the extraction process fails.
+         """
+         try:
+             extract_chain = self.extract_prompt | self.chat_model.groq
+             res = extract_chain.invoke(input={"page_data": text})
+
+             # Try parsing the response content into JSON format
+             job_data = self.json_parser.parse(res.content)
+             print("=====================JSON Job Data==================")
+             print(job_data)
+             return job_data
+
+         except OutputParserException as e:
+             raise OutputParserException("Unable to parse job data as valid JSON. The response might be malformed or incomplete.") from e
+         except Exception as e:
+             raise ValueError(f"An error occurred during job extraction: {e}") from e
+
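A minimal sketch of the extractor on its own, assuming the posting URL is reachable and the model returns a single posting as a dict (the prompt allows multiple postings, so callers may also want to handle a list); the URL below is a placeholder.

from src.job_extractor import JobExtractor

extractor = JobExtractor()
cleaned = extractor.parse_job_from_web("https://example.com/jobs/ml-engineer")  # placeholder URL
job = extractor.extract_jobdata(cleaned)

# Keys come from the prompt (`role`, `skills`, `responsibilities`, ...), but the model
# may omit some, so .get() is safer than direct indexing.
if isinstance(job, list):
    job = job[0]
print(job.get("role"), job.get("skills"))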
src/message_writer.py ADDED
@@ -0,0 +1,123 @@
+ from src.chat_model import ChatModel
+ from langchain_core.prompts import PromptTemplate
+ import re
+
+ class MessageWriter:
+     """
+     A class that generates personalized email messages for recruiters based on job descriptions and resumes.
+     The class utilizes a prompt-based approach to generate the email content in a natural, casual tone,
+     while focusing on the alignment between the job requirements and the applicant's skills and experiences.
+
+     Attributes:
+     -----------
+     chat_model : ChatModel
+         An instance of the ChatModel used to process the job description and resume, and generate the email content.
+     message_prompt : PromptTemplate
+         The template used to instruct the model on how to structure the email content based on the job and resume details.
+
+     Methods:
+     --------
+     write_message(job: str, resume: str) -> tuple:
+         Generates the email message content by processing the job description and resume through the prompt chain,
+         and returns both the extracted thought process and cleaned email content.
+     """
+
+     def __init__(self):
+         """
+         Initializes the MessageWriter instance with the necessary models and prompt template for email generation.
+         """
+         self.chat_model = ChatModel()
+
+         # Define the prompt template for generating recruiter emails
+         self.message_prompt = PromptTemplate.from_template(
+             """
+             ### JOB DESCRIPTION:
+             {job_description}
+
+             ### YOUR RESUME
+             {resume}
+
+             ### INSTRUCTION:
+             You are a very helpful AI agent.
+             Your job is to write an email to the recruiter regarding the job mentioned above, describing the capability of your work,
+             skills and experience (seen in the resume above) in fulfilling their needs.
+
+             Follow the instructions line by line.
+             At each line, stop, read and reason.
+             Finally, consolidate everything into a final email.
+
+             Your instructions start now:
+             1. You should sound very natural.
+             2. Use only the information from your resume provided above. DO NOT HALLUCINATE.
+             3. Think from the recruiter's perspective and what they love to see.
+             4. Identify the top responsibilities, skills and qualifications from the job description.
+             5. Identify the skills, relevant work experience and research experience points from the resume that match the extracted details of the job.
+             6. Once identified, write how these skills and experience fulfill the responsibilities of the new role. Do not be generic; rather, write which specific experiences prove your claim. Use quantification when necessary, from your work experience points.
+             7. Highlight how you will fit into their team with the qualifications you possess. Make it sound natural.
+             8. AVOID "I am so excited to apply" and "my skills align well".
+             9. AVOID "Thank you for considering my application, I look forward to discussing how my skills align with the organizational goals". Instead, convey the same information in a creative way.
+             10. DO NOT make it too professional; keep a casual tone.
+             11. Make it CONCISE and limit it to 8-10 lines. Divide into paragraphs.
+             12. Embed the portfolio and other necessary links within the email where relevant. Add the full link, not a placeholder.
+             13. The signature should be "Best,\n Name of the person from the resume". No links here.
+             14. AVOID PREAMBLE. For example, AVOID "Here's an email, etc." at the start of the generation.
+
+             ### EMAIL:
+             """
+         )
+
+     def write_message(self, job, resume):
+         """
+         Generates a personalized email message for the recruiter based on the provided job description and resume.
+
+         Parameters:
+         -----------
+         job : str
+             The job description from the job listing.
+         resume : str
+             The resume content of the applicant.
+
+         Returns:
+         --------
+         tuple:
+             A tuple containing:
+             - thought_process (str): The reasoning and thought process of the model.
+             - cleaned_response (str): The final generated email content, cleaned of any extra elements.
+
+         Raises:
+         -------
+         ValueError: If there is an error in invoking the model chain or processing the response.
+         """
+         try:
+             # Create the chain of prompt and model invocation
+             message_chain = self.message_prompt | self.chat_model.groq
+
+             # Invoke the model to generate the email content
+             res = message_chain.invoke(input={"job_description": job, "resume": resume})
+
+             # Extract the thought process (if any) enclosed in <think> tags
+             think_content = re.findall(r'<think>(.*?)</think>', res.content, flags=re.DOTALL)
+             cleaned_response = re.sub(r'<think>.*?</think>', '', res.content, flags=re.DOTALL)
+
+             # Check if content was found
+             if think_content:
+                 # Get the first element from the list (since re.findall returns a list)
+                 extracted_text = think_content[0]
+                 extracted_text = extracted_text.strip()  # Strip leading/trailing whitespace and newlines
+
+                 # Print the well-formatted text
+                 print("======Thought Process======")
+                 print(extracted_text)
+                 think_content = extracted_text
+             else:
+                 print("No content found between <think> and </think> tags.")
+                 think_content = ""  # Fall back to an empty string so the return type stays consistent
+
+             print("======Cleaned Response======")
+             print(cleaned_response)
+
+             # Return the extracted thought process and the cleaned email content
+             return think_content, cleaned_response.strip()
+
+         except Exception as e:
+             # Raise a ValueError with additional context if there was an error in processing
+             raise ValueError(f"An error occurred while generating the email: {e}") from e
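A minimal sketch of calling the writer directly with already-extracted job data; the job and resume are simply interpolated into the prompt, so a dict and a LangChain Document work as well as plain strings. The sample job dict below is a toy stand-in, not output from this repo.

from src.resume_loader import ResumeLoaderFactory
from src.message_writer import MessageWriter

resume = ResumeLoaderFactory.create_loader("text").load_resume()
job = {"role": "Machine Learning Engineer", "skills": ["Python", "PyTorch"]}  # toy stand-in for JobExtractor output

thought, email = MessageWriter().write_message(job, resume)
print(email)         # cleaned email body
print(len(thought))  # zero when the model emitted no <think> block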
src/resume_loader.py ADDED
@@ -0,0 +1,166 @@
+ import os
+ import tempfile
+ from abc import ABC, abstractmethod
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader
+
+ class ResumeLoader(ABC):
+     """
+     Abstract Base Class for loading resumes. All resume loader classes (TextResumeLoader, PdfResumeLoader)
+     should inherit from this class and implement the `load_resume` method.
+
+     Methods:
+     --------
+     load_resume() -> object:
+         Abstract method to load a resume. Must be implemented by subclasses.
+     """
+
+     @abstractmethod
+     def load_resume(self):
+         """
+         Abstract method to load a resume. Must be implemented by subclasses.
+
+         Returns:
+         --------
+         object:
+             The content of the resume as an object.
+         """
+         pass
+
+ class TextResumeLoader(ResumeLoader):
+     """
+     A class to load resumes from a text file.
+
+     Methods:
+     --------
+     load_resume() -> object:
+         Loads the resume from a predefined text file located in the "resources" directory.
+         Raises a FileNotFoundError if the file is not found.
+
+     Raises:
+     -------
+     FileNotFoundError: If the predefined resume text file is not found.
+     """
+
+     def __init__(self):
+         """
+         Initializes the TextResumeLoader instance and sets the path to the resume text file.
+         """
+         self.current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+         self.file_path = os.path.join(self.current_dir, "resources", "resume.txt")
+
+     def load_resume(self):
+         """
+         Loads the resume from a predefined text file.
+
+         Returns:
+         --------
+         object:
+             The resume content as an object containing the text.
+
+         Raises:
+         -------
+         FileNotFoundError:
+             If the resume text file cannot be found at the specified path.
+         """
+         if not os.path.exists(self.file_path):
+             raise FileNotFoundError(f"File {self.file_path} does not exist. Please check the path.")
+
+         text_loader = TextLoader(self.file_path)
+         resume = text_loader.load()  # Directly load the full text without chunking
+         print(resume[0].page_content)
+
+         return resume[0]
+
+ class PdfResumeLoader(ResumeLoader):
+     """
+     A class to load resumes from PDF files.
+
+     Methods:
+     --------
+     load_resume(file) -> object:
+         Loads a resume from an uploaded PDF file, saving it temporarily before processing.
+         Cleans up the temporary file after processing.
+
+     Raises:
+     -------
+     Exception:
+         If an error occurs during the loading or extraction of the PDF content.
+     """
+
+     def load_resume(self, file=None):
+         """
+         Loads the resume from an uploaded PDF file by saving it as a temporary file and processing it.
+
+         Parameters:
+         -----------
+         file : file-like object
+             The uploaded PDF file to be processed.
+
+         Returns:
+         --------
+         object:
+             The resume content as an object extracted from the PDF file.
+
+         Raises:
+         -------
+         Exception:
+             If an error occurs during the loading or extraction of the PDF content.
+         """
+         if file is None:
+             raise ValueError("PDF file must be provided for PdfResumeLoader.")
+
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+             temp_file.write(file.getvalue())  # Save uploaded file
+             temp_file_path = temp_file.name  # Get file path
+
+         try:
+             # Load PDF using the temporary file path
+             pdf_loader = PyPDFLoader(temp_file_path)
+             resume = pdf_loader.load()  # Extract text from PDF
+
+             print(resume[0].page_content)  # Debug: Print first page content
+             return resume[0]
+
+         except Exception as e:
+             raise Exception(f"Error loading PDF: {e}")
+
+         finally:
+             # Ensure the file is deleted after processing
+             os.remove(temp_file_path)
+
+ class ResumeLoaderFactory:
+     """
+     A Factory class responsible for creating appropriate ResumeLoader instances based on the input type (text or PDF).
+
+     Methods:
+     --------
+     create_loader(file_type: str) -> ResumeLoader:
+         Returns an instance of ResumeLoader based on the specified file type (text or pdf).
+     """
+
+     @staticmethod
+     def create_loader(file_type: str) -> ResumeLoader:
+         """
+         Creates a ResumeLoader instance based on the file type provided.
+
+         Parameters:
+         -----------
+         file_type : str
+             The type of file to be processed ("text" or "pdf").
+
+         Returns:
+         --------
+         ResumeLoader:
+             The appropriate ResumeLoader instance for the specified file type.
+
+         Raises:
+         -------
+         ValueError:
+             If the file type is not recognized (must be either "text" or "pdf").
+         """
+         if file_type == "text":
+             return TextResumeLoader()
+         elif file_type == "pdf":
+             return PdfResumeLoader()
+         else:
+             raise ValueError("Invalid file type. Accepted values are 'text' or 'pdf'.")
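A minimal sketch of the factory in use; the PDF branch only needs an object exposing getvalue(), e.g. a Streamlit UploadedFile or an io.BytesIO (the local PDF path below is hypothetical).

import io
from src.resume_loader import ResumeLoaderFactory

# Text branch: reads the bundled resources/resume.txt
text_resume = ResumeLoaderFactory.create_loader("text").load_resume()

# PDF branch: wrap raw bytes in BytesIO so getvalue() is available
with open("my_resume.pdf", "rb") as f:  # hypothetical local file
    pdf_resume = ResumeLoaderFactory.create_loader("pdf").load_resume(io.BytesIO(f.read()))

# Both loaders return a LangChain Document; page_content holds the text
print(len(text_resume.page_content), len(pdf_resume.page_content))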
src/utils.py ADDED
@@ -0,0 +1,50 @@
+ import re
+
+ def clean_text(text: str) -> str:
+     """
+     Cleans and preprocesses the input text by removing unwanted elements such as HTML tags, URLs,
+     special characters, and extra whitespace. This function is useful for preparing text data for
+     further processing or analysis.
+
+     Parameters:
+     -----------
+     text : str
+         The input text to be cleaned. This text may contain HTML tags, URLs, special characters,
+         multiple spaces, and unnecessary whitespace.
+
+     Returns:
+     --------
+     str
+         A cleaned version of the input text with the following modifications:
+         - HTML tags removed
+         - URLs removed
+         - Special characters (other than letters, digits, and spaces) removed
+         - Multiple consecutive spaces replaced with a single space
+         - Leading and trailing whitespace removed
+         - Extra spaces between words reduced to a single space
+
+     Example:
+     --------
+     >>> clean_text("<p>Hello <b>World</b>! Visit http://example.com for more info.</p>")
+     'Hello World Visit for more info'
+     """
+
+     # Remove HTML tags
+     text = re.sub(r'<[^>]*?>', '', text)
+
+     # Remove URLs
+     text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
+
+     # Remove special characters (anything that is not a letter, number, or space)
+     text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
+
+     # Replace multiple spaces with a single space
+     text = re.sub(r'\s{2,}', ' ', text)
+
+     # Trim leading and trailing whitespace
+     text = text.strip()
+
+     # Remove extra whitespace between words (in case of multiple spaces)
+     text = ' '.join(text.split())
+
+     return text