TejaCherukuri committed
Commit 3d74a95 · 1 Parent(s): 6116b5a

prospectai code initial version

.gitignore ADDED
@@ -0,0 +1,2 @@
+ *__pycache__
+ *venv
app.py ADDED
@@ -0,0 +1,68 @@
+ import streamlit as st
+ from src.resume_loader import ResumeLoaderFactory
+ from src.job_extractor import JobExtractor
+ from src.message_writer import MessageWriter
+
+ def main():
+     # Set the page layout to wide mode
+     st.set_page_config(page_title="ProSpectAI: The Smart Way to Reach Out to Recruiters", layout="wide")
+
+     # Title of the app
+     st.title("ProSpectAI: The Smart Way to Reach Out to Recruiters")
+     st.markdown("Tailored job application messages at the click of a button.")
+
+     # Resume Upload Section
+     st.subheader("Upload Your Resume")
+     uploaded_file = st.file_uploader("Upload a PDF Resume", type=["pdf"])
+
+     # Input field for the job URL
+     job_url = st.text_input(
+         "Enter the Job URL",
+         placeholder="https://amazon.jobs/en/jobs/2831138/software-development-engineer-2025-ai-ml"
+     )
+
+     # Button to trigger the flow
+     if st.button("Generate Message"):
+         if job_url:
+             st.info("Processing your request...")
+             # Run the generation pipeline
+             thought, response = generate_message_for_job(job_url, uploaded_file)
+
+             # Create two columns for displaying outputs side by side
+             col1, col2 = st.columns(2)
+
+             # Display Thought Process in the first column
+             with col1:
+                 st.subheader("DeepThink")
+                 st.text_area(" ", value=thought, height=500)
+
+             # Display Generated Message in the second column
+             with col2:
+                 st.subheader("Generated Message")
+                 st.text_area(" ", value=response, height=500)
+         else:
+             st.error("Please provide a valid job URL.")
+
+ def generate_message_for_job(job_url, uploaded_file):
+
+     # Load the resume using the appropriate method (PDF or text)
+     if uploaded_file:
+         resume_loader = ResumeLoaderFactory.create_loader("pdf")
+         resume = resume_loader.load_resume(uploaded_file)
+     else:
+         resume_loader = ResumeLoaderFactory.create_loader("text")
+         resume = resume_loader.load_resume()
+
+     # Extract the key info from the job URL
+     extractor = JobExtractor()
+     job = extractor.parse_job_from_web(job_url)
+     job = extractor.extract_jobdata(job)
+
+     # Invoke the chat model
+     writer = MessageWriter()
+     thought, message = writer.write_message(job, resume)
+
+     return thought, message
+
+ if __name__ == "__main__":
+     main()
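For reference, here is a minimal sketch of driving the same pipeline outside Streamlit, assuming the package layout above and a GROQ_API_KEY in the environment; the script name and job URL are placeholders, not part of this commit.

# run_pipeline.py (hypothetical driver script, not part of this commit)
import os

from src.resume_loader import ResumeLoaderFactory
from src.job_extractor import JobExtractor
from src.message_writer import MessageWriter

def run_once(job_url: str):
    # Fall back to the bundled resources/resume.txt instead of an uploaded PDF
    resume = ResumeLoaderFactory.create_loader("text").load_resume()

    # Scrape, clean, and structure the job posting
    extractor = JobExtractor()
    job = extractor.extract_jobdata(extractor.parse_job_from_web(job_url))

    # Generate the outreach message and the model's reasoning trace
    thought, message = MessageWriter().write_message(job, resume)
    return thought, message

if __name__ == "__main__":
    assert os.getenv("GROQ_API_KEY"), "Set GROQ_API_KEY before running"
    _, message = run_once("https://example.com/jobs/ml-engineer")  # placeholder URL
    print(message)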
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ langchain
+ langchain-community
+ langchain-groq
+ streamlit
+ bs4
+ pypdf
resources/resume.txt ADDED
@@ -0,0 +1,95 @@
+ Name: Teja Krishna Cherukuri
+ Phone: 470-662-7146
+ Address: Atlanta, GA (Open to Relocation)
+
+ Linkedin Url: https://linkedin.com/in/tejacherukuri
+ Github Url: https://github.com/tejacherukuri
+ Portfolio Url: https://tejacherukuri.github.io
+ Google Scholar Url: https://scholar.google.com/citations?user=6S9WmqwAAAAJ&hl=en
+
+ Education
+ College: Georgia State University
+ Location: Atlanta, GA
+ Degree: Master of Science in Computer Science, GPA: 4.21/4.3
+ Duration: Aug 2023 – May 2025
+ Coursework: Deep Learning, Advanced Machine Learning, Computer Vision, Natural Language Processing, Digital
+ Image Processing, Computational Intelligence, Data Science
+
+ Technical Skills
+ Languages: Python, Java, SQL
+ Frameworks: PyTorch, TensorFlow, Keras, Flask, FastAPI, LangChain, Streamlit
+ Libraries: NumPy, Pandas, Scikit-Learn, Matplotlib, OpenCV, NLTK
+ Cloud & DevOps Tools: Git, Docker, Azure ML Studio, Azure AI Services
+
+ Work Experience
+ Role: Graduate Research Assistant
+ Company: Georgia State University (TReNDS Lab)
+ Duration: Sep 2023 – Present
+ Location: Atlanta, GA
+
+ Responsibilities or duties:
+ • Published 5 IEEE papers showcasing the impact of our research methods for advancing AI in Medicine.
+ • Developed and fine-tuned Multi-modal LLMs for medical image captioning using PyTorch, integrating images
+ with diagnostic text, achieving a 13.4% higher BLEU4 over VisionGPT, with just 440M parameters, and
+ reducing inference time to 1.6 sec per image.
+ • Designed various medical image classification models for diagnosing chronic diseases such as schizophrenia, diabetic
+ retinopathy, breast cancer, and colon cancer using specific imaging modalities with 5%–7% lower false negatives.
+ • Enabled high-performance computing for training deep learning models through Slurm job scheduling,
+ optimizing resource allocation and accelerating processing times.
+
+ ---Below are more details about the work I did in the GRA role; they can be used in the context of research positions if needed---
+ More details start here
+ Note: The below is just background info about my work; use it only when there is an absolute need for detailed drafting.
+ • Medical Vision Language Transformer: Pioneered a novel approach for resource-constrained environments, integrating Abstractor & Adaptor to enhance feature focus and fusion, achieving expert-level precision in medical image captioning.
+ • Multi-Modal Medical Transformer: Devised a vision-language model integrating retinal image features & clinical keywords, achieving a 13.5% improvement in BLEU-4 over GPT-2 for accurate diagnostic report generation and improving explainability by visualizing attention to diseased regions.
+ • Guided Context Gating: Innovated a novel attention model to improve context learning in retinal images, boosting accuracy by 2.63% over advanced attention methods & 6.53% over Vision Transformer, enhancing retinopathy diagnosis.
+ • Spatial Sequence Attention Network: Formulated a unique attention mechanism to identify Schizophrenia-specific regions in brain sMRI, improving diagnosis accuracy by 6.52% and clinical interpretability with neuroanatomical insights.
+ • Multi-Modal Imaging Genomics Transformer: Designed a fusion model combining genomics with sMRI & fMRI, bettering Schizophrenia diagnosis accuracy by 2.12% and revealing associated genetic markers.
+ More details end here
+
+ Role: Data Scientist
+ Company: Tata Consultancy Services Limited
+ Duration: Nov 2020 – Aug 2023
+ Location: Hyderabad, TS
+
+ Responsibilities or duties:
+ • Built a customer attrition system based on an ensemble of SVM, Random Forest, and AdaBoost in Python using
+ scikit-learn, improving annual customer retention by 42 basis points.
+ • Led a POC for a dynamic risk-based pricing model, aligning interest rates with borrower risk profiles and market
+ conditions, which reduced underpriced loans by 18% and generated $3M in annual revenue growth.
+ • Implemented REST APIs using FastAPI to surface machine learning models for loan approval and fraud
+ detection, reducing workflow processing time by 30% and preventing potential fraud losses of $25M annually.
+ • Developed and deployed a chatbot using Azure AI Bot Service for handling customer queries in collaboration
+ with the Customer Experience & Personalization team, achieving a 96% CSI and a 3.5 FTE reduction.
+ • Achieved sub-100ms response times for high-volume inference requests by containerizing models with Docker and
+ deploying them on GPU-enabled Azure Container Instances.
+
+ Research Experience and Accomplishments
+ • Published 7 research papers in journals, with 280+ citations and a 5 H-index, pioneering Attention models,
+ Multi-modal learning, Transformers and Large Vision Language Models.
+ • Presented 5 works at reputed conferences, including ISBI 2024, ICIP 2024, ISBI 2025, and ICASSP 2025.
+
+ Projects
+ Name: RetinAI Doctor
+ Link: https://github.com/TejaCherukuri/Guided-Context-Gating
+ Demo: https://huggingface.co/spaces/tejacherukuri/Guided-Context-Gating
+ Technologies used: Python, Streamlit, TensorFlow, OpenCV, Git
+ Duration: Feb 2024 - Jun 2024
+ • Built an AI tool to process retinal scans, predict diabetic retinopathy (DR) severity, and achieved 90.13% accuracy
+ and recall using a novel Guided Context Gating (GCG) attention mechanism.
+ • Enhanced interpretability by generating attention maps that highlight areas of focus, empowering ophthalmologists
+ with insights for early and reliable DR diagnosis.
+
+
+ Additional details about my resume:
+ Job Interests: Data Scientist, Applied Scientist, Research Scientist, AI Engineer, Machine Learning Engineer, Deep Learning Engineer, Research Engineer
+ Open to any work location within the U.S. (comfortable with all modes of working - hybrid, onsite and remote)
+ I can only apply to jobs such as university grad roles, early career postings, associate-level postings, and roles requiring up to 3 years of experience.
+ I have end-to-end machine learning project experience, from requirement gathering and model development to model optimisation, model evaluation and model deployment.
+ Additional Skills:
+ Machine Learning, Linear Regression, Logistic Regression, Classification, PCA, Ensembling.
+ Deep Learning: Neural Networks, Convolutional Neural Networks, Recurrent Neural Networks, LSTMs, Transformers, LLMs, MLLMs, Vision Language Models, Generative AI, VAE.
+ Certification: Deep Learning Specialisation from DeepLearning.AI, Python for Everybody certification from University of Michigan
+
+
+
src/chat_model.py ADDED
@@ -0,0 +1,43 @@
+ from langchain_groq import ChatGroq
+ import os
+
+ class ChatModel:
+     """
+     A wrapper class around the `ChatGroq` model, allowing interaction with the Groq AI model for generating responses.
+
+     Attributes:
+     -----------
+     groq : ChatGroq
+         The instance of the `ChatGroq` class used for generating responses from the Groq model.
+         The model is initialized with specific configuration parameters like temperature, API key, and model type.
+     """
+
+     def __init__(self):
+         """
+         Initializes the ChatModel class and sets up the ChatGroq instance for communication with the Groq model.
+
+         The constructor sets up the model configuration, including:
+         - `temperature`: Controls the randomness of the model's responses. Lower values (e.g., 0) make the output more deterministic.
+         - `api_key`: The API key required to authenticate requests to the Groq model, fetched from the environment variables.
+         - `model`: The specific Groq model to use. In this case, it uses the "deepseek-r1-distill-llama-70b" model.
+
+         The API key is fetched securely from the environment variables, ensuring that sensitive information is not hardcoded.
+
+         Raises:
+         -------
+         EnvironmentError:
+             If the API key is not set in the environment variables, an exception will be raised.
+         """
+
+         api_key = os.getenv("GROQ_API_KEY")
+
+         # Raise an error if the API key is not found in the environment
+         if not api_key:
+             raise EnvironmentError("GROQ_API_KEY environment variable not set.")
+
+         # Initialize the Groq model with the given configuration
+         self.groq = ChatGroq(
+             temperature=0,
+             api_key=api_key,
+             model="deepseek-r1-distill-llama-70b"
+         )
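As a quick sanity check, the wrapper can be exercised on its own. A minimal sketch, assuming GROQ_API_KEY is exported; `invoke` is the standard LangChain runnable call and returns a message object whose `content` holds the generated text.

from src.chat_model import ChatModel

chat = ChatModel()  # raises EnvironmentError if GROQ_API_KEY is missing
reply = chat.groq.invoke("In one sentence, what makes a recruiter outreach email effective?")
print(reply.content)  # note: deepseek-r1 models may prepend reasoning wrapped in <think> tags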
src/job_extractor.py ADDED
@@ -0,0 +1,117 @@
+ from src.chat_model import ChatModel
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain_core.prompts import PromptTemplate
+ from langchain_core.output_parsers import JsonOutputParser
+ from langchain_core.exceptions import OutputParserException
+ from src.utils import clean_text
+
+ class JobExtractor:
+     """
+     A class responsible for extracting job posting details from a given job listing URL. The class uses
+     a prompt-based approach to process scraped text and extract relevant job details.
+
+     Attributes:
+     -----------
+     chat_model : ChatModel
+         An instance of the ChatModel to handle processing and extraction.
+     extract_prompt : PromptTemplate
+         The template used to instruct the model on how to process the scraped text.
+     json_parser : JsonOutputParser
+         The output parser to convert model responses into structured JSON format.
+
+     Methods:
+     --------
+     parse_job_from_web(url: str) -> str:
+         Scrapes and cleans the content from a given job listing URL.
+
+     extract_jobdata(text: str) -> dict:
+         Extracts and parses the job data from the cleaned text into a structured JSON format.
+     """
+
+     def __init__(self):
+         """
+         Initializes the JobExtractor instance with the necessary models, prompt templates,
+         and output parsers.
+         """
+         self.chat_model = ChatModel()
+
+         # Define the template to extract job data using the language model
+         self.extract_prompt = PromptTemplate.from_template(
+             """
+             ### SCRAPED TEXT FROM WEBSITE:
+             {page_data}
+             ### INSTRUCTION:
+             The scraped text is from the careers page of a website.
+             Your job is to extract the job postings and return them in JSON format containing the following keys:
+             `role`, `experience`, `skills`, `responsibilities`, `basic qualifications`,
+             `preferred qualifications`, and `description`.
+             Only return the valid JSON.
+             ### VALID JSON (NO PREAMBLE):
+             """
+         )
+
+         self.json_parser = JsonOutputParser()
+
+     def parse_job_from_web(self, url):
+         """
+         Scrapes and cleans the content from a given job listing URL.
+
+         Parameters:
+         -----------
+         url : str
+             The URL of the job listing page.
+
+         Returns:
+         --------
+         str:
+             The cleaned text content extracted from the job listing page.
+
+         Raises:
+         -------
+         ValueError: If the content could not be loaded or cleaned properly.
+         """
+         try:
+             loader = WebBaseLoader(url)
+             page_data = loader.load().pop().page_content
+             if not page_data:
+                 raise ValueError("The scraped page content is empty.")
+             cleaned_data = clean_text(page_data)
+             print(f"Scraped and cleaned data: {cleaned_data[:200]}...")  # Display a snippet of the data for debugging
+             return cleaned_data
+         except Exception as e:
+             raise ValueError(f"Error scraping or cleaning the content from the URL {url}: {e}")
+
+     def extract_jobdata(self, text):
+         """
+         Extracts and parses the job data from the cleaned text into a structured JSON format.
+
+         Parameters:
+         -----------
+         text : str
+             The cleaned text content from the job listing page.
+
+         Returns:
+         --------
+         dict:
+             A dictionary containing the extracted job information in JSON format.
+
+         Raises:
+         -------
+         OutputParserException: If the extracted response cannot be parsed as valid JSON.
+         ValueError: If the extraction process fails.
+         """
+         try:
+             extract_chain = self.extract_prompt | self.chat_model.groq
+             res = extract_chain.invoke(input={"page_data": text})
+
+             # Try parsing the response content into JSON format
+             job_data = self.json_parser.parse(res.content)
+             print("=====================JSON Job Data==================")
+             print(job_data)
+             return job_data
+
+         except OutputParserException as e:
+             raise OutputParserException("Unable to parse job data as valid JSON. The response might be malformed or incomplete.") from e
+         except Exception as e:
+             raise ValueError(f"An error occurred during job extraction: {e}") from e
+
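A minimal sketch of the extractor on its own, assuming the posting URL is reachable and the model returns a single posting as a dict (the prompt allows multiple postings, so callers may also want to handle a list); the URL below is a placeholder.

from src.job_extractor import JobExtractor

extractor = JobExtractor()
cleaned = extractor.parse_job_from_web("https://example.com/jobs/ml-engineer")  # placeholder URL
job = extractor.extract_jobdata(cleaned)

# Keys come from the prompt (`role`, `skills`, `responsibilities`, ...), but the model
# may omit some, so .get() is safer than direct indexing.
if isinstance(job, list):
    job = job[0]
print(job.get("role"), job.get("skills"))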
src/message_writer.py ADDED
@@ -0,0 +1,123 @@
+ from src.chat_model import ChatModel
+ from langchain_core.prompts import PromptTemplate
+ import re
+
+ class MessageWriter:
+     """
+     A class that generates personalized email messages for recruiters based on job descriptions and resumes.
+     The class utilizes a prompt-based approach to generate the email content in a natural, casual tone,
+     while focusing on the alignment between the job requirements and the applicant's skills and experiences.
+
+     Attributes:
+     -----------
+     chat_model : ChatModel
+         An instance of the ChatModel used to process the job description and resume, and generate the email content.
+     message_prompt : PromptTemplate
+         The template used to instruct the model on how to structure the email content based on the job and resume details.
+
+     Methods:
+     --------
+     write_message(job: str, resume: str) -> tuple:
+         Generates the email message content by processing the job description and resume through the prompt chain,
+         and returns both the extracted thought process and cleaned email content.
+     """
+
+     def __init__(self):
+         """
+         Initializes the MessageWriter instance with the necessary models and prompt template for email generation.
+         """
+         self.chat_model = ChatModel()
+
+         # Define the prompt template for generating recruiter emails
+         self.message_prompt = PromptTemplate.from_template(
+             """
+             ### JOB DESCRIPTION:
+             {job_description}
+
+             ### YOUR RESUME
+             {resume}
+
+             ### INSTRUCTION:
+             You are a very helpful AI agent.
+             Your job is to write an email to the recruiter regarding the job mentioned above, describing the capability of your work,
+             skills and experience (seen in the resume above) in fulfilling their needs.
+
+             Follow the instructions line by line.
+             At each line, stop, read and reason.
+             Finally, consolidate everything into a final email.
+
+             Your instructions start now:
+             1. You should sound very natural.
+             2. Use only the information from your resume provided above. DO NOT HALLUCINATE.
+             3. Think from the recruiter's perspective and what they love to see.
+             4. Identify the top responsibilities, skills and qualifications from the job description.
+             5. Identify the skills, relevant work experience and research experience points from the resume that match the extracted details of the job.
+             6. Once identified, write how these skills and experience fulfill the responsibilities of the new role. Do not be generic; rather, write which specific experiences prove your claim. Use quantification when necessary, from your work experience points.
+             7. Highlight how you will fit into their team with the qualifications you possess. Make it sound natural.
+             8. AVOID "I am so excited to apply" and "my skills align well".
+             9. AVOID "Thank you for considering my application, I look forward to discussing how my skills align with the organizational goals". Instead, convey the same information in a creative way.
+             10. DO NOT make it too professional; keep a casual tone.
+             11. Make it CONCISE and limit it to 8-10 lines. Divide into paragraphs.
+             12. Embed the portfolio and other necessary links within the email where relevant. Add the full link, not a placeholder.
+             13. The signature should be "Best,\n Name of the person from the resume". No links here.
+             14. AVOID PREAMBLE. For example, AVOID "Here's an email, etc." at the start of the generation.
+
+             ### EMAIL:
+             """
+         )
+
+     def write_message(self, job, resume):
+         """
+         Generates a personalized email message for the recruiter based on the provided job description and resume.
+
+         Parameters:
+         -----------
+         job : str
+             The job description from the job listing.
+         resume : str
+             The resume content of the applicant.
+
+         Returns:
+         --------
+         tuple:
+             A tuple containing:
+             - thought_process (str): The reasoning and thought process of the model.
+             - cleaned_response (str): The final generated email content, cleaned of any extra elements.
+
+         Raises:
+         -------
+         ValueError: If there is an error in invoking the model chain or processing the response.
+         """
+         try:
+             # Create the chain of prompt and model invocation
+             message_chain = self.message_prompt | self.chat_model.groq
+
+             # Invoke the model to generate the email content
+             res = message_chain.invoke(input={"job_description": job, "resume": resume})
+
+             # Extract the thought process (if any) enclosed in <think> tags
+             think_content = re.findall(r'<think>(.*?)</think>', res.content, flags=re.DOTALL)
+             cleaned_response = re.sub(r'<think>.*?</think>', '', res.content, flags=re.DOTALL)
+
+             # Check if content was found
+             if think_content:
+                 # Get the first element from the list (since re.findall returns a list)
+                 extracted_text = think_content[0]
+                 extracted_text = extracted_text.strip()  # Strip leading/trailing whitespace and newlines
+
+                 # Print the well-formatted text
+                 print("======Thought Process======")
+                 print(extracted_text)
+                 think_content = extracted_text
+             else:
+                 print("No content found between <think> and </think> tags.")
+                 think_content = ""  # Fall back to an empty string so the return type stays consistent
+
+             print("======Cleaned Response======")
+             print(cleaned_response)
+
+             # Return the extracted thought process and the cleaned email content
+             return think_content, cleaned_response.strip()
+
+         except Exception as e:
+             # Raise a ValueError with additional context if there was an error in processing
+             raise ValueError(f"An error occurred while generating the email: {e}") from e
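A minimal sketch of calling the writer directly with already-extracted job data; the job and resume are simply interpolated into the prompt, so a dict and a LangChain Document work as well as plain strings. The sample job dict below is a toy stand-in, not output from this repo.

from src.resume_loader import ResumeLoaderFactory
from src.message_writer import MessageWriter

resume = ResumeLoaderFactory.create_loader("text").load_resume()
job = {"role": "Machine Learning Engineer", "skills": ["Python", "PyTorch"]}  # toy stand-in for JobExtractor output

thought, email = MessageWriter().write_message(job, resume)
print(email)         # cleaned email body
print(len(thought))  # zero when the model emitted no <think> block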
src/resume_loader.py ADDED
@@ -0,0 +1,166 @@
+ import os
+ import tempfile
+ from abc import ABC, abstractmethod
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader
+
+ class ResumeLoader(ABC):
+     """
+     Abstract Base Class for loading resumes. All resume loader classes (TextResumeLoader, PdfResumeLoader)
+     should inherit from this class and implement the `load_resume` method.
+
+     Methods:
+     --------
+     load_resume() -> object:
+         Abstract method to load a resume. Must be implemented by subclasses.
+     """
+
+     @abstractmethod
+     def load_resume(self):
+         """
+         Abstract method to load a resume. Must be implemented by subclasses.
+
+         Returns:
+         --------
+         object:
+             The content of the resume as an object.
+         """
+         pass
+
+ class TextResumeLoader(ResumeLoader):
+     """
+     A class to load resumes from a text file.
+
+     Methods:
+     --------
+     load_resume() -> object:
+         Loads the resume from a predefined text file located in the "resources" directory.
+         Raises a FileNotFoundError if the file is not found.
+
+     Raises:
+     -------
+     FileNotFoundError: If the predefined resume text file is not found.
+     """
+
+     def __init__(self):
+         """
+         Initializes the TextResumeLoader instance and sets the path to the resume text file.
+         """
+         self.current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+         self.file_path = os.path.join(self.current_dir, "resources", "resume.txt")
+
+     def load_resume(self):
+         """
+         Loads the resume from a predefined text file.
+
+         Returns:
+         --------
+         object:
+             The resume content as an object containing the text.
+
+         Raises:
+         -------
+         FileNotFoundError:
+             If the resume text file cannot be found at the specified path.
+         """
+         if not os.path.exists(self.file_path):
+             raise FileNotFoundError(f"File {self.file_path} does not exist. Please check the path.")
+
+         text_loader = TextLoader(self.file_path)
+         resume = text_loader.load()  # Directly load the full text without chunking
+         print(resume[0].page_content)
+
+         return resume[0]
+
+ class PdfResumeLoader(ResumeLoader):
+     """
+     A class to load resumes from PDF files.
+
+     Methods:
+     --------
+     load_resume(file) -> object:
+         Loads a resume from an uploaded PDF file, saving it temporarily before processing.
+         Cleans up the temporary file after processing.
+
+     Raises:
+     -------
+     Exception:
+         If an error occurs during the loading or extraction of the PDF content.
+     """
+
+     def load_resume(self, file=None):
+         """
+         Loads the resume from an uploaded PDF file by saving it as a temporary file and processing it.
+
+         Parameters:
+         -----------
+         file : file-like object
+             The uploaded PDF file to be processed.
+
+         Returns:
+         --------
+         object:
+             The resume content as an object extracted from the PDF file.
+
+         Raises:
+         -------
+         Exception:
+             If an error occurs during the loading or extraction of the PDF content.
+         """
+         if file is None:
+             raise ValueError("PDF file must be provided for PdfResumeLoader.")
+
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+             temp_file.write(file.getvalue())  # Save uploaded file
+             temp_file_path = temp_file.name  # Get file path
+
+         try:
+             # Load PDF using the temporary file path
+             pdf_loader = PyPDFLoader(temp_file_path)
+             resume = pdf_loader.load()  # Extract text from PDF
+
+             print(resume[0].page_content)  # Debug: Print first page content
+             return resume[0]
+
+         except Exception as e:
+             raise Exception(f"Error loading PDF: {e}")
+
+         finally:
+             # Ensure the file is deleted after processing
+             os.remove(temp_file_path)
+
+ class ResumeLoaderFactory:
+     """
+     A Factory class responsible for creating appropriate ResumeLoader instances based on the input type (text or PDF).
+
+     Methods:
+     --------
+     create_loader(file_type: str) -> ResumeLoader:
+         Returns an instance of ResumeLoader based on the specified file type (text or pdf).
+     """
+
+     @staticmethod
+     def create_loader(file_type: str) -> ResumeLoader:
+         """
+         Creates a ResumeLoader instance based on the file type provided.
+
+         Parameters:
+         -----------
+         file_type : str
+             The type of file to be processed ("text" or "pdf").
+
+         Returns:
+         --------
+         ResumeLoader:
+             The appropriate ResumeLoader instance for the specified file type.
+
+         Raises:
+         -------
+         ValueError:
+             If the file type is not recognized (must be either "text" or "pdf").
+         """
+         if file_type == "text":
+             return TextResumeLoader()
+         elif file_type == "pdf":
+             return PdfResumeLoader()
+         else:
+             raise ValueError("Invalid file type. Accepted values are 'text' or 'pdf'.")
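A minimal sketch of the factory in use; the PDF branch only needs an object exposing getvalue(), e.g. a Streamlit UploadedFile or an io.BytesIO (the local PDF path below is hypothetical).

import io
from src.resume_loader import ResumeLoaderFactory

# Text branch: reads the bundled resources/resume.txt
text_resume = ResumeLoaderFactory.create_loader("text").load_resume()

# PDF branch: wrap raw bytes in BytesIO so getvalue() is available
with open("my_resume.pdf", "rb") as f:  # hypothetical local file
    pdf_resume = ResumeLoaderFactory.create_loader("pdf").load_resume(io.BytesIO(f.read()))

# Both loaders return a LangChain Document; page_content holds the text
print(len(text_resume.page_content), len(pdf_resume.page_content))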
src/utils.py ADDED
@@ -0,0 +1,50 @@
+ import re
+
+ def clean_text(text: str) -> str:
+     """
+     Cleans and preprocesses the input text by removing unwanted elements such as HTML tags, URLs,
+     special characters, and extra whitespace. This function is useful for preparing text data for
+     further processing or analysis.
+
+     Parameters:
+     -----------
+     text : str
+         The input text to be cleaned. This text may contain HTML tags, URLs, special characters,
+         multiple spaces, and unnecessary whitespace.
+
+     Returns:
+     --------
+     str
+         A cleaned version of the input text with the following modifications:
+         - HTML tags removed
+         - URLs removed
+         - Special characters (other than letters, digits, and spaces) removed
+         - Multiple consecutive spaces replaced with a single space
+         - Leading and trailing whitespace removed
+         - Extra spaces between words reduced to a single space
+
+     Example:
+     --------
+     >>> clean_text("<p>Hello <b>World</b>! Visit http://example.com for more info.</p>")
+     'Hello World Visit for more info'
+     """
+
+     # Remove HTML tags
+     text = re.sub(r'<[^>]*?>', '', text)
+
+     # Remove URLs
+     text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
+
+     # Remove special characters (anything that is not a letter, number, or space)
+     text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
+
+     # Replace multiple spaces with a single space
+     text = re.sub(r'\s{2,}', ' ', text)
+
+     # Trim leading and trailing whitespace
+     text = text.strip()
+
+     # Remove extra whitespace between words (in case of multiple spaces)
+     text = ' '.join(text.split())
+
+     return text