Spaces:

nangelov
/

ai-career-coach

Runtime error

ai-career-coach / output_parser.py

Nikolay Angelov

fix prompt

4762752 4 months ago

17.7 kB

	from langchain.agents.output_parsers import ReActSingleInputOutputParser
	from langchain.schema import AgentAction, AgentFinish
	import re
	import logging

	# Configure logging
	logging.basicConfig(
	level=logging.INFO,
	format='%(asctime)s - %(levelname)s - %(message)s',
	datefmt='%Y-%m-%d %H:%M:%S'
	)

	class FlexibleOutputParser(ReActSingleInputOutputParser):
	def parse(self, text):
	# Clean up the text first
	text = text.strip()
	logging.info(f"Raw LLM output: {text[:200]}...") # Debug output

	# CRITICAL: Detect if LLM generated both Action and Final Answer
	has_action = "Action:" in text
	has_final_answer = "Final Answer:" in text

	if has_action and has_final_answer:
	logging.info("ERROR: LLM generated both Action and Final Answer - extracting Action only")
	# Extract only the Action part, ignore Final Answer
	lines = text.split('\n')
	action_lines = []
	for line in lines:
	if line.strip().startswith("Final Answer:"):
	break # Stop at Final Answer
	action_lines.append(line)
	text = '\n'.join(action_lines).strip()
	logging.info(f"Cleaned text: {text}")

	# Remove any trailing "Observation" that appears without content
	text = re.sub(r'\nObservation\s:?\s$', '', text)
	text = re.sub(r'Observation\s:?\s$', '', text)

	# Handle direct responses (greetings, simple questions)
	if not any(keyword in text for keyword in ["Thought:", "Action:", "Final Answer:"]):
	return AgentFinish(
	return_values={"output": text},
	log=text
	)

	# Handle responses that start with Final Answer directly
	if text.startswith("Final Answer:"):
	final_answer = text.replace("Final Answer:", "").strip()
	return AgentFinish(
	return_values={"output": final_answer},
	log=text
	)

	try:
	# Try standard ReAct parsing first
	return super().parse(text)
	except Exception as e:
	logging.info(f"Standard parsing failed: {e}")
	# Custom parsing for malformed ReAct format
	return self._parse_malformed_react(text)

	def _handle_truncated_action(self, text):
	"""Handle cases where Action is cut off"""
	if "Action:" in text and "Action Input:" not in text:
	# Action was truncated, treat as final answer
	return AgentFinish(
	return_values={"output": "I need to search for more information. Could you please ask your question again?"},
	log=text
	)
	return None

	def _parse_malformed_react(self, text):
	"""Handle malformed ReAct format responses"""

	# Clean special tokens from the entire text first
	text = re.sub(r'<\\|eot_id\\|>', '', text)
	text = re.sub(r'<\\|eom_id\\|>', '', text)
	text = re.sub(r'<\\|.*?\\|>', '', text)
	text = text.strip()

	# NEW: Handle "Action: None" case - this should be a final answer
	if "Action: None" in text:
	logging.info("Detected 'Action: None' - treating as final answer")
	# Extract everything before "Action: None"
	final_text = text.split("Action: None")[0].strip()
	return AgentFinish(
	return_values={"output": final_text},
	log=text
	)

	# Check if this is a response that should be a Final Answer but is missing the prefix
	if self._should_be_final_answer(text):
	logging.info("Detected response that should be Final Answer - treating as final answer")
	return AgentFinish(
	return_values={"output": text},
	log=text
	)

	# Check if this looks like job search results that should be preserved as-is
	if self._is_job_search_result(text) and "Action:" not in text:
	logging.info("Detected job search results in full text - preserving formatting")
	return AgentFinish(
	return_values={"output": text},
	log=text
	)

	lines = [line.strip() for line in text.split('\n') if line.strip()]

	thought = ""
	action = ""
	action_input = ""
	final_answer = ""

	i = 0
	while i < len(lines):
	line = lines[i]

	if line.startswith("Thought:"):
	thought = line.replace("Thought:", "").strip()

	elif line.startswith("Action:"):
	action = line.replace("Action:", "").strip()

	# NEW: If action is "None", treat everything before as final answer
	if action.lower() == "none":
	logging.info("Found 'Action: None' - treating preceding text as final answer")
	# Get all text before this Action line
	preceding_lines = lines[:i]
	final_text = "\n".join(preceding_lines).strip()
	# Remove "Thought:" prefix if present
	if final_text.startswith("Thought:"):
	final_text = final_text.replace("Thought:", "").strip()
	return AgentFinish(
	return_values={"output": final_text},
	log=text
	)

	elif line.startswith("Action Input:"):
	action_input = line.replace("Action Input:", "").strip()

	# Handle multi-line action input (especially for code blocks)
	j = i + 1
	while j < len(lines):
	next_line = lines[j]
	# Stop if we hit another ReAct keyword
	if any(next_line.startswith(keyword) for keyword in ["Thought:", "Action:", "Observation:", "Final Answer:", "Human:"]):
	break
	action_input += "\n" + next_line
	j += 1
	i = j - 1 # Adjust index to account for consumed lines

	# Clean action input more aggressively
	action_input = self._clean_action_input(action_input)

	elif line.startswith("Final Answer:"):
	final_answer = line.replace("Final Answer:", "").strip()

	# Collect multi-line final answer
	j = i + 1
	while j < len(lines):
	if lines[j].strip() and not lines[j].startswith(("Thought:", "Action:", "Observation:", "Human:")):
	final_answer += " " + lines[j].strip()
	j += 1
	else:
	break

	i += 1

	# Check for incomplete action
	if action and not action_input and action.lower() != "none":
	logging.info("ERROR: Action without Action Input - treating as final answer")
	return AgentFinish(
	return_values={"output": "I apologize, but I need more information to help you properly. Could you please rephrase your question?"},
	log=text
	)

	# If we have action and action_input, return AgentAction
	if action and action_input and action.lower() != "none":
	logging.info(f"Parsed Action: {action}")
	logging.info(f"Parsed Action Input: '{action_input[:100]}...'")
	return AgentAction(
	tool=action,
	tool_input=action_input,
	log=text
	)

	# If we have final_answer, return AgentFinish
	if final_answer:
	return AgentFinish(
	return_values={"output": final_answer},
	log=text
	)

	# Fallback: return the whole text as final answer
	return AgentFinish(
	return_values={"output": text},
	log=text
	)

	def _should_be_final_answer(self, text):
	"""Check if this text should be treated as a final answer"""

	# If it contains ReAct keywords, it's not a final answer
	has_action_keywords = any(keyword in text for keyword in ["Action:", "Thought:", "Action Input:"])
	if has_action_keywords:
	return False

	# If it's just a greeting or simple response, treat as final answer
	if len(text.strip()) < 50:
	return True

	# If it contains structured content (multiple sentences/paragraphs), likely a final answer
	sentences = text.split('.')
	if len(sentences) > 2:
	return True

	# If it contains newlines (structured response), likely a final answer
	if '\n' in text.strip():
	return True

	# Default to treating as final answer if no clear ReAct structure
	return True

	def _clean_action_input(self, action_input: str) -> str:
	"""Clean action input by removing special tokens and unwanted text"""
	# Remove special tokens
	action_input = re.sub(r'<\\|eot_id\\|>', '', action_input)
	action_input = re.sub(r'<\\|eom_id\\|>', '', action_input)
	action_input = re.sub(r'<\\|.*?\\|>', '', action_input)

	# Handle code blocks properly
	if "```" in action_input:
	# Extract content between triple backticks
	code_match = re.search(r'```(?:python)?\s(.?)\s*```', action_input, re.DOTALL)
	if code_match:
	action_input = code_match.group(1).strip()
	else:
	# If we have opening ``` but no closing, take everything after it
	if action_input.count('```') == 1:
	parts = action_input.split('```', 1)
	if len(parts) > 1:
	action_input = parts[1].strip()
	# Remove any language identifier
	if action_input.startswith('python'):
	action_input = action_input[6:].strip()
	elif action_input.startswith('\npython'):
	action_input = action_input[7:].strip()

	# Remove everything after common stop words (but preserve code structure)
	stop_words = ['Thought:', 'Action:', 'Observation:', 'Human:', 'Assistant:', 'Please see below', 'Now that we']
	for stop_word in stop_words:
	if stop_word in action_input:
	action_input = action_input.split(stop_word)[0]

	# Remove quotes if they wrap the entire input (but not if they're part of code)
	if not ('"""' in action_input or "'''" in action_input or 'print(' in action_input):
	action_input = re.sub(r'^["\']\|["\']$', '', action_input)

	# Remove any trailing special characters or whitespace
	action_input = action_input.strip()

	# Remove any trailing text that looks like instructions
	lines = action_input.split('\n')
	cleaned_lines = []
	skip_rest = False

	for line in lines:
	# Skip lines that look like instructions or comments after code
	if (skip_rest or
	line.strip().startswith('Please see below') or
	line.strip().startswith('Now that we') or
	line.strip().startswith('Based on the output') or
	line.strip().startswith('After running') or
	line.strip().startswith('The output') or
	line.strip().startswith('Here is') or
	line.strip().startswith('Let me') or
	line.strip().startswith('I will') or
	line.strip().startswith('Next,') or
	line.strip().startswith('Then,') or
	line.strip().startswith('Finally,') or
	re.match(r'^\s#.instructions.*', line, re.IGNORECASE) or
	re.match(r'^\s#.note.*', line, re.IGNORECASE)):
	skip_rest = True
	break
	cleaned_lines.append(line)

	action_input = '\n'.join(cleaned_lines).strip()

	# Final cleanup - remove any remaining trailing instructions
	action_input = re.sub(r'\n\sPlease.$', '', action_input, flags=re.DOTALL)
	action_input = re.sub(r'\n\sNow.$', '', action_input, flags=re.DOTALL)
	action_input = re.sub(r'\n\sBased on.$', '', action_input, flags=re.DOTALL)

	return action_input

	def _is_job_search_result(self, text):
	"""Check if the text contains job search results that should be preserved"""
	job_indicators = [
	"Job Title:", "Company:", "Location:", "job listings",
	"Found", "jobs", "position", "role", "employment",
	"**", "- Company:", "- Location:", "Link:"
	]
	return any(indicator.lower() in text.lower() for indicator in job_indicators)

	def clean_llm_response(output):
	"""Clean up the LLM response to extract just the user-facing part."""

	# Remove special tokens first
	if "<\|eot_id\|>" in output:
	output = output.split("<\|eot_id\|>")[0]

	if "<\|eom_id\|>" in output:
	output = output.split("<\|eom_id\|>")[0]

	# Remove "Human:" and everything after it (most aggressive cleaning)
	if "Human:" in output:
	output = output.split("Human:")[0].strip()

	# Remove trailing newlines and "Human" without colon
	output = re.sub(r'\sHuman\s$', '', output)
	output = re.sub(r'\s\n\s$', '', output)

	# If there's a Final Answer, extract only that part
	if "Final Answer:" in output:
	# Find the last occurrence of "Final Answer:" in case there are multiple
	last_final_answer_index = output.rindex("Final Answer:")
	final_answer = output[last_final_answer_index + len("Final Answer:"):].strip()

	# Clean up any remaining artifacts from final answer
	if "Human:" in final_answer:
	final_answer = final_answer.split("Human:")[0].strip()

	return final_answer

	# Remove any remaining internal instructions or notes
	lines = output.split("\n")
	cleaned_lines = []
	for line in lines:
	line = line.strip()
	# Skip lines that look like internal notes or instructions
	if (line.startswith("Let's") or "instead" in line.lower() or
	"attempt" in line.lower() or "should" in line.lower() or
	line.startswith("Human") or not line or
	line.startswith("I will") or line.startswith("Next,") or
	line.startswith("Then,") or line.startswith("Finally,")):
	continue
	cleaned_lines.append(line)

	result = "\n".join(cleaned_lines).strip()

	# Final cleanup
	result = re.sub(r'\n\sPlease.$', '', result, flags=re.DOTALL)
	result = re.sub(r'\n\sNow.$', '', result, flags=re.DOTALL)
	result = re.sub(r'\n\sBased on.$', '', result, flags=re.DOTALL)

	return result

	def validate_pdp_response(response_text: str) -> bool:
	"""
	Validate if the PDP response is properly formatted and complete
	"""
	# Check for minimum length
	if len(response_text.strip()) < 500:
	print("PDP_VALIDATION_FAILED on length:" ,len(response_text.strip()))
	return False

	# Check for required sections
	required_sections = [
	"Current Skills Assessment",
	"Skills Gap Analysis",
	"Learning Objectives",
	"Recommended Training",
	"Timeline",
	"Progress Tracking"
	]

	found_sections = 0
	for section in required_sections:
	if section.lower() in response_text.lower():
	found_sections += 1

	# Require at least 4 out of 6 sections
	if found_sections < 4:
	print("PDP_VALIDATION_FAILED on number of sections: ",found_sections)
	return False

	# Check for problematic patterns
	problematic_patterns = [
	"```python", # Unfinished code blocks
	"SyntaxError",
	"Action:",
	"Action Input:",
	"Observation:",
	"Let's correct this",
	"There was an issue",
	"skill_gaps =", # Incomplete variable assignments
	]

	for pattern in problematic_patterns:
	if pattern in response_text:
	print("PDP_VALIDATION_FAILED on pattern:" ,pattern)
	return False

	return True

	class PDPOutputParser(FlexibleOutputParser):
	def parse(self, text):
	print(f"PDPOutputParser: Processing {len(text)} characters")

	# Clean special tokens first
	text = re.sub(r'<\\|eot_id\\|>', '', text)
	text = re.sub(r'<\\|eom_id\\|>', '', text)
	text = re.sub(r'<\\|.*?\\|>', '', text)

	# For PDP, extract content before "Final Answer:"
	if "Final Answer:" in text:
	pdp_content = text.split("Final Answer:")[0].strip()
	else:
	pdp_content = text.strip()

	# Remove ReAct thinking patterns for clean PDP
	pdp_content = re.sub(r'Thought:.*?(?=##\|$)', '', pdp_content, flags=re.DOTALL)
	pdp_content = re.sub(r'Action:.*?(?=##\|$)', '', pdp_content, flags=re.DOTALL)

	# Extract just the structured PDP sections
	if "## Current Skills Assessment" in pdp_content:
	start_idx = pdp_content.find("## Current Skills Assessment")
	pdp_content = pdp_content[start_idx:].strip()

	# Clean up any remaining artifacts
	pdp_content = re.sub(r'\n\s\n\s\n', '\n\n', pdp_content) # Remove excessive newlines
	pdp_content = pdp_content.strip()

	print(f"PDPOutputParser: Cleaned to {len(pdp_content)} characters")

	return AgentFinish(
	return_values={"output": pdp_content},
	log=text
	)