Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Section Generator | |
| Handles dynamic section generation based on template sections | |
| """ | |
| from typing import List | |
| from langchain.prompts import ChatPromptTemplate | |
def create_dynamic_section_prompt(template_sections: List[str]) -> ChatPromptTemplate:
    """Create a dynamic prompt based on the sections found in the template.

    Args:
        template_sections: Section titles exactly as they appear in the
            template (they may carry a trailing ':' and non-breaking spaces).

    Returns:
        A ``ChatPromptTemplate`` made of a system message listing the
        sections to fill and a human message that expects the
        ``template_sections``, ``medical_data`` and ``corrected_transcription``
        variables.
    """

    def _clean(name: str) -> str:
        # Display form of a title: no non-breaking spaces, no colon.
        return name.strip().replace('\xa0', ' ').replace(':', '').strip()

    def _escape(text: str) -> str:
        # The system prompt is handed to ChatPromptTemplate as a template
        # string, where single braces denote input variables. Doubling them
        # keeps braces found in a section title literal.
        return text.replace('{', '{{').replace('}', '}}')

    def _header(name: str) -> str:
        # Only add the trailing colon when the title does not already end
        # with one; otherwise the example header would read "Title ::" and
        # contradict the "use the EXACT section names" rule below.
        return name if name.rstrip().endswith(':') else f"{name}:"

    # One "<header>\n[instruction]" entry per template section.
    sections_text = "\n\n".join(
        f"{_escape(_header(section))}\n[Extract and organize content for {_escape(_clean(section))} section from the transcription, maintaining maximum fidelity to the original text]"
        for section in template_sections
    )

    # Comma-separated list of cleaned names used in the running text.
    sections_list = _escape(', '.join(_clean(s) for s in template_sections))

    # Single-section templates get an extra, more insistent instruction.
    if len(template_sections) == 1:
        single_section_instruction = f"""CRITICAL: This template has only ONE section: {sections_list}.
You MUST generate content for this section using ALL the information from the transcription.
Do not leave the section empty - extract and organize ALL relevant content from the transcription."""
    else:
        single_section_instruction = ""

    system_prompt = f"""You are a medical document organizer.
Your task is to organize the corrected medical transcription into the required sections while maintaining maximum fidelity to the original text.
You MUST fill ALL sections requested in the template: {sections_list}.
CRITICAL: Use the EXACT section names provided in template_sections (including any punctuation like ':') - DO NOT translate or change them.
DO NOT summarize, interpret, or add information not present in the transcription.
DO NOT use markdown formatting or add extra headers.
{single_section_instruction}
ORGANIZATION RULES:
- Extract relevant content from the transcription for each section
- Maintain the original wording and structure as much as possible
- Do not add medical interpretations or conclusions not present in the text
- Keep all measurements, observations, and findings exactly as stated
- Preserve the original medical terminology
- Use ONLY the exact section names from the template (e.g., 'Technique :', 'Résultat :', 'Conclusion :')
- If there is only one section, put ALL relevant content in that section
Format your response with clear section headers using the EXACT names from the template:
{sections_text}
IMPORTANT:
- Use the corrected transcription content to fill all sections
- Use the EXACT section names from the template - DO NOT translate or modify them
- Do not add markdown formatting or extra headers
- Maintain maximum fidelity to the original transcription content
- Do not summarize or interpret the medical information
- Keep all original medical terms and measurements exactly as they appear
- NEVER leave a section empty - always provide content based on the transcription"""

    # Not an f-string: the braces below are the prompt's input variables.
    human_prompt = """Organize the corrected medical transcription into the required sections:
Template sections: {template_sections}
Medical data: {medical_data}
Corrected transcription: {corrected_transcription}
Generate each section with the exact title from the template, using the corrected transcription content while maintaining maximum fidelity to the original text."""

    return ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_prompt),
    ])
def fix_section_names(content: str, template_sections: List[str]) -> str:
    """Post-process the generated content to ensure exact section names are used.

    Lines that look like a header (letters and spaces, optional trailing
    ':') are compared, case- and accent-insensitively, with the template
    section names; a matching header is replaced by the exact template name
    and the following lines become that section's content.

    Args:
        content: Text produced by the model.
        template_sections: Section titles exactly as they must appear.

    Returns:
        The rebuilt text: for each detected section, the template title, its
        content lines, then an empty line. ``content`` is returned unchanged
        when it is shorter than 50 characters (after stripping) or when
        ``template_sections`` is empty. With a single template section the
        whole stripped content is placed under that title. When no section
        ends up with any content, the whole stripped content is placed under
        the first template title. Text that precedes the first recognised
        header is discarded.
    """
    import re
    import unicodedata

    stripped = content.strip()

    # Too little text to restructure safely: hand it back untouched.
    if len(stripped) < 50:
        return content

    # Without template names there is nothing to map onto; returning the
    # input avoids silently discarding the whole text.
    if not template_sections:
        return content

    # A single-section template receives everything.
    if len(template_sections) == 1:
        return f"{template_sections[0]}\n{stripped}"

    def _normalize(name: str) -> str:
        # Lower-case, drop every combining accent, then drop colons and
        # non-breaking spaces so both sides compare on bare letters.
        decomposed = unicodedata.normalize('NFKD', name.lower())
        bare = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
        return bare.replace(':', '').replace('\xa0', ' ').strip()

    # Normalise the template names once instead of once per candidate line.
    template_keys = [(name, _normalize(name)) for name in template_sections]

    def _match_template(header: str):
        # Return the first template title resembling `header`, else None.
        candidate = _normalize(header)
        for name, key in template_keys:
            if not key:
                # An empty key is a substring of everything and would
                # capture every header-like line.
                continue
            if (candidate in key or
                    key in candidate or
                    any(word in candidate for word in key.split())):
                return name
        return None

    # A header line: only letters/whitespace, optionally ending with ':'.
    section_pattern = re.compile(r'^([A-Za-zÀ-ÿ\s]+:?)\s*$', re.IGNORECASE)

    # Template title -> collected content lines, in order of first appearance.
    # Lists are extended rather than replaced so that a title matched twice
    # keeps the content gathered the first time.
    sections = {}
    current_section = None

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        header = section_pattern.match(line)
        matched = _match_template(header.group(1).strip()) if header else None

        if matched is not None:
            current_section = matched
            sections.setdefault(current_section, [])
        elif current_section is not None:
            # Ordinary text, or a header-like line that matches no template
            # title: it belongs to the section currently open.
            sections[current_section].append(line)

    # Nothing usable was collected: keep the text under the first title
    # rather than returning bare headers.
    if not any(sections.values()):
        sections = {template_sections[0]: [stripped]}

    # Rebuild the text with the exact template titles.
    fixed_content = []
    for section_name, section_lines in sections.items():
        fixed_content.append(f"{section_name}")
        if section_lines:
            fixed_content.append('\n'.join(section_lines))
        fixed_content.append("")
    return "\n".join(fixed_content)