impresso-project
/

ner-stacked-bert-multilingual

@@ -528,7 +528,7 @@ def remove_trailing_stopwords(entities):
     and repairs the lOffset and rOffset accordingly.
     """
     if DEBUG:
-        print(f"Initial entities: {len(entities)}")
     new_entities = []
     for entity in entities:
         if "comp" not in entity["type"]:
@@ -540,7 +540,6 @@ def remove_trailing_stopwords(entities):
             rOffset = entity.get("rOffset", original_len)
             # Remove stopwords and punctuation from the beginning
-            i = 0
             while entity_text and (
                 entity_text.split()[0].lower() in stop_words
                 or entity_text[0] in punctuation
@@ -562,48 +561,36 @@ def remove_trailing_stopwords(entities):
                         print(
                             f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
                         )
-                i += 1
-            i = 0
             # Remove stopwords and punctuation from the end
-            iteration = 0
-            max_iterations = len(entity_text)  # Prevent infinite loops
-            while entity_text and iteration < max_iterations:
-                # Check if the last word is a stopword or the last character is punctuation
-                last_word = entity_text.split()[-1] if entity_text.split() else ""
-                last_char = entity_text[-1]
-                if last_word.lower() in stop_words:
-                    # Remove trailing stopword and adjust rOffset
-                    stopword_len = len(last_word) + 1  # Include space before stopword
-                    entity_text = entity_text[:-stopword_len].rstrip()
-                    rOffset -= stopword_len
-                    if DEBUG:
-                        print(
-                            f"Removed trailing stopword from entity: {entity_text} (rOffset={rOffset})"
-                        )
-                elif last_char in punctuation:
-                    # Remove trailing punctuation and adjust rOffset
-                    entity_text = entity_text[:-1].rstrip()
-                    rOffset -= 1
-                    if DEBUG:
-                        print(
-                            f"Removed trailing punctuation from entity: {entity_text} (rOffset={rOffset})"
-                        )
-                else:
-                    # Exit loop if neither stopwords nor punctuation are found
-                    break
-                iteration += 1
-                # print(f"ITERATION: {iteration} [{entity['surface']}] for {entity_text}")
-            if len(entity_text.strip()) == 1:
-                entities.remove(entity)
-                if DEBUG:
-                    print(f"Skipping entity: {entity_text}")
-                continue
             # Skip certain entities based on rules
             if entity_text in string.punctuation:
                 if DEBUG:
@@ -682,10 +669,12 @@ def remove_trailing_stopwords(entities):
                 entities.remove(entity)
             else:
                 new_entities.append(entity)
     if DEBUG:
-        print(f"Remained entities: {len(new_entities)}")
     return new_entities
 class MultitaskTokenClassificationPipeline(Pipeline):

     and repairs the lOffset and rOffset accordingly.
     """
     if DEBUG:
+        print(f"Initial entities in remove_trailing_stopwords: {len(entities)}")
     new_entities = []
     for entity in entities:
         if "comp" not in entity["type"]:
             rOffset = entity.get("rOffset", original_len)
             # Remove stopwords and punctuation from the beginning
             while entity_text and (
                 entity_text.split()[0].lower() in stop_words
                 or entity_text[0] in punctuation
                         print(
                             f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
                         )
             # Remove stopwords and punctuation from the end
+            if len(entity_text.strip()) > 1:
+                while entity_text and (
+                    entity_text.split()[-1].lower() in stop_words
+                    or entity_text[-1] in punctuation
+                ):
+                    if entity_text.split()[-1].lower() in stop_words:
+                        stopword_len = (
+                            len(entity_text.split()[-1]) + 1
+                        )  # Adjust length for stopword and preceding space
+                        entity_text = entity_text[
+                            :-stopword_len
+                        ]  # Remove trailing stopword
+                        rOffset -= stopword_len  # Adjust the right offset
+                        if DEBUG:
+                            print(
+                                f"Removed trailing stopword from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
+                            )
+                    if entity_text:
+                        if entity_text[-1] in punctuation:
+                            entity_text = entity_text[
+                                :-1
+                            ]  # Remove trailing punctuation
+                            rOffset -= 1  # Adjust the right offset
+                            if DEBUG:
+                                print(
+                                    f"Removed trailing punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
+                                )
             # Skip certain entities based on rules
             if entity_text in string.punctuation:
                 if DEBUG:
                 entities.remove(entity)
             else:
                 new_entities.append(entity)
+        else:
+            new_entities.append(entity)
     if DEBUG:
+        print(f"Remained entities in remove_trailing_stopwords: {len(new_entities)}")
     return new_entities
 class MultitaskTokenClassificationPipeline(Pipeline):