anikde committed
Commit 75b1563 · 1 Parent(s): 1b11f07

vit model added

.gitignore CHANGED
@@ -163,6 +163,7 @@ IndicPhotoOCR/recognition/models
 
 IndicPhotoOCR/script_identification/images
 IndicPhotoOCR/script_identification/models
+IndicPhotoOCR/script_identification/vit/models
 
 
 build/
IndicPhotoOCR/detection/east_detector.py CHANGED
@@ -74,14 +74,15 @@ class EASTdetector:
 
         return bbox_result_dict
 
-# if __name__ == "__main__":
-#     import argparse
-#     parser = argparse.ArgumentParser(description='Text detection using EAST model')
-#     parser.add_argument('--image_path', type=str, required=True, help='Path to the input image')
-#     parser.add_argument('--device', type=str, default='cpu', help='Device to run the model on, e.g., "cpu" or "cuda"')
-#     parser.add_argument('--model_checkpoint', type=str, required=True, help='Path to the model checkpoint file')
-#     args = parser.parse_args()
 
-#     # Run prediction and get results as dictionary
-#     detection_result = predict(args.image_path, args.device, args.model_checkpoint)
-#     print(detection_result)
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='Text detection using EAST model')
+    parser.add_argument('--image_path', type=str, required=True, help='Path to the input image')
+    parser.add_argument('--device', type=str, default='cpu', help='Device to run the model on, e.g., "cpu" or "cuda"')
+    parser.add_argument('--model_checkpoint', type=str, required=True, help='Path to the model checkpoint file')
+    args = parser.parse_args()
 
+    # Run prediction and get results as dictionary
+    east = EASTdetector(model_path=args.model_checkpoint)
+    detection_result = east.detect(args.image_path, args.model_checkpoint, args.device)
+    # print(detection_result)
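For reference, the re-enabled entry point can also be driven programmatically; a minimal sketch, assuming a locally available EAST checkpoint (the path below is a placeholder) and the constructor/detect signature used in the __main__ block above:

    from IndicPhotoOCR.detection.east_detector import EASTdetector

    # Placeholder checkpoint path; substitute a real EAST checkpoint file.
    checkpoint = "models/east_epoch_990_checkpoint.pth.tar"
    east = EASTdetector(model_path=checkpoint)
    bbox_result_dict = east.detect("test_images/image_88.jpg", checkpoint, "cpu")
    print(bbox_result_dict)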
IndicPhotoOCR/ocr.py CHANGED
@@ -7,11 +7,14 @@ import numpy as np
 
 
 # from IndicPhotoOCR.detection.east_detector import EASTdetector
-from IndicPhotoOCR.script_identification.CLIP_identifier import CLIPidentifier
+# from IndicPhotoOCR.script_identification.CLIP_identifier import CLIPidentifier
+from IndicPhotoOCR.script_identification.vit.vit_infer import VIT_identifier
 from IndicPhotoOCR.recognition.parseq_recogniser import PARseqrecogniser
 import IndicPhotoOCR.detection.east_config as cfg
 from IndicPhotoOCR.detection.textbpn.textbpnpp_detector import TextBPNpp_detector
 
+from IndicPhotoOCR.utils.helper import detect_para
+
 
 class OCR:
     def __init__(self, device='cuda:0', verbose=False):
@@ -22,7 +25,8 @@ class OCR:
         # self.detector = EASTdetector()
         self.detector = TextBPNpp_detector(device=self.device)
         self.recogniser = PARseqrecogniser()
-        self.identifier = CLIPidentifier()
+        # self.identifier = CLIPidentifier()
+        self.identifier = VIT_identifier()
 
     # def detect(self, image_path, detect_model_checkpoint=cfg.checkpoint):
     #     """Run the detection model to get bounding boxes of text areas."""
@@ -123,6 +127,7 @@ class OCR:
 
     def ocr(self, image_path):
        """Process the image by detecting text areas, identifying script, and recognizing text."""
+        recognized_texts = {}
        recognized_words = []
        image = Image.open(image_path)
 
@@ -130,25 +135,41 @@
        detections = self.detect(image_path)
 
        # Process each detected text area
-       for bbox in detections:
-           # Crop and identify script language
-           script_lang, cropped_path = self.crop_and_identify_script(image, bbox)
+       # for bbox in detections:
+       #     # Crop and identify script language
+       #     script_lang, cropped_path = self.crop_and_identify_script(image, bbox)
 
-           # Check if the script language is valid
-           if script_lang:
+       #     # Check if the script language is valid
+       #     if script_lang:
+
+       #         # Recognize text
+       #         recognized_word = self.recognise(cropped_path, script_lang)
+       #         recognized_words.append(recognized_word)
+
+       #         if self.verbose:
+       #             print(f"Recognized word: {recognized_word}")
 
-               # Recognize text
-               recognized_word = self.recognise(cropped_path, script_lang)
-               recognized_words.append(recognized_word)
+       for id, bbox in enumerate(detections):
+           # Identify the script and crop the image to this region
+           script_lang, cropped_path = self.crop_and_identify_script(image, bbox)
+
+           # Calculate bounding box coordinates
+           x1 = min([bbox[i][0] for i in range(len(bbox))])
+           y1 = min([bbox[i][1] for i in range(len(bbox))])
+           x2 = max([bbox[i][0] for i in range(len(bbox))])
+           y2 = max([bbox[i][1] for i in range(len(bbox))])
+
+           if script_lang:
+               recognized_text = self.recognise(cropped_path, script_lang)
+               recognized_texts[f"img_{id}"] = {"txt": recognized_text, "bbox": [x1, y1, x2, y2]}
 
-           if self.verbose:
-               print(f"Recognized word: {recognized_word}")
-
-       return recognized_words
+       return detect_para(recognized_texts)
+       # return recognized_words
 
 if __name__ == '__main__':
     # detect_model_checkpoint = 'bharatSTR/East/tmp/epoch_990_checkpoint.pth.tar'
-    sample_image_path = 'test_images/image_141.jpg'
+    sample_image_path = 'test_images/image_88.jpg'
     cropped_image_path = 'test_images/cropped_image/image_141_0.jpg'
 
     ocr = OCR(device="cuda", verbose=False)
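A minimal end-to-end sketch of the updated pipeline, assuming the package and the downloaded model weights are available locally; note that ocr() now returns the detect_para() output (a list of lines, each a list of words) rather than a flat word list:

    from IndicPhotoOCR.ocr import OCR

    ocr = OCR(device="cuda:0", verbose=False)
    lines = ocr.ocr("test_images/image_88.jpg")   # e.g. [['word1', 'word2'], ['word3']]
    print("\n".join(" ".join(words) for words in lines))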
IndicPhotoOCR/script_identification/vit/__init__.py ADDED
File without changes
IndicPhotoOCR/script_identification/vit/config.py ADDED
@@ -0,0 +1,58 @@
+common_config = {
+    'pretrained_vit_model': 'google/vit-base-patch16-224-in21k'
+}
+
+train_config = {
+    'epochs': 20,
+    'max_images_real': 1900,
+    'classes': 12,
+    'hindi_path_real': '<path_for_hindi_dataset>',
+    'english_path_real': '<path_for_eng_dataset>',
+    'gujarati_path_real': '<path_for_gujarati_dataset>',
+    'punjabi_path_real': '<path_for_punjabi_dataset>',
+    'assamese_path_real': '<path_for_assamese_dataset>',
+    'bengali_path_real': '<path_for_bengali_dataset>',
+    'kannada_path_real': '<path_for_kannada_dataset>',
+    'malayalam_path_real': '<path_for_malayalam_dataset>',
+    'marathi_path_real': '<path_for_marathi_dataset>',
+    'odia_path_real': '<path_for_odia_dataset>',
+    'tamil_path_real': '<path_for_tamil_dataset>',
+    'telugu_path_real': '<path_for_telegu_dataset>',
+    'checkpoints_dir': '<path_for_model>'
+}
+train_config.update(common_config)
+
+test_config = {
+    'reload_model': '<path_for_model>',
+    'max_images': 2000,
+    'classes': 12,
+    'hindi_path_real': '<path_for_hindi_dataset>',
+    'english_path_real': '<path_for_eng_dataset>',
+    'gujarati_path_real': '<path_for_gujarati_dataset>',
+    'punjabi_path_real': '<path_for_punjabi_dataset>',
+    'assamese_path_real': '<path_for_assamese_dataset>',
+    'bengali_path_real': '<path_for_bengali_dataset>',
+    'kannada_path_real': '<path_for_kannada_dataset>',
+    'malayalam_path_real': '<path_for_malayalam_dataset>',
+    'marathi_path_real': '<path_for_marathi_dataset>',
+    'odia_path_real': '<path_for_odia_dataset>',
+    'tamil_path_real': '<path_for_tamil_dataset>',
+    'telugu_path_real': '<path_for_telegu_dataset>',
+}
+test_config.update(common_config)
+
+
+infer_config = {
+    'model_path': '<path_for_model>',
+    'img_path': 'image_path',
+    'folder_path': '<path_dataset_folder>',
+    'csv_path': '<csv_path>',
+}
+
+infer_config.update(common_config)
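Since every *_config dict pulls in common_config via update(), downstream code can read the shared ViT backbone name from any of them; a small sketch (the placeholder paths still need to be filled in before training or evaluation):

    from IndicPhotoOCR.script_identification.vit.config import infer_config, train_config

    print(infer_config['pretrained_vit_model'])   # 'google/vit-base-patch16-224-in21k'
    print(train_config['pretrained_vit_model'])   # same value, merged from common_config
    print(infer_config['model_path'])             # '<path_for_model>' placeholder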
IndicPhotoOCR/script_identification/vit/vit_infer.py ADDED
@@ -0,0 +1,213 @@
+from transformers import AutoImageProcessor, ViTForImageClassification, pipeline
+from PIL import Image
+from datasets import DatasetDict, Dataset, ClassLabel
+import torchvision.transforms as transforms
+import numpy as np
+import csv
+import os
+import argparse
+import requests
+from tqdm import tqdm
+import zipfile
+import time
+import glob
+from IndicPhotoOCR.script_identification.vit.config import infer_config as config
+
+model_info = {
+    "hindi": {
+        "path": "models/hindienglish",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglish.zip",
+        "subcategories": ["hindi", "english"]
+    },
+    "assamese": {
+        "path": "models/hindienglishassamese",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishassamese.zip",
+        "subcategories": ["hindi", "english", "assamese"]
+    },
+    "bengali": {
+        "path": "models/hindienglishbengali",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishbengali.zip",
+        "subcategories": ["hindi", "english", "bengali"]
+    },
+    "gujarati": {
+        "path": "models/hindienglishgujarati",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishgujarati.zip",
+        "subcategories": ["hindi", "english", "gujarati"]
+    },
+    "kannada": {
+        "path": "models/hindienglishkannada",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishkannada.zip",
+        "subcategories": ["hindi", "english", "kannada"]
+    },
+    "malayalam": {
+        "path": "models/hindienglishmalayalam",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishmalayalam.zip",
+        "subcategories": ["hindi", "english", "malayalam"]
+    },
+    "marathi": {
+        "path": "models/hindienglishmarathi",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishmarathi.zip",
+        "subcategories": ["hindi", "english", "marathi"]
+    },
+    "meitei": {
+        "path": "models/hindienglishmeitei",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishmeitei.zip",
+        "subcategories": ["hindi", "english", "meitei"]
+    },
+    "odia": {
+        "path": "models/hindienglishodia",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishodia.zip",
+        "subcategories": ["hindi", "english", "odia"]
+    },
+    "punjabi": {
+        "path": "models/hindienglishpunjabi",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishpunjabi.zip",
+        "subcategories": ["hindi", "english", "punjabi"]
+    },
+    "tamil": {
+        "path": "models/hindienglishtamil",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishtamil.zip",
+        "subcategories": ["hindi", "english", "tamil"]
+    },
+    "telugu": {
+        "path": "models/hindienglishtelugu",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/hindienglishtelugu.zip",
+        "subcategories": ["hindi", "english", "telugu"]
+    },
+    "12C": {
+        "path": "models/12_classes",
+        "url": "https://github.com/Bhashini-IITJ/ScriptIdentification/releases/download/Vit_Models/12_classes.zip",
+        "subcategories": ["hindi", "english", "assamese", "bengali", "gujarati", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telegu"]
+    },
+}
+
+pretrained_vit_model = config['pretrained_vit_model']
+processor = AutoImageProcessor.from_pretrained(pretrained_vit_model, use_fast=True)
+
+
+class VIT_identifier:
+    def __init__(self):
+        pass
+
+    def unzip_file(self, zip_path, extract_to):
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_to)
+        print(f"Extracted files to {extract_to}")
+
+    def ensure_model(self, model_name):
+        model_path = model_info[model_name]["path"]
+        url = model_info[model_name]["url"]
+        root_model_dir = "IndicPhotoOCR/script_identification/vit"
+        model_path = os.path.join(root_model_dir, model_path)
+
+        if not os.path.exists(model_path):
+            print(f"Model not found locally. Downloading {model_name} from {url}...")
+
+            response = requests.get(url, stream=True)
+            zip_path = os.path.join(model_path, "temp_download.zip")
+
+            os.makedirs(model_path, exist_ok=True)
+
+            with open(zip_path, "wb") as file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    file.write(chunk)
+
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(model_path)
+
+            os.remove(zip_path)
+
+            print(f"Downloaded and extracted to {model_path}")
+        else:
+            # print(f"Model folder already exists: {model_path}")
+            pass
+
+        return model_path
+
+    def identify(self, image_path, model_name):
+        model_path = self.ensure_model(model_name)
+
+        vit = ViTForImageClassification.from_pretrained(model_path)
+        model = pipeline('image-classification', model=vit, feature_extractor=processor, device=0)
+
+        if image_path.endswith((".png", ".jpg", ".jpeg")):
+            image = Image.open(image_path)
+            output = model(image)
+            predicted_label = max(output, key=lambda x: x['score'])['label']
+
+            # print(f"image_path: {image_path}, predicted_label: {predicted_label}\n")
+
+            return predicted_label
+
+    def predict_batch(self, image_dir, model_name, time_show, output_csv="prediction.csv"):
+        model_path = self.ensure_model(model_name)
+        vit = ViTForImageClassification.from_pretrained(model_path)
+        model = pipeline('image-classification', model=vit, feature_extractor=processor, device=0)
+
+        start_time = time.time()
+        results = []
+        image_count = 0
+        for filename in os.listdir(image_dir):
+            if filename.endswith((".png", ".jpg", ".jpeg")):
+                img_path = os.path.join(image_dir, filename)
+                image = Image.open(img_path)
+
+                output = model(image)
+                predicted_label = max(output, key=lambda x: x['score'])['label'].capitalize()
+
+                results.append({"Filepath": filename, "Language": predicted_label})
+                image_count += 1
+
+        elapsed_time = time.time() - start_time
+
+        if time_show:
+            print(f"Time taken to process {image_count} images: {elapsed_time:.2f} seconds")
+
+        with open(output_csv, mode="w", newline="", encoding="utf-8") as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=["Filepath", "Language"])
+            writer.writeheader()
+            writer.writerows(results)
+
+        return output_csv
+
+
+# if __name__ == "__main__":
+#     # Argument parser for command line usage
+#     parser = argparse.ArgumentParser(description="Image classification using CLIP fine-tuned model")
+#     parser.add_argument("--image_path", type=str, help="Path to the input image")
+#     parser.add_argument("--image_dir", type=str, help="Path to the input image directory")
+#     parser.add_argument("--model_name", type=str, choices=model_info.keys(), help="Name of the model (e.g., hineng, hinengpun, hinengguj)")
+#     parser.add_argument("--batch", action="store_true", help="Process images in batch mode if specified")
+#     parser.add_argument("--time", type=bool, nargs="?", const=True, default=False, help="Prints the time required to process a batch of images")
+
+#     args = parser.parse_args()
+
+#     # Choose function based on the batch parameter
+#     if args.batch:
+#         if not args.image_dir:
+#             print("Error: image_dir is required when batch is set to True.")
+#         else:
+#             result = predict_batch(args.image_dir, args.model_name, args.time)
+#             print(result)
+#     else:
+#         if not args.image_path:
+#             print("Error: image_path is required when batch is not set.")
+#         else:
+#             result = predict(args.image_path, args.model_name)
+#             print(result)
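A hedged usage sketch for the new identifier: identify() downloads the requested model bundle from the release URL on first use, then returns the predicted script label for a single word crop. The crop path below is a placeholder, and note that the pipeline is created with device=0, so a CUDA device is expected:

    from IndicPhotoOCR.script_identification.vit.vit_infer import VIT_identifier

    identifier = VIT_identifier()
    # "hindi" selects the hindi/english bundle listed in model_info above.
    label = identifier.identify("test_images/cropped_image/image_141_0.jpg", "hindi")
    print(label)   # e.g. 'hindi' or 'english'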
IndicPhotoOCR/utils/__init__.py ADDED
File without changes
IndicPhotoOCR/utils/helper.py ADDED
@@ -0,0 +1,220 @@
+import numpy as np
+
+# def detect_para(bbox_dict):
+#     alpha1 = 0.2
+#     alpha2 = 0.7
+#     beta1 = 0.4
+#     data = bbox_dict
+#     word_crops = list(data.keys())
+#     for i in word_crops:
+#         data[i]["x1"], data[i]["y1"], data[i]["x2"], data[i]["y2"] = data[i]["bbox"]
+#         data[i]["xc"] = (data[i]["x1"] + data[i]["x2"]) / 2
+#         data[i]["yc"] = (data[i]["y1"] + data[i]["y2"]) / 2
+#         data[i]["w"] = data[i]["x2"] - data[i]["x1"]
+#         data[i]["h"] = data[i]["y2"] - data[i]["y1"]
+
+#     patch_info = {}
+#     while word_crops:
+#         img_name = word_crops[0].split("_")[0]
+#         word_crop_collection = [
+#             word_crop for word_crop in word_crops if word_crop.startswith(img_name)
+#         ]
+#         centroids = {}
+#         lines = []
+#         img_word_crops = word_crop_collection.copy()
+#         para = []
+#         while img_word_crops:
+#             clusters = []
+#             para_words_group = [
+#                 img_word_crops[0],
+#             ]
+#             added = [
+#                 img_word_crops[0],
+#             ]
+#             img_word_crops.remove(img_word_crops[0])
+#             ## determining the paragraph
+#             while added:
+#                 word_crop = added.pop()
+#                 for i in range(len(img_word_crops)):
+#                     word_crop_ = img_word_crops[i]
+#                     if (
+#                         abs(data[word_crop_]["yc"] - data[word_crop]["yc"])
+#                         < data[word_crop]["h"] * alpha1
+#                     ):
+#                         if data[word_crop]["xc"] > data[word_crop_]["xc"]:
+#                             if (data[word_crop]["x1"] - data[word_crop_]["x2"]) < data[
+#                                 word_crop
+#                             ]["h"] * alpha2:
+#                                 para_words_group.append(word_crop_)
+#                                 added.append(word_crop_)
+#                         else:
+#                             if (data[word_crop_]["x1"] - data[word_crop]["x2"]) < data[
+#                                 word_crop
+#                             ]["h"] * alpha2:
+#                                 para_words_group.append(word_crop_)
+#                                 added.append(word_crop_)
+#                     else:
+#                         if data[word_crop]["yc"] > data[word_crop_]["yc"]:
+#                             if (data[word_crop]["y1"] - data[word_crop_]["y2"]) < data[
+#                                 word_crop
+#                             ]["h"] * beta1 and (
+#                                 (
+#                                     (data[word_crop_]["x1"] < data[word_crop]["x2"])
+#                                     and (data[word_crop_]["x1"] > data[word_crop]["x1"])
+#                                 )
+#                                 or (
+#                                     (data[word_crop_]["x2"] < data[word_crop]["x2"])
+#                                     and (data[word_crop_]["x2"] > data[word_crop]["x1"])
+#                                 )
+#                                 or (
+#                                     (data[word_crop]["x1"] > data[word_crop_]["x1"])
+#                                     and (data[word_crop]["x2"] < data[word_crop_]["x2"])
+#                                 )
+#                             ):
+#                                 para_words_group.append(word_crop_)
+#                                 added.append(word_crop_)
+#                         else:
+#                             if (data[word_crop_]["y1"] - data[word_crop]["y2"]) < data[
+#                                 word_crop
+#                             ]["h"] * beta1 and (
+#                                 (
+#                                     (data[word_crop_]["x1"] < data[word_crop]["x2"])
+#                                     and (data[word_crop_]["x1"] > data[word_crop]["x1"])
+#                                 )
+#                                 or (
+#                                     (data[word_crop_]["x2"] < data[word_crop]["x2"])
+#                                     and (data[word_crop_]["x2"] > data[word_crop]["x1"])
+#                                 )
+#                                 or (
+#                                     (data[word_crop]["x1"] > data[word_crop_]["x1"])
+#                                     and (data[word_crop]["x2"] < data[word_crop_]["x2"])
+#                                 )
+#                             ):
+#                                 para_words_group.append(word_crop_)
+#                                 added.append(word_crop_)
+#             img_word_crops = [p for p in img_word_crops if p not in para_words_group]
+#             ## processing for the line
+#             while para_words_group:
+#                 line_words_group = [
+#                     para_words_group[0],
+#                 ]
+#                 added = [
+#                     para_words_group[0],
+#                 ]
+#                 para_words_group.remove(para_words_group[0])
+#                 ## determining the line
+#                 while added:
+#                     word_crop = added.pop()
+#                     for i in range(len(para_words_group)):
+#                         word_crop_ = para_words_group[i]
+#                         if (
+#                             abs(data[word_crop_]["yc"] - data[word_crop]["yc"])
+#                             < data[word_crop]["h"] * alpha1
+#                         ):
+#                             if data[word_crop]["xc"] > data[word_crop_]["xc"]:
+#                                 if (data[word_crop]["x1"] - data[word_crop_]["x2"]) < data[
+#                                     word_crop
+#                                 ]["h"] * alpha2:
+#                                     line_words_group.append(word_crop_)
+#                                     added.append(word_crop_)
+#                             else:
+#                                 if (data[word_crop_]["x1"] - data[word_crop]["x2"]) < data[
+#                                     word_crop
+#                                 ]["h"] * alpha2:
+#                                     line_words_group.append(word_crop_)
+#                                     added.append(word_crop_)
+#                 para_words_group = [
+#                     p for p in para_words_group if p not in line_words_group
+#                 ]
+#                 xc = [data[word_crop]["xc"] for word_crop in line_words_group]
+#                 idxs = np.argsort(xc)
+#                 patch_cluster_ = [line_words_group[i] for i in idxs]
+#                 line_words_group = patch_cluster_
+#                 x1 = [data[word_crop]["x1"] for word_crop in line_words_group]
+#                 x2 = [data[word_crop]["x2"] for word_crop in line_words_group]
+#                 y1 = [data[word_crop]["y1"] for word_crop in line_words_group]
+#                 y2 = [data[word_crop]["y2"] for word_crop in line_words_group]
+#                 txt_line = [data[word_crop]["txt"] for word_crop in line_words_group]
+#                 txt = " ".join(txt_line)
+#                 x = [x1[0]]
+#                 y1_ = [y1[0]]
+#                 y2_ = [y2[0]]
+#                 l = [len(txt_l) for txt_l in txt_line]
+#                 for i in range(1, len(x1)):
+#                     x.append((x1[i] + x2[i - 1]) / 2)
+#                     y1_.append((y1[i] + y1[i - 1]) / 2)
+#                     y2_.append((y2[i] + y2[i - 1]) / 2)
+#                 x.append(x2[-1])
+#                 y1_.append(y1[-1])
+#                 y2_.append(y2[-1])
+#                 line_info = {
+#                     "x": x,
+#                     "y1": y1_,
+#                     "y2": y2_,
+#                     "l": l,
+#                     "txt": txt,
+#                     "word_crops": line_words_group,
+#                 }
+#                 clusters.append(line_info)
+#             y_ = [clusters[i]["y1"][0] for i in range(len(clusters))]
+#             idxs = np.argsort(y_)
+#             clusters_ = [clusters[i] for i in idxs]
+#             txt = [clusters[i]["txt"] for i in idxs]
+#             l = [len(t) for t in txt]
+#             txt = " ".join(txt)
+#             para_info = {"lines": clusters_, "l": l, "txt": txt}
+#             para.append(para_info)
+
+#         for word_crop in word_crop_collection:
+#             word_crops.remove(word_crop)
+#     return "\n".join([para[i]["txt"] for i in range(len(para))])
+
+
+def detect_para(recognized_texts):
+    """
+    Sort words into lines based on vertical overlap of bounding boxes.
+
+    Args:
+        recognized_texts (dict): A dictionary keyed by crop id, where each value holds the
+            recognized text ("txt") and its bounding box ("bbox") as [x1, y1, x2, y2].
+
+    Returns:
+        list: A list of lists where each sublist contains words sorted by x-coordinate for a single line.
+    """
+    def calculate_overlap(bbox1, bbox2):
+        """Calculate the vertical overlap between two bounding boxes."""
+        # Extract bounding box coordinates
+        x1_1, y1_1, x2_1, y2_1 = bbox1
+        x1_2, y1_2, x2_2, y2_2 = bbox2
+
+        overlap = max(0, min(y2_1, y2_2) - max(y1_1, y1_2))
+        height = min(y2_1 - y1_1, y2_2 - y1_2)
+        return overlap / height if height > 0 else 0
+
+    # Convert recognized_texts dictionary to a list of tuples for processing
+    items = list(recognized_texts.items())
+    lines = []
+
+    while items:
+        current_image, current_data = items.pop(0)
+        current_text, current_bbox = current_data['txt'], current_data['bbox']
+        current_line = [(current_text, current_bbox)]
+
+        remaining_items = []
+        for image, data in items:
+            text, bbox = data['txt'], data['bbox']
+            if calculate_overlap(current_bbox, bbox) > 0.4:
+                current_line.append((text, bbox))
+            else:
+                remaining_items.append((image, data))
+
+        items = remaining_items
+        lines.append(current_line)
+
+    # Sort words within each line based on x1 (horizontal position)
+    sorted_lines = [
+        [text for text, bbox in sorted(line, key=lambda x: x[1][0])] for line in lines
+    ]
+    return sorted_lines
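A toy example of how detect_para() groups words, assuming the bbox format produced in ocr.py (an axis-aligned [x1, y1, x2, y2] per recognised crop): boxes whose vertical overlap exceeds 0.4 of the smaller box height land on the same line, and each line is then sorted left to right:

    from IndicPhotoOCR.utils.helper import detect_para

    recognized_texts = {
        "img_0": {"txt": "hello", "bbox": [10, 10, 60, 30]},
        "img_1": {"txt": "world", "bbox": [70, 12, 130, 32]},   # overlaps img_0 vertically -> same line
        "img_2": {"txt": "below", "bbox": [10, 50, 70, 70]},    # no vertical overlap -> new line
    }
    print(detect_para(recognized_texts))   # [['hello', 'world'], ['below']]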
app.py CHANGED
@@ -1,12 +1,12 @@
-# This is a working demo - textbpn++ - CLIP - Parseq
 import gradio as gr
 from PIL import Image
 import os
 from IndicPhotoOCR.ocr import OCR  # Ensure OCR class is saved in a file named ocr.py
 from IndicPhotoOCR.theme import Seafoam
+from IndicPhotoOCR.utils.helper import detect_para
 
 # Initialize the OCR object for text detection and recognition
-ocr = OCR(device="cpu", verbose=False)
+ocr = OCR(verbose=False)
 
 def process_image(image):
     """
@@ -36,21 +36,37 @@ def process_image(image):
     output_image = Image.open("output_image.png")
 
     # Initialize list to hold recognized text from each detected area
-    recognized_texts = []
+    recognized_texts = {}
     pil_image = Image.open(image_path)
 
-    # Process each detected bounding box for script identification and text recognition
-    for bbox in detections:
+    # # Process each detected bounding box for script identification and text recognition
+    # for bbox in detections:
+    #     # Identify the script and crop the image to this region
+    #     script_lang, cropped_path = ocr.crop_and_identify_script(pil_image, bbox)
+
+    #     if script_lang:  # Only proceed if a script language is identified
+    #         # Recognize text in the cropped area
+    #         recognized_text = ocr.recognise(cropped_path, script_lang)
+    #         recognized_texts.append(recognized_text)
+    for id, bbox in enumerate(detections):
         # Identify the script and crop the image to this region
         script_lang, cropped_path = ocr.crop_and_identify_script(pil_image, bbox)
-
-        if script_lang:  # Only proceed if a script language is identified
-            # Recognize text in the cropped area
+
+        # Calculate bounding box coordinates
+        x1 = min([bbox[i][0] for i in range(len(bbox))])
+        y1 = min([bbox[i][1] for i in range(len(bbox))])
+        x2 = max([bbox[i][0] for i in range(len(bbox))])
+        y2 = max([bbox[i][1] for i in range(len(bbox))])
+
+        if script_lang:
            recognized_text = ocr.recognise(cropped_path, script_lang)
-            recognized_texts.append(recognized_text)
-
+            recognized_texts[f"img_{id}"] = {"txt": recognized_text, "bbox": [x1, y1, x2, y2]}
+
     # Combine recognized texts into a single string for display
-    recognized_texts_combined = " ".join(recognized_texts)
+    # recognized_texts_combined = " ".join(recognized_texts)
+    string = detect_para(recognized_texts)
+    recognized_texts_combined = '\n'.join([' '.join(line) for line in string])
+
     return output_image, recognized_texts_combined
 
 # Custom HTML for interface header with logos and alignment
@@ -110,10 +126,10 @@ demo = gr.Interface(
     examples=examples
 )
 
-# # Server setup and launch configuration
-# if __name__ == "__main__":
-#     server = "0.0.0.0"  # IP address for server
-#     port = 7865  # Port to run the server on
-#     demo.launch(server_name=server, server_port=port)
+# Server setup and launch configuration
+if __name__ == "__main__":
+    server = "0.0.0.0"  # IP address for server
+    port = 7865  # Port to run the server on
+    demo.launch(server_name=server, server_port=port, share=True)
 
-demo.launch()
+# demo.launch()
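The same axis-aligned box computation appears in both app.py and ocr.py; a standalone illustration, assuming each detection is a polygon given as a list of (x, y) points as returned by the TextBPN++ detector:

    # Hypothetical 4-point detection polygon.
    bbox = [(120, 40), (260, 42), (258, 88), (118, 86)]
    x1 = min(point[0] for point in bbox)
    y1 = min(point[1] for point in bbox)
    x2 = max(point[0] for point in bbox)
    y2 = max(point[1] for point in bbox)
    print([x1, y1, x2, y2])   # [118, 40, 260, 88]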
requirements.txt CHANGED
@@ -43,4 +43,5 @@ torch==2.5.0
 torchvision==0.20.0
 easydict==1.13
 scipy==1.13.1
-
+transformers==4.45.1
+datasets==3.1.0
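A quick sanity check that the two newly pinned libraries import at the expected versions (a sketch; adjust if the pins change):

    import transformers
    import datasets

    print(transformers.__version__)   # expected 4.45.1
    print(datasets.__version__)       # expected 3.1.0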