Upload 18 files
- .gitattributes +4 -0
- src/crud/.ipynb_checkpoints/vector_store-checkpoint.py +140 -0
- src/crud/__pycache__/vector_store.cpython-313.pyc +0 -0
- src/crud/vector_store.py +140 -0
- src/data/.DS_Store +0 -0
- src/data/images/car_1.jpg +3 -0
- src/data/images/car_2.jpg +3 -0
- src/data/images/cat_1.jpg +0 -0
- src/data/images/cat_2.jpg +0 -0
- src/data/images/cat_3.jpg +0 -0
- src/data/images/motorcycle_1.jpg +0 -0
- src/data/images/motorcycle_2.jpg +3 -0
- src/data/images/motorcycle_3.jpg +3 -0
- src/preprocess/.ipynb_checkpoints/embedding-checkpoint.py +69 -0
- src/preprocess/.ipynb_checkpoints/preprocessing-checkpoint.py +65 -0
- src/preprocess/__pycache__/embedding.cpython-313.pyc +0 -0
- src/preprocess/__pycache__/preprocessing.cpython-313.pyc +0 -0
- src/preprocess/embedding.py +69 -0
- src/preprocess/preprocessing.py +65 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+src/data/images/car_1.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/car_2.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/motorcycle_2.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/motorcycle_3.jpg filter=lfs diff=lfs merge=lfs -text
src/crud/.ipynb_checkpoints/vector_store-checkpoint.py
ADDED
@@ -0,0 +1,140 @@
(Contents identical to src/crud/vector_store.py, shown in full below; this is a stray Jupyter checkpoint copy of the same module.)
src/crud/__pycache__/vector_store.cpython-313.pyc
ADDED
Binary file (6.15 kB)
src/crud/vector_store.py
ADDED
@@ -0,0 +1,140 @@
from typing import Any, Iterable, List, Optional
import uuid

from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores.lancedb import LanceDB


class MultimodalLanceDB(LanceDB):
    """`LanceDB` vector store extended to handle multimodal (text + image) data.

    Parameters:
    -----------
    connection: Any
        LanceDB connection to use. If not provided, a new connection will be created.
    embedding: Embeddings
        Embedding model to use for the vectorstore.
    vector_key: str
        Key to use for the vector in the database. Defaults to ``vector``.
    id_key: str
        Key to use for the id in the database. Defaults to ``id``.
    text_key: str
        Key to use for the text in the database. Defaults to ``text``.
    image_path_key: str
        Key to use for the path to the image in the database. Defaults to ``image_path``.
    table_name: str
        Name of the table to use. Defaults to ``vectorstore``.
    api_key: str
        API key to use for the LanceDB cloud database.
    region: str
        Region to use for the LanceDB cloud database.
    mode: str
        Mode to use when adding data to the table. Defaults to ``append``.
    """

    def __init__(
        self,
        connection: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        uri: Optional[str] = "/tmp/lancedb",
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        api_key: Optional[str] = None,
        region: Optional[str] = None,
        mode: Optional[str] = "append",
    ):
        # reuse the LanceDB initializer, then record the extra key used for image paths
        super().__init__(
            connection, embedding, uri, vector_key, id_key,
            text_key, table_name, api_key, region, mode,
        )
        self._image_path_key = image_path_key

    def add_text_image_pairs(
        self,
        texts: Iterable[str],
        image_paths: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Turn text-image pairs into embeddings and add them to the database.

        Parameters:
        -----------
        texts: Iterable[str]
            Iterable of strings to combine with the corresponding images and add to the vectorstore.
        image_paths: Iterable[str]
            Iterable of image paths (as strings) to combine with the corresponding texts and add to the vectorstore.
        metadatas: List[dict]
            Optional list of metadata dicts associated with the texts.
        ids: List[str]
            Optional list of ids to associate with the texts.

        Returns:
        --------
        List of ids of the added text-image pairs.
        """
        # materialize the iterables so they can be sized and indexed
        texts = list(texts)
        image_paths = list(image_paths)
        # the number of texts must equal the number of images
        assert len(texts) == len(image_paths), "the number of texts must equal the number of image paths"

        print(f"The length of texts is {len(texts)}")

        # Embed the text-image pairs and build one document per pair
        docs = []
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._embedding.embed_image_text_pairs(texts=texts, images=image_paths)  # type: ignore
        for idx, text in enumerate(texts):
            embedding = embeddings[idx]
            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
            docs.append(
                {
                    self._vector_key: embedding,
                    self._id_key: ids[idx],
                    self._text_key: text,
                    self._image_path_key: image_paths[idx],
                    "metadata": metadata,
                }
            )
        print(f"Adding {len(docs)} text-image pairs to the vectorstore...")

        # an explicit mode passed via kwargs overrides the instance default
        mode = kwargs["mode"] if "mode" in kwargs else self.mode
        if self._table_name in self._connection.table_names():
            tbl = self._connection.open_table(self._table_name)
            if self.api_key is None:
                # local LanceDB supports choosing append/overwrite when adding
                tbl.add(docs, mode=mode)
            else:
                tbl.add(docs)
        else:
            self._connection.create_table(self._table_name, data=docs)
        return ids

    @classmethod
    def from_text_image_pairs(
        cls,
        texts: List[str],
        image_paths: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection: Any = None,
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        **kwargs: Any,
    ):
        # build the store, then ingest the pairs in one call
        instance = MultimodalLanceDB(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            image_path_key=image_path_key,
            table_name=table_name,
        )
        instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)

        return instance
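For orientation, a minimal usage sketch of the class added above. It assumes the src packages are importable, reuses two of the images uploaded in this commit, and relies on the BridgeTowerEmbeddings class added under src/preprocess/embedding.py; the captions, table name, and database path are illustrative assumptions, not part of this commit.

import lancedb

from src.crud.vector_store import MultimodalLanceDB
from src.preprocess.embedding import BridgeTowerEmbeddings

# hypothetical local database and sample captions for two images from this upload
db = lancedb.connect("/tmp/lancedb")
texts = ["a red sports car", "a cat sleeping on a sofa"]
image_paths = ["src/data/images/car_1.jpg", "src/data/images/cat_1.jpg"]

store = MultimodalLanceDB.from_text_image_pairs(
    texts=texts,
    image_paths=image_paths,
    embedding=BridgeTowerEmbeddings(),
    connection=db,
    table_name="demo_table",
)

# querying reuses similarity_search, inherited unchanged from the LanceDB base class
results = store.similarity_search("a fast car", k=1)
print(results)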
src/data/.DS_Store
ADDED
Binary file (6.15 kB)
src/data/images/car_1.jpg
ADDED
(image stored via Git LFS)
src/data/images/car_2.jpg
ADDED
(image stored via Git LFS)
src/data/images/cat_1.jpg
ADDED
src/data/images/cat_2.jpg
ADDED
src/data/images/cat_3.jpg
ADDED
src/data/images/motorcycle_1.jpg
ADDED
src/data/images/motorcycle_2.jpg
ADDED
(image stored via Git LFS)
src/data/images/motorcycle_3.jpg
ADDED
(image stored via Git LFS)
src/preprocess/.ipynb_checkpoints/embedding-checkpoint.py
ADDED
@@ -0,0 +1,69 @@
(Contents identical to src/preprocess/embedding.py, shown in full below; this is a stray Jupyter checkpoint copy of the same module.)
src/preprocess/.ipynb_checkpoints/preprocessing-checkpoint.py
ADDED
@@ -0,0 +1,65 @@
(Contents identical to src/preprocess/preprocessing.py, shown in full below; this is a stray Jupyter checkpoint copy of the same module.)
src/preprocess/__pycache__/embedding.cpython-313.pyc
ADDED
Binary file (2.9 kB)
src/preprocess/__pycache__/preprocessing.cpython-313.pyc
ADDED
Binary file (2.19 kB)
src/preprocess/embedding.py
ADDED
@@ -0,0 +1,69 @@
from typing import List

from tqdm import tqdm
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel

from utils import encode_image
from utils import bt_embeddings


class BridgeTowerEmbeddings(BaseModel, Embeddings):
    """BridgeTower embedding model."""

    def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size=2) -> List[List[float]]:
        """Embed a list of image-text pairs using BridgeTower.

        Parameters:
        -----------
        texts: List[str]
            The list of texts to embed.
        images: List[str]
            The list of image paths to embed.
        batch_size: int
            The batch size to process; defaults to 2.
            (Currently unused: pairs are embedded one at a time.)

        Returns:
        --------
        List of embeddings, one for each image-text pair.
        """

        # the number of texts must equal the number of images
        assert len(texts) == len(images), "the number of captions must equal the number of images"

        print(f"Embedding {len(texts)} image-text pairs...")

        embeddings = []
        for path_to_img, text in tqdm(zip(images, texts), total=len(images), desc="Processing pairs"):
            embedding = bt_embeddings(text, encode_image(path_to_img))
            embeddings.append(embedding)
        return embeddings

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using BridgeTower.

        Parameters:
        -----------
        texts: List[str]
            The list of texts to embed.

        Returns:
        --------
        List of embeddings, one for each text.
        """
        embeddings = []
        for text in texts:
            # text-only embedding: pair the text with an empty image payload
            embedding = bt_embeddings(text, "")
            embeddings.append(embedding)
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using BridgeTower.

        Parameters:
        -----------
        text: str
            The text to embed.

        Returns:
        --------
        Embedding for the text.
        """
        return self.embed_documents([text])[0]
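The two helpers imported at the top of this module, encode_image and bt_embeddings, come from a utils module that is not part of this upload. The sketch below shows the assumed contract only: encode_image reads an image file and returns it base64-encoded, and bt_embeddings takes a caption plus the encoded image and returns one joint embedding vector from a BridgeTower model; the actual model call (local or hosted) is an assumption and is left as a stub.

import base64
from typing import List


def encode_image(image_path: str) -> str:
    # read the image file and return it as a base64 string,
    # the format bt_embeddings is assumed to accept alongside the caption
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def bt_embeddings(text: str, base64_image: str) -> List[float]:
    # assumed to run a BridgeTower-style multimodal model (locally or via an
    # inference endpoint) and return one joint embedding for the text-image pair;
    # the concrete implementation is not included in this commit
    raise NotImplementedError("provide a BridgeTower inference call here")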
src/preprocess/preprocessing.py
ADDED
@@ -0,0 +1,65 @@
from os import path as osp
import json

import cv2
import webvtt

from utils import maintain_aspect_ratio_resize, str2time


def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):

    # metadatas will store the metadata of all extracted frames
    metadatas = []

    # load the video using cv2
    video = cv2.VideoCapture(path_to_video)
    # load the transcript using webvtt
    trans = webvtt.read(path_to_transcript)

    # iterate over the transcript file,
    # one pass per video segment specified in the transcript
    for idx, transcript in enumerate(trans):

        # get the start time and end time in milliseconds
        start_time_ms = str2time(transcript.start)
        end_time_ms = str2time(transcript.end)
        # get the time in ms exactly in the middle of start time and end time
        mid_time_ms = (end_time_ms + start_time_ms) / 2
        # get the transcript text, replacing newlines with spaces
        text = transcript.text.replace("\n", " ")
        # grab the frame at the middle time
        video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
        success, frame = video.read()
        if success:
            # if the frame is extracted successfully, resize it
            image = maintain_aspect_ratio_resize(frame, height=350)
            # save the frame as a JPEG file
            img_fname = f"frame_{idx}.jpg"
            img_fpath = osp.join(
                path_to_save_extracted_frames, img_fname
            )
            cv2.imwrite(img_fpath, image)

            # prepare the metadata for this frame
            metadata = {
                "extracted_frame_path": img_fpath,
                "transcript": text,
                "video_segment_id": idx,
                "video_path": path_to_video,
                "mid_time_ms": mid_time_ms,
            }
            metadatas.append(metadata)

        else:
            print(f"ERROR! Cannot extract frame: idx = {idx}")

    # save the metadata of all extracted frames
    fn = osp.join(path_to_save_metadatas, "metadatas.json")
    with open(fn, "w") as outfile:
        json.dump(metadatas, outfile)
    return metadatas
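A short usage sketch for the function above. The video and transcript paths are hypothetical (no video or .vtt files are included in this upload), and the utils helpers maintain_aspect_ratio_resize and str2time are assumed to be available on the import path.

import os

from src.preprocess.preprocessing import extract_and_save_frames_and_metadata

# hypothetical inputs: a video plus its WebVTT transcript
path_to_video = "src/data/videos/demo.mp4"
path_to_transcript = "src/data/videos/demo.vtt"
frames_dir = "src/data/frames"
os.makedirs(frames_dir, exist_ok=True)

metadatas = extract_and_save_frames_and_metadata(
    path_to_video,
    path_to_transcript,
    path_to_save_extracted_frames=frames_dir,
    path_to_save_metadatas="src/data",
)
print(f"extracted {len(metadatas)} frames; metadata written to src/data/metadatas.json")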