Upload 18 files
- .gitattributes +4 -0
- src/crud/.ipynb_checkpoints/vector_store-checkpoint.py +140 -0
- src/crud/__pycache__/vector_store.cpython-313.pyc +0 -0
- src/crud/vector_store.py +140 -0
- src/data/.DS_Store +0 -0
- src/data/images/car_1.jpg +3 -0
- src/data/images/car_2.jpg +3 -0
- src/data/images/cat_1.jpg +0 -0
- src/data/images/cat_2.jpg +0 -0
- src/data/images/cat_3.jpg +0 -0
- src/data/images/motorcycle_1.jpg +0 -0
- src/data/images/motorcycle_2.jpg +3 -0
- src/data/images/motorcycle_3.jpg +3 -0
- src/preprocess/.ipynb_checkpoints/embedding-checkpoint.py +69 -0
- src/preprocess/.ipynb_checkpoints/preprocessing-checkpoint.py +65 -0
- src/preprocess/__pycache__/embedding.cpython-313.pyc +0 -0
- src/preprocess/__pycache__/preprocessing.cpython-313.pyc +0 -0
- src/preprocess/embedding.py +69 -0
- src/preprocess/preprocessing.py +65 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+src/data/images/car_1.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/car_2.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/motorcycle_2.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/motorcycle_3.jpg filter=lfs diff=lfs merge=lfs -text
src/crud/.ipynb_checkpoints/vector_store-checkpoint.py
ADDED
@@ -0,0 +1,140 @@
(Contents identical to src/crud/vector_store.py, shown in full below; this is a stray Jupyter checkpoint copy of the same module.)
src/crud/__pycache__/vector_store.cpython-313.pyc
ADDED
Binary file (6.15 kB)
src/crud/vector_store.py
ADDED
@@ -0,0 +1,140 @@
from typing import Any, Iterable, List, Optional
import uuid

from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores.lancedb import LanceDB


class MultimodalLanceDB(LanceDB):
    """`LanceDB` vector store extended to handle multimodal (text + image) data.

    Parameters:
    -----------
    connection: Any
        LanceDB connection to use. If not provided, a new connection will be created.
    embedding: Embeddings
        Embedding model to use for the vectorstore.
    vector_key: str
        Key to use for the vector in the database. Defaults to ``vector``.
    id_key: str
        Key to use for the id in the database. Defaults to ``id``.
    text_key: str
        Key to use for the text in the database. Defaults to ``text``.
    image_path_key: str
        Key to use for the path to the image in the database. Defaults to ``image_path``.
    table_name: str
        Name of the table to use. Defaults to ``vectorstore``.
    api_key: str
        API key to use for the LanceDB cloud database.
    region: str
        Region to use for the LanceDB cloud database.
    mode: str
        Mode to use when adding data to the table. Defaults to ``append``.
    """

    def __init__(
        self,
        connection: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        uri: Optional[str] = "/tmp/lancedb",
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        api_key: Optional[str] = None,
        region: Optional[str] = None,
        mode: Optional[str] = "append",
    ):
        # reuse the LanceDB initializer, then record the extra key used for image paths
        super().__init__(
            connection, embedding, uri, vector_key, id_key,
            text_key, table_name, api_key, region, mode,
        )
        self._image_path_key = image_path_key

    def add_text_image_pairs(
        self,
        texts: Iterable[str],
        image_paths: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Turn text-image pairs into embeddings and add them to the database.

        Parameters:
        -----------
        texts: Iterable[str]
            Iterable of strings to combine with the corresponding images and add to the vectorstore.
        image_paths: Iterable[str]
            Iterable of image paths (as strings) to combine with the corresponding texts and add to the vectorstore.
        metadatas: List[dict]
            Optional list of metadata dicts associated with the texts.
        ids: List[str]
            Optional list of ids to associate with the texts.

        Returns:
        --------
        List of ids of the added text-image pairs.
        """
        # materialize the iterables so they can be sized and indexed
        texts = list(texts)
        image_paths = list(image_paths)
        # the number of texts must equal the number of images
        assert len(texts) == len(image_paths), "the number of texts must equal the number of image paths"

        print(f"The length of texts is {len(texts)}")

        # Embed the text-image pairs and build one document per pair
        docs = []
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._embedding.embed_image_text_pairs(texts=texts, images=image_paths)  # type: ignore
        for idx, text in enumerate(texts):
            embedding = embeddings[idx]
            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
            docs.append(
                {
                    self._vector_key: embedding,
                    self._id_key: ids[idx],
                    self._text_key: text,
                    self._image_path_key: image_paths[idx],
                    "metadata": metadata,
                }
            )
        print(f"Adding {len(docs)} text-image pairs to the vectorstore...")

        # an explicit mode passed via kwargs overrides the instance default
        mode = kwargs["mode"] if "mode" in kwargs else self.mode
        if self._table_name in self._connection.table_names():
            tbl = self._connection.open_table(self._table_name)
            if self.api_key is None:
                # local LanceDB supports choosing append/overwrite when adding
                tbl.add(docs, mode=mode)
            else:
                tbl.add(docs)
        else:
            self._connection.create_table(self._table_name, data=docs)
        return ids

    @classmethod
    def from_text_image_pairs(
        cls,
        texts: List[str],
        image_paths: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection: Any = None,
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        **kwargs: Any,
    ):
        # build the store, then ingest the pairs in one call
        instance = MultimodalLanceDB(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            image_path_key=image_path_key,
            table_name=table_name,
        )
        instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)

        return instance
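For orientation, a minimal usage sketch of the class added above. It assumes the src packages are importable, reuses two of the images uploaded in this commit, and relies on the BridgeTowerEmbeddings class added under src/preprocess/embedding.py; the captions, table name, and database path are illustrative assumptions, not part of this commit.

import lancedb

from src.crud.vector_store import MultimodalLanceDB
from src.preprocess.embedding import BridgeTowerEmbeddings

# hypothetical local database and sample captions for two images from this upload
db = lancedb.connect("/tmp/lancedb")
texts = ["a red sports car", "a cat sleeping on a sofa"]
image_paths = ["src/data/images/car_1.jpg", "src/data/images/cat_1.jpg"]

store = MultimodalLanceDB.from_text_image_pairs(
    texts=texts,
    image_paths=image_paths,
    embedding=BridgeTowerEmbeddings(),
    connection=db,
    table_name="demo_table",
)

# querying reuses similarity_search, inherited unchanged from the LanceDB base class
results = store.similarity_search("a fast car", k=1)
print(results)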
src/data/.DS_Store
ADDED
Binary file (6.15 kB)
src/data/images/car_1.jpg
ADDED
(image stored via Git LFS)
src/data/images/car_2.jpg
ADDED
(image stored via Git LFS)
src/data/images/cat_1.jpg
ADDED
src/data/images/cat_2.jpg
ADDED
src/data/images/cat_3.jpg
ADDED
src/data/images/motorcycle_1.jpg
ADDED
src/data/images/motorcycle_2.jpg
ADDED
(image stored via Git LFS)
src/data/images/motorcycle_3.jpg
ADDED
(image stored via Git LFS)
src/preprocess/.ipynb_checkpoints/embedding-checkpoint.py
ADDED
@@ -0,0 +1,69 @@
(Contents identical to src/preprocess/embedding.py, shown in full below; this is a stray Jupyter checkpoint copy of the same module.)
src/preprocess/.ipynb_checkpoints/preprocessing-checkpoint.py
ADDED
@@ -0,0 +1,65 @@
(Contents identical to src/preprocess/preprocessing.py, shown in full below; this is a stray Jupyter checkpoint copy of the same module.)
src/preprocess/__pycache__/embedding.cpython-313.pyc
ADDED
Binary file (2.9 kB)
src/preprocess/__pycache__/preprocessing.cpython-313.pyc
ADDED
Binary file (2.19 kB)
src/preprocess/embedding.py
ADDED
@@ -0,0 +1,69 @@
from typing import List

from tqdm import tqdm
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel

from utils import encode_image
from utils import bt_embeddings


class BridgeTowerEmbeddings(BaseModel, Embeddings):
    """BridgeTower embedding model."""

    def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size=2) -> List[List[float]]:
        """Embed a list of image-text pairs using BridgeTower.

        Parameters:
        -----------
        texts: List[str]
            The list of texts to embed.
        images: List[str]
            The list of image paths to embed.
        batch_size: int
            The batch size to process; defaults to 2.
            (Currently unused: pairs are embedded one at a time.)

        Returns:
        --------
        List of embeddings, one for each image-text pair.
        """

        # the number of texts must equal the number of images
        assert len(texts) == len(images), "the number of captions must equal the number of images"

        print(f"Embedding {len(texts)} image-text pairs...")

        embeddings = []
        for path_to_img, text in tqdm(zip(images, texts), total=len(images), desc="Processing pairs"):
            embedding = bt_embeddings(text, encode_image(path_to_img))
            embeddings.append(embedding)
        return embeddings

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using BridgeTower.

        Parameters:
        -----------
        texts: List[str]
            The list of texts to embed.

        Returns:
        --------
        List of embeddings, one for each text.
        """
        embeddings = []
        for text in texts:
            # text-only embedding: pair the text with an empty image payload
            embedding = bt_embeddings(text, "")
            embeddings.append(embedding)
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using BridgeTower.

        Parameters:
        -----------
        text: str
            The text to embed.

        Returns:
        --------
        Embedding for the text.
        """
        return self.embed_documents([text])[0]
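The two helpers imported at the top of this module, encode_image and bt_embeddings, come from a utils module that is not part of this upload. The sketch below shows the assumed contract only: encode_image reads an image file and returns it base64-encoded, and bt_embeddings takes a caption plus the encoded image and returns one joint embedding vector from a BridgeTower model; the actual model call (local or hosted) is an assumption and is left as a stub.

import base64
from typing import List


def encode_image(image_path: str) -> str:
    # read the image file and return it as a base64 string,
    # the format bt_embeddings is assumed to accept alongside the caption
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def bt_embeddings(text: str, base64_image: str) -> List[float]:
    # assumed to run a BridgeTower-style multimodal model (locally or via an
    # inference endpoint) and return one joint embedding for the text-image pair;
    # the concrete implementation is not included in this commit
    raise NotImplementedError("provide a BridgeTower inference call here")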
src/preprocess/preprocessing.py
ADDED
@@ -0,0 +1,65 @@
from os import path as osp
import json

import cv2
import webvtt

from utils import maintain_aspect_ratio_resize, str2time


def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):

    # metadatas will store the metadata of all extracted frames
    metadatas = []

    # load the video using cv2
    video = cv2.VideoCapture(path_to_video)
    # load the transcript using webvtt
    trans = webvtt.read(path_to_transcript)

    # iterate over the transcript file,
    # one pass per video segment specified in the transcript
    for idx, transcript in enumerate(trans):

        # get the start time and end time in milliseconds
        start_time_ms = str2time(transcript.start)
        end_time_ms = str2time(transcript.end)
        # get the time in ms exactly in the middle of start time and end time
        mid_time_ms = (end_time_ms + start_time_ms) / 2
        # get the transcript text, replacing newlines with spaces
        text = transcript.text.replace("\n", " ")
        # grab the frame at the middle time
        video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
        success, frame = video.read()
        if success:
            # if the frame is extracted successfully, resize it
            image = maintain_aspect_ratio_resize(frame, height=350)
            # save the frame as a JPEG file
            img_fname = f"frame_{idx}.jpg"
            img_fpath = osp.join(
                path_to_save_extracted_frames, img_fname
            )
            cv2.imwrite(img_fpath, image)

            # prepare the metadata for this frame
            metadata = {
                "extracted_frame_path": img_fpath,
                "transcript": text,
                "video_segment_id": idx,
                "video_path": path_to_video,
                "mid_time_ms": mid_time_ms,
            }
            metadatas.append(metadata)

        else:
            print(f"ERROR! Cannot extract frame: idx = {idx}")

    # save the metadata of all extracted frames
    fn = osp.join(path_to_save_metadatas, "metadatas.json")
    with open(fn, "w") as outfile:
        json.dump(metadatas, outfile)
    return metadatas
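A short usage sketch for the function above. The video and transcript paths are hypothetical (no video or .vtt files are included in this upload), and the utils helpers maintain_aspect_ratio_resize and str2time are assumed to be available on the import path.

import os

from src.preprocess.preprocessing import extract_and_save_frames_and_metadata

# hypothetical inputs: a video plus its WebVTT transcript
path_to_video = "src/data/videos/demo.mp4"
path_to_transcript = "src/data/videos/demo.vtt"
frames_dir = "src/data/frames"
os.makedirs(frames_dir, exist_ok=True)

metadatas = extract_and_save_frames_and_metadata(
    path_to_video,
    path_to_transcript,
    path_to_save_extracted_frames=frames_dir,
    path_to_save_metadatas="src/data",
)
print(f"extracted {len(metadatas)} frames; metadata written to src/data/metadatas.json")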