doggdad committed on
Commit 21cbf97 · verified · 1 Parent(s): c4a0945

Upload 18 files

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+src/data/images/car_1.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/car_2.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/motorcycle_2.jpg filter=lfs diff=lfs merge=lfs -text
+src/data/images/motorcycle_3.jpg filter=lfs diff=lfs merge=lfs -text
src/crud/.ipynb_checkpoints/vector_store-checkpoint.py ADDED
@@ -0,0 +1,140 @@
+from typing import Any, Iterable, List, Optional
+import uuid
+
+from langchain_core.embeddings import Embeddings
+from langchain_community.vectorstores.lancedb import LanceDB
+
+
+class MultimodalLanceDB(LanceDB):
+    """`LanceDB` vector store for multimodal (text + image) data.
+
+    Parameters:
+    -----------
+    connection: Any
+        LanceDB connection to use. If not provided, a new connection will be created.
+    embedding: Embeddings
+        Embedding model to use for the vectorstore.
+    uri: str
+        URI of the LanceDB database. Defaults to ``/tmp/lancedb``.
+    vector_key: str
+        Key to use for the vector in the database. Defaults to ``vector``.
+    id_key: str
+        Key to use for the id in the database. Defaults to ``id``.
+    text_key: str
+        Key to use for the text in the database. Defaults to ``text``.
+    image_path_key: str
+        Key to use for the image path in the database. Defaults to ``image_path``.
+    table_name: str
+        Name of the table to use. Defaults to ``vectorstore``.
+    api_key: str
+        API key to use for the LanceDB cloud database.
+    region: str
+        Region to use for the LanceDB cloud database.
+    mode: str
+        Mode to use when adding data to the table. Defaults to ``append``.
+    """
+
+    def __init__(
+        self,
+        connection: Optional[Any] = None,
+        embedding: Optional[Embeddings] = None,
+        uri: Optional[str] = "/tmp/lancedb",
+        vector_key: Optional[str] = "vector",
+        id_key: Optional[str] = "id",
+        text_key: Optional[str] = "text",
+        image_path_key: Optional[str] = "image_path",
+        table_name: Optional[str] = "vectorstore",
+        api_key: Optional[str] = None,
+        region: Optional[str] = None,
+        mode: Optional[str] = "append",
+    ):
+        super().__init__(
+            connection=connection,
+            embedding=embedding,
+            uri=uri,
+            vector_key=vector_key,
+            id_key=id_key,
+            text_key=text_key,
+            table_name=table_name,
+            api_key=api_key,
+            region=region,
+            mode=mode,
+        )
+        self._image_path_key = image_path_key
+
+    def add_text_image_pairs(
+        self,
+        texts: Iterable[str],
+        image_paths: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Embed text-image pairs and add them to the database.
+
+        Parameters:
+        -----------
+        texts: Iterable[str]
+            Iterable of strings to combine with the corresponding images and add to the vectorstore.
+        image_paths: Iterable[str]
+            Iterable of image paths (as strings) to combine with the corresponding texts and add to the vectorstore.
+        metadatas: Optional[List[dict]]
+            Optional list of metadata dicts associated with the texts.
+        ids: Optional[List[str]]
+            Optional list of ids to associate with the texts.
+
+        Returns:
+        --------
+        List of ids of the added text-image pairs.
+        """
+        texts = list(texts)
+        image_paths = list(image_paths)
+        # the number of texts must equal the number of images
+        assert len(texts) == len(image_paths), "the number of texts must equal the number of image paths"
+
+        print(f'The length of texts is {len(texts)}')
+
+        # Embed the pairs and build one document per pair
+        docs = []
+        ids = ids or [str(uuid.uuid4()) for _ in texts]
+        embeddings = self._embedding.embed_image_text_pairs(texts=texts, images=image_paths)  # type: ignore
+        for idx, text in enumerate(texts):
+            embedding = embeddings[idx]
+            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
+            docs.append(
+                {
+                    self._vector_key: embedding,
+                    self._id_key: ids[idx],
+                    self._text_key: text,
+                    self._image_path_key: image_paths[idx],
+                    "metadata": metadata,
+                }
+            )
+        print(f'Adding {len(docs)} text-image pairs to the vectorstore...')
+
+        # allow callers to override the write mode per call
+        mode = kwargs.get('mode', self.mode)
+        if self._table_name in self._connection.table_names():
+            tbl = self._connection.open_table(self._table_name)
+            if self.api_key is None:
+                # local table: honour the requested write mode
+                tbl.add(docs, mode=mode)
+            else:
+                # LanceDB cloud table
+                tbl.add(docs)
+        else:
+            self._connection.create_table(self._table_name, data=docs)
+        return ids
+
+    @classmethod
+    def from_text_image_pairs(
+        cls,
+        texts: List[str],
+        image_paths: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        connection: Any = None,
+        vector_key: Optional[str] = "vector",
+        id_key: Optional[str] = "id",
+        text_key: Optional[str] = "text",
+        image_path_key: Optional[str] = "image_path",
+        table_name: Optional[str] = "vectorstore",
+        **kwargs: Any,
+    ):
+        """Create a MultimodalLanceDB vectorstore and populate it with text-image pairs."""
+        instance = MultimodalLanceDB(
+            connection=connection,
+            embedding=embedding,
+            vector_key=vector_key,
+            id_key=id_key,
+            text_key=text_key,
+            image_path_key=image_path_key,
+            table_name=table_name,
+        )
+        instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)
+        return instance
src/crud/__pycache__/vector_store.cpython-313.pyc ADDED
Binary file (6.15 kB).
 
src/crud/vector_store.py ADDED
@@ -0,0 +1,140 @@
+from typing import Any, Iterable, List, Optional
+import uuid
+
+from langchain_core.embeddings import Embeddings
+from langchain_community.vectorstores.lancedb import LanceDB
+
+
+class MultimodalLanceDB(LanceDB):
+    """`LanceDB` vector store for multimodal (text + image) data.
+
+    Parameters:
+    -----------
+    connection: Any
+        LanceDB connection to use. If not provided, a new connection will be created.
+    embedding: Embeddings
+        Embedding model to use for the vectorstore.
+    uri: str
+        URI of the LanceDB database. Defaults to ``/tmp/lancedb``.
+    vector_key: str
+        Key to use for the vector in the database. Defaults to ``vector``.
+    id_key: str
+        Key to use for the id in the database. Defaults to ``id``.
+    text_key: str
+        Key to use for the text in the database. Defaults to ``text``.
+    image_path_key: str
+        Key to use for the image path in the database. Defaults to ``image_path``.
+    table_name: str
+        Name of the table to use. Defaults to ``vectorstore``.
+    api_key: str
+        API key to use for the LanceDB cloud database.
+    region: str
+        Region to use for the LanceDB cloud database.
+    mode: str
+        Mode to use when adding data to the table. Defaults to ``append``.
+    """
+
+    def __init__(
+        self,
+        connection: Optional[Any] = None,
+        embedding: Optional[Embeddings] = None,
+        uri: Optional[str] = "/tmp/lancedb",
+        vector_key: Optional[str] = "vector",
+        id_key: Optional[str] = "id",
+        text_key: Optional[str] = "text",
+        image_path_key: Optional[str] = "image_path",
+        table_name: Optional[str] = "vectorstore",
+        api_key: Optional[str] = None,
+        region: Optional[str] = None,
+        mode: Optional[str] = "append",
+    ):
+        super().__init__(
+            connection=connection,
+            embedding=embedding,
+            uri=uri,
+            vector_key=vector_key,
+            id_key=id_key,
+            text_key=text_key,
+            table_name=table_name,
+            api_key=api_key,
+            region=region,
+            mode=mode,
+        )
+        self._image_path_key = image_path_key
+
+    def add_text_image_pairs(
+        self,
+        texts: Iterable[str],
+        image_paths: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Embed text-image pairs and add them to the database.
+
+        Parameters:
+        -----------
+        texts: Iterable[str]
+            Iterable of strings to combine with the corresponding images and add to the vectorstore.
+        image_paths: Iterable[str]
+            Iterable of image paths (as strings) to combine with the corresponding texts and add to the vectorstore.
+        metadatas: Optional[List[dict]]
+            Optional list of metadata dicts associated with the texts.
+        ids: Optional[List[str]]
+            Optional list of ids to associate with the texts.
+
+        Returns:
+        --------
+        List of ids of the added text-image pairs.
+        """
+        texts = list(texts)
+        image_paths = list(image_paths)
+        # the number of texts must equal the number of images
+        assert len(texts) == len(image_paths), "the number of texts must equal the number of image paths"
+
+        print(f'The length of texts is {len(texts)}')
+
+        # Embed the pairs and build one document per pair
+        docs = []
+        ids = ids or [str(uuid.uuid4()) for _ in texts]
+        embeddings = self._embedding.embed_image_text_pairs(texts=texts, images=image_paths)  # type: ignore
+        for idx, text in enumerate(texts):
+            embedding = embeddings[idx]
+            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
+            docs.append(
+                {
+                    self._vector_key: embedding,
+                    self._id_key: ids[idx],
+                    self._text_key: text,
+                    self._image_path_key: image_paths[idx],
+                    "metadata": metadata,
+                }
+            )
+        print(f'Adding {len(docs)} text-image pairs to the vectorstore...')
+
+        # allow callers to override the write mode per call
+        mode = kwargs.get('mode', self.mode)
+        if self._table_name in self._connection.table_names():
+            tbl = self._connection.open_table(self._table_name)
+            if self.api_key is None:
+                # local table: honour the requested write mode
+                tbl.add(docs, mode=mode)
+            else:
+                # LanceDB cloud table
+                tbl.add(docs)
+        else:
+            self._connection.create_table(self._table_name, data=docs)
+        return ids
+
+    @classmethod
+    def from_text_image_pairs(
+        cls,
+        texts: List[str],
+        image_paths: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        connection: Any = None,
+        vector_key: Optional[str] = "vector",
+        id_key: Optional[str] = "id",
+        text_key: Optional[str] = "text",
+        image_path_key: Optional[str] = "image_path",
+        table_name: Optional[str] = "vectorstore",
+        **kwargs: Any,
+    ):
+        """Create a MultimodalLanceDB vectorstore and populate it with text-image pairs."""
+        instance = MultimodalLanceDB(
+            connection=connection,
+            embedding=embedding,
+            vector_key=vector_key,
+            id_key=id_key,
+            text_key=text_key,
+            image_path_key=image_path_key,
+            table_name=table_name,
+        )
+        instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)
+        return instance
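
A minimal usage sketch for the class above, assuming a local LanceDB database and the BridgeTowerEmbeddings class added in src/preprocess/embedding.py by this commit; the import paths, table name, captions, and the parent LanceDB class's handling of a pre-opened connection depend on the installed langchain_community/lancedb versions and are assumptions here:

    import lancedb
    from vector_store import MultimodalLanceDB    # import path is an assumption
    from embedding import BridgeTowerEmbeddings   # import path is an assumption

    # hypothetical captions for two of the images shipped under src/data/images
    texts = [
        "a red car parked on the street",
        "a motorcycle leaning on its kickstand",
    ]
    image_paths = [
        "src/data/images/car_1.jpg",
        "src/data/images/motorcycle_2.jpg",
    ]

    # connect to a local LanceDB database and build the table from the pairs
    db = lancedb.connect("./lancedb")
    vectorstore = MultimodalLanceDB.from_text_image_pairs(
        texts=texts,
        image_paths=image_paths,
        embedding=BridgeTowerEmbeddings(),
        connection=db,
        table_name="demo_table",
    )
    print(db.open_table("demo_table").count_rows())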
src/data/.DS_Store ADDED
Binary file (6.15 kB).
 
src/data/images/car_1.jpg ADDED

Git LFS Details

  • SHA256: 7093f1c09568aece012f93050a14f2c272f4f35f16d88030ccf2ac6a88d19f28
  • Pointer size: 132 Bytes
  • Size of remote file: 1.52 MB
src/data/images/car_2.jpg ADDED

Git LFS Details

  • SHA256: b1baf7d58f14bcbb6c0ac143c93f1b3b8972a6f544a81790e54594142888f6cc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
src/data/images/cat_1.jpg ADDED
src/data/images/cat_2.jpg ADDED
src/data/images/cat_3.jpg ADDED
src/data/images/motorcycle_1.jpg ADDED
src/data/images/motorcycle_2.jpg ADDED

Git LFS Details

  • SHA256: cebf0ac3e43fa6246fc07de3d147aac3678083efa66dcb2304488d7a8754ce2e
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
src/data/images/motorcycle_3.jpg ADDED

Git LFS Details

  • SHA256: 6cedc5acbaa790d4a26fe35c4bde211ec94f6e30d67f77b6324e3b5f30338048
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
src/preprocess/.ipynb_checkpoints/embedding-checkpoint.py ADDED
@@ -0,0 +1,69 @@
+from typing import List
+
+from tqdm import tqdm
+from langchain_core.embeddings import Embeddings
+from langchain_core.pydantic_v1 import BaseModel
+
+from utils import encode_image
+from utils import bt_embeddings
+
+
+class BridgeTowerEmbeddings(BaseModel, Embeddings):
+    """BridgeTower embedding model."""
+
+    def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size: int = 2) -> List[List[float]]:
+        """Embed a list of image-text pairs using BridgeTower.
+
+        Parameters:
+        -----------
+        texts: List[str]
+            The list of texts to embed.
+        images: List[str]
+            The list of paths to the images to embed.
+        batch_size: int
+            The batch size to process, defaults to 2 (currently unused; pairs are embedded one at a time).
+
+        Returns:
+        --------
+        List of embeddings, one for each image-text pair.
+        """
+        # the number of texts must equal the number of images
+        assert len(texts) == len(images), "the number of captions must equal the number of images"
+
+        print(f"Embedding {len(texts)} image-text pairs...")
+
+        embeddings = []
+        for path_to_img, text in tqdm(zip(images, texts), total=len(images), desc="Processing pairs"):
+            embedding = bt_embeddings(text, encode_image(path_to_img))
+            embeddings.append(embedding)
+        return embeddings
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed a list of documents using BridgeTower.
+
+        Parameters:
+        -----------
+        texts: List[str]
+            The list of texts to embed.
+
+        Returns:
+        --------
+        List of embeddings, one for each text.
+        """
+        embeddings = []
+        for text in texts:
+            # embed text-only input by pairing the text with an empty image
+            embedding = bt_embeddings(text, "")
+            embeddings.append(embedding)
+        return embeddings
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a query using BridgeTower.
+
+        Parameters:
+        -----------
+        text: str
+            The text to embed.
+
+        Returns:
+        --------
+        Embedding for the text.
+        """
+        return self.embed_documents([text])[0]
src/preprocess/.ipynb_checkpoints/preprocessing-checkpoint.py ADDED
@@ -0,0 +1,65 @@
+from os import path as osp
+import json
+
+import cv2
+import webvtt
+
+from utils import maintain_aspect_ratio_resize, str2time
+
+
+def extract_and_save_frames_and_metadata(
+        path_to_video,
+        path_to_transcript,
+        path_to_save_extracted_frames,
+        path_to_save_metadatas):
+
+    # metadatas will store the metadata of all extracted frames
+    metadatas = []
+
+    # load the video using cv2
+    video = cv2.VideoCapture(path_to_video)
+    # load the transcript using webvtt
+    trans = webvtt.read(path_to_transcript)
+
+    # iterate over the transcript file:
+    # one iteration per video segment specified in the transcript
+    for idx, transcript in enumerate(trans):
+
+        # get the start time and end time in milliseconds
+        start_time_ms = str2time(transcript.start)
+        end_time_ms = str2time(transcript.end)
+        # get the time in ms exactly in the middle of start time and end time
+        mid_time_ms = (end_time_ms + start_time_ms) / 2
+        # get the transcript text, removing newline characters
+        text = transcript.text.replace("\n", ' ')
+        # grab the frame at the middle time
+        video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
+        success, frame = video.read()
+        if success:
+            # if the frame is extracted successfully, resize it
+            image = maintain_aspect_ratio_resize(frame, height=350)
+            # save the frame as a JPEG file
+            img_fname = f'frame_{idx}.jpg'
+            img_fpath = osp.join(
+                path_to_save_extracted_frames, img_fname
+            )
+            cv2.imwrite(img_fpath, image)
+
+            # prepare the metadata for this frame
+            metadata = {
+                'extracted_frame_path': img_fpath,
+                'transcript': text,
+                'video_segment_id': idx,
+                'video_path': path_to_video,
+                'mid_time_ms': mid_time_ms,
+            }
+            metadatas.append(metadata)
+
+        else:
+            print(f"ERROR! Cannot extract frame: idx = {idx}")
+
+    # release the video handle
+    video.release()
+
+    # save the metadata of all extracted frames
+    fn = osp.join(path_to_save_metadatas, 'metadatas.json')
+    with open(fn, 'w') as outfile:
+        json.dump(metadatas, outfile)
+    return metadatas
src/preprocess/__pycache__/embedding.cpython-313.pyc ADDED
Binary file (2.9 kB).
 
src/preprocess/__pycache__/preprocessing.cpython-313.pyc ADDED
Binary file (2.19 kB).
 
src/preprocess/embedding.py ADDED
@@ -0,0 +1,69 @@
+from typing import List
+
+from tqdm import tqdm
+from langchain_core.embeddings import Embeddings
+from langchain_core.pydantic_v1 import BaseModel
+
+from utils import encode_image
+from utils import bt_embeddings
+
+
+class BridgeTowerEmbeddings(BaseModel, Embeddings):
+    """BridgeTower embedding model."""
+
+    def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size: int = 2) -> List[List[float]]:
+        """Embed a list of image-text pairs using BridgeTower.
+
+        Parameters:
+        -----------
+        texts: List[str]
+            The list of texts to embed.
+        images: List[str]
+            The list of paths to the images to embed.
+        batch_size: int
+            The batch size to process, defaults to 2 (currently unused; pairs are embedded one at a time).
+
+        Returns:
+        --------
+        List of embeddings, one for each image-text pair.
+        """
+        # the number of texts must equal the number of images
+        assert len(texts) == len(images), "the number of captions must equal the number of images"
+
+        print(f"Embedding {len(texts)} image-text pairs...")
+
+        embeddings = []
+        for path_to_img, text in tqdm(zip(images, texts), total=len(images), desc="Processing pairs"):
+            embedding = bt_embeddings(text, encode_image(path_to_img))
+            embeddings.append(embedding)
+        return embeddings
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed a list of documents using BridgeTower.
+
+        Parameters:
+        -----------
+        texts: List[str]
+            The list of texts to embed.
+
+        Returns:
+        --------
+        List of embeddings, one for each text.
+        """
+        embeddings = []
+        for text in texts:
+            # embed text-only input by pairing the text with an empty image
+            embedding = bt_embeddings(text, "")
+            embeddings.append(embedding)
+        return embeddings
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a query using BridgeTower.
+
+        Parameters:
+        -----------
+        text: str
+            The text to embed.
+
+        Returns:
+        --------
+        Embedding for the text.
+        """
+        return self.embed_documents([text])[0]
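
A short usage sketch for BridgeTowerEmbeddings, assuming the utils helpers imported above (encode_image, bt_embeddings) are importable; the import path is an assumption and the image path points at one of the files added in this commit:

    from embedding import BridgeTowerEmbeddings  # import path is an assumption

    embedder = BridgeTowerEmbeddings()

    # joint text-image embedding for a single pair
    pair_vectors = embedder.embed_image_text_pairs(
        texts=["a photo of a motorcycle"],
        images=["src/data/images/motorcycle_2.jpg"],
    )

    # text-only embedding; internally the text is paired with an empty image
    query_vector = embedder.embed_query("a photo of a motorcycle")
    print(len(pair_vectors), len(query_vector))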
src/preprocess/preprocessing.py ADDED
@@ -0,0 +1,65 @@
+from os import path as osp
+import json
+
+import cv2
+import webvtt
+
+from utils import maintain_aspect_ratio_resize, str2time
+
+
+def extract_and_save_frames_and_metadata(
+        path_to_video,
+        path_to_transcript,
+        path_to_save_extracted_frames,
+        path_to_save_metadatas):
+
+    # metadatas will store the metadata of all extracted frames
+    metadatas = []
+
+    # load the video using cv2
+    video = cv2.VideoCapture(path_to_video)
+    # load the transcript using webvtt
+    trans = webvtt.read(path_to_transcript)
+
+    # iterate over the transcript file:
+    # one iteration per video segment specified in the transcript
+    for idx, transcript in enumerate(trans):
+
+        # get the start time and end time in milliseconds
+        start_time_ms = str2time(transcript.start)
+        end_time_ms = str2time(transcript.end)
+        # get the time in ms exactly in the middle of start time and end time
+        mid_time_ms = (end_time_ms + start_time_ms) / 2
+        # get the transcript text, removing newline characters
+        text = transcript.text.replace("\n", ' ')
+        # grab the frame at the middle time
+        video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
+        success, frame = video.read()
+        if success:
+            # if the frame is extracted successfully, resize it
+            image = maintain_aspect_ratio_resize(frame, height=350)
+            # save the frame as a JPEG file
+            img_fname = f'frame_{idx}.jpg'
+            img_fpath = osp.join(
+                path_to_save_extracted_frames, img_fname
+            )
+            cv2.imwrite(img_fpath, image)
+
+            # prepare the metadata for this frame
+            metadata = {
+                'extracted_frame_path': img_fpath,
+                'transcript': text,
+                'video_segment_id': idx,
+                'video_path': path_to_video,
+                'mid_time_ms': mid_time_ms,
+            }
+            metadatas.append(metadata)
+
+        else:
+            print(f"ERROR! Cannot extract frame: idx = {idx}")
+
+    # release the video handle
+    video.release()
+
+    # save the metadata of all extracted frames
+    fn = osp.join(path_to_save_metadatas, 'metadatas.json')
+    with open(fn, 'w') as outfile:
+        json.dump(metadatas, outfile)
+    return metadatas
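
A usage sketch for extract_and_save_frames_and_metadata, assuming OpenCV (cv2) and webvtt-py are installed; the video, transcript, and output paths below are hypothetical, and the output directories must exist before the call (created here with os.makedirs):

    import os
    from preprocessing import extract_and_save_frames_and_metadata  # import path is an assumption

    # hypothetical input video and its WebVTT transcript
    path_to_video = "src/data/videos/demo.mp4"
    path_to_transcript = "src/data/videos/demo.vtt"

    # output locations for the extracted frames and the metadata JSON
    frames_dir = "src/data/extracted_frames"
    metadata_dir = "src/data/metadata"
    os.makedirs(frames_dir, exist_ok=True)
    os.makedirs(metadata_dir, exist_ok=True)

    metadatas = extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        frames_dir,
        metadata_dir,
    )
    print(f"Extracted and saved {len(metadatas)} frames")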