extraction
Browse files
lips_coords_extractor.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import cv2
|
2 |
+
import dlib
|
3 |
+
import json
|
4 |
+
import glob
|
5 |
+
import os
|
6 |
+
from multiprocessing import Pool
|
7 |
+
|
8 |
+
# Output directory for the extracted per-video lip-landmark JSON files.
LIP_COORDINATES_DIRECTORY = "lip_coordinates"
# Directory holding the running log of videos that failed processing.
ERROR_DIRECTORY = "error_videos"

# path to the original GRID dataset whose videos are converted to frames
GRID_IMAGES_DIRECTORY = "lip/GRID_imgs"
# Split-list files: one video path per line (see load_data_list for the
# expected path layout).
# NOTE(review): train_unseen_list and test_unseen_list point at the same
# file ("data/unseen_val.txt") — possibly a copy-paste slip; confirm the
# train split should not reference a separate unseen_train file.
train_unseen_list = "data/unseen_val.txt"
train_overlap_list = "data/overlap_train.txt"
test_unseen_list = "data/unseen_val.txt"
test_overlap_list = "data/overlap_val.txt"
|
17 |
+
|
18 |
+
|
19 |
+
def load_data_list(data_path, dictionary):
    """Register every video listed in *data_path* into *dictionary*.

    Each non-empty line is a path whose 4th-from-last component is the
    speaker and whose last component is the video id; the entry is stored
    as "<speaker>/<vid>" -> 1 (the dict is used as a membership set).

    Args:
        data_path: text file with one video path per line.
        dictionary: dict to update in place (allows chaining several lists).

    Returns:
        The same *dictionary*, updated.
    """
    with open(data_path, "r") as f:
        # Iterate the file lazily instead of readlines() — no need to
        # materialize the whole list in memory.
        for raw in f:
            line = raw.strip()
            if not line:
                continue  # tolerate blank/trailing lines
            parts = line.split("/")
            dictionary[f"{parts[-4]}/{parts[-1]}"] = 1
    return dictionary
|
27 |
+
|
28 |
+
|
29 |
+
def extract_lip_coordinates(detector, predictor, img_path):
    """Extract the 20 lip-landmark coordinates from a single image frame.

    Used to preprocess the original image frames in the GRID dataset.
    The image is resized to 600x500 before detection, so the returned
    coordinates are in that resized space.

    Args:
        detector: dlib frontal face detector.
        predictor: dlib 68-landmark shape predictor.
        img_path: path to the frame image on disk.

    Returns:
        [x, y]: two parallel lists of 20 ints — dlib landmarks 48..67
        (the mouth region of the 68-point model).

    Raises:
        ValueError: if the image cannot be read or the number of detected
        faces is not exactly one (callers catch Exception and retry).
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread silently returns None on unreadable files; fail loudly
        # so the caller's retry/error logging sees a real exception.
        raise ValueError(f"could not read image: {img_path}")
    image = cv2.resize(image, (600, 500))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    rects = detector(gray)
    # Raise instead of assert: asserts are stripped under `python -O`.
    if len(rects) != 1:
        raise ValueError(f"expected exactly 1 face, found {len(rects)}: {img_path}")

    # Apply the shape predictor to the single detected face ROI.
    # (The original also computed the bounding-box corners, but never used
    # them — dropped.)
    shape = predictor(gray, rects[0])
    x = [shape.part(n).x for n in range(48, 68)]
    y = [shape.part(n).y for n in range(48, 68)]
    return [x, y]
|
52 |
+
|
53 |
+
|
54 |
+
def log_error_video(video_path):
    """Record a failed video path in ERROR_DIRECTORY/error_videos.txt.

    Appends, so the log accumulates across runs/processes.
    """
    print("Error: ", video_path)
    # The original crashed with FileNotFoundError when ERROR_DIRECTORY did
    # not exist yet; create it on demand (exist_ok makes this idempotent
    # and safe across the worker processes).
    os.makedirs(ERROR_DIRECTORY, exist_ok=True)
    with open(ERROR_DIRECTORY + "/error_videos.txt", "a") as f:
        f.write(video_path + "\n")
|
58 |
+
|
59 |
+
|
60 |
+
# Build the membership set of videos that belong to any train/test split.
# (Currently only consulted by the commented-out filter inside
# generate_lip_coordinates.)
data_dict = {}
for _split_list in (
    train_unseen_list,
    train_overlap_list,
    test_unseen_list,
    test_overlap_list,
):
    data_dict = load_data_list(_split_list, data_dict)


# One directory per speaker under the GRID image root.
speakers = glob.glob(GRID_IMAGES_DIRECTORY + "/*")
print(speakers[0])
|
69 |
+
|
70 |
+
|
71 |
+
def generate_lip_coordinates(speakers):
    # NOTE(review): this definition is DEAD CODE — it is shadowed by the
    # near-identical redefinition of generate_lip_coordinates further down
    # in this file; one of the two should be deleted.
    #
    # For each video (>= 50 .jpg frames) of each speaker, run the dlib
    # detector/predictor on every frame and dump a
    # {frame_id: [[x...], [y...]]} mapping to
    # LIP_COORDINATES_DIRECTORY/<speaker>/<video>.json.
    #
    # NOTE(review): "\\" assumes Windows-style paths from glob — this
    # breaks frame sorting and save paths on POSIX; confirm target OS.
    file_path_sep = "\\"
    detector = dlib.get_frontal_face_detector()
    predictor = dlib.shape_predictor(
        "lip_coordinate_extraction/shape_predictor_68_face_landmarks_GTX.dat"
    )
    for speaker in speakers:
        print(speaker)
        videos = glob.glob(speaker + "/*")
        for video in videos:
            print(video)
            frames = glob.glob(video + "/*.jpg")
            if len(frames) < 50:  # filter out bad videos
                continue
            vid = {}
            try:
                # Frame files are named <index>.jpg; sort numerically.
                frames = sorted(
                    frames,
                    key=lambda x: int(x.split(file_path_sep)[-1].split(".")[0]),
                )
                for frame in frames:
                    # Landmark extraction is retried up to 3 times per frame.
                    retry = 3
                    while retry > 0:
                        try:
                            coords = extract_lip_coordinates(detector, predictor, frame)
                            break
                        except Exception as e:
                            retry -= 1
                            print("Error: ", video)
                            print(e)
                            print("retrying...")

                    # NOTE(review): if all retries fail, `coords` is stale
                    # from the previous frame (or unbound on the first one);
                    # a failure guard after the while-loop seems intended.
                    vid[frame.split(file_path_sep)[-1].split(".")[0]] = coords
                vid_path = video.split(file_path_sep)
                # Output: LIP_COORDINATES_DIRECTORY/<speaker>/<video>.json
                save_path = (
                    LIP_COORDINATES_DIRECTORY
                    + "/"
                    + vid_path[-2]
                    + "/"
                    + vid_path[-1]
                    + ".json"
                )

                if not os.path.exists(LIP_COORDINATES_DIRECTORY + "/" + vid_path[-2]):
                    os.makedirs(LIP_COORDINATES_DIRECTORY + "/" + vid_path[-2])

                with open(
                    save_path,
                    "w",
                ) as f:
                    json.dump(vid, f)
            except Exception as e:
                # Any per-video failure is logged and the video skipped.
                print(e)
                log_error_video(video)
|
125 |
+
|
126 |
+
|
127 |
+
def generate_lip_coordinates(speakers):
    """Extract lip-landmark coordinates for every video of *speakers*.

    For each video directory containing at least 50 .jpg frames, runs the
    dlib detector/68-landmark predictor on every frame and writes a
    {frame_id: [[x...], [y...]]} mapping to
    LIP_COORDINATES_DIRECTORY/<speaker>/<video>.json.
    Videos that fail are recorded via log_error_video() and skipped.

    Args:
        speakers: list of speaker directory paths (one group per worker).
    """
    # Use the platform separator: glob returns "\\"-separated paths on
    # Windows and "/"-separated paths elsewhere (the original hard-coded
    # "\\", which broke frame sorting and save paths on POSIX).
    file_path_sep = os.sep
    detector = dlib.get_frontal_face_detector()
    predictor = dlib.shape_predictor(
        "lip_coordinate_extraction/shape_predictor_68_face_landmarks_GTX.dat"
    )
    for speaker in speakers:
        print(speaker)
        for video in glob.glob(speaker + "/*"):
            print(video)
            frames = glob.glob(video + "/*.jpg")
            if len(frames) < 50:  # filter out bad (incomplete) videos
                continue
            vid = {}
            try:
                # Frame files are named <index>.jpg; sort numerically.
                frames = sorted(
                    frames,
                    key=lambda p: int(p.split(file_path_sep)[-1].split(".")[0]),
                )
                for frame in frames:
                    coords = None
                    # Landmark extraction is flaky; retry up to 3 times.
                    for _attempt in range(3):
                        try:
                            coords = extract_lip_coordinates(detector, predictor, frame)
                            break
                        except Exception as e:
                            print("Error: ", video)
                            print(e)
                            print("retrying...")
                    if coords is None:
                        # All retries failed: abort this video instead of
                        # silently reusing the previous frame's coordinates
                        # (the original did exactly that, or raised
                        # NameError on the very first frame).
                        raise RuntimeError(f"landmark extraction failed: {frame}")
                    vid[frame.split(file_path_sep)[-1].split(".")[0]] = coords

                # Output: LIP_COORDINATES_DIRECTORY/<speaker>/<video>.json
                vid_path = video.split(file_path_sep)
                out_dir = LIP_COORDINATES_DIRECTORY + "/" + vid_path[-2]
                # exist_ok avoids the exists()/makedirs() race between the
                # parallel worker processes.
                os.makedirs(out_dir, exist_ok=True)
                with open(out_dir + "/" + vid_path[-1] + ".json", "w") as f:
                    json.dump(vid, f)
            except Exception as e:
                # Any per-video failure is logged and the video skipped.
                print(e)
                log_error_video(video)
|
186 |
+
|
187 |
+
|
188 |
+
num_processes = 8

# Partition the speaker directories into num_processes contiguous groups;
# the LAST group absorbs the division remainder.
# BUG FIX: the original special-cased `i == 4` (a leftover from a 5-process
# run). With num_processes = 8 that duplicated every speaker from group 4
# onward into groups 4-7 AND dropped the remainder past 8 * interval.
speaker_groups = []
speaker_interval = len(speakers) // num_processes
for i in range(num_processes):
    if i == num_processes - 1:
        speaker_groups.append(speakers[i * speaker_interval :])
    else:
        speaker_groups.append(
            speakers[i * speaker_interval : (i + 1) * speaker_interval]
        )
|
199 |
+
|
200 |
+
|
201 |
+
# Entry point: fan the speaker groups out over a pool of worker processes.
# The Pool must be created under the __main__ guard so that child
# processes re-importing this module do not spawn recursively.
if __name__ == "__main__":
    with Pool(num_processes) as p:
        p.map(generate_lip_coordinates, speaker_groups)
|
shape_predictor_68_face_landmarks_GTX.dat
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:249a69a1d5f2d7c714a92934d35367d46eb52dc308d46717e82d49e8386b3b80
|
3 |
+
size 66435981
|