[英]How to get coordinates(or even center point) of predicted bounding box in object detection in a video using Tensorflow
[英]how coping video depend on bounding box after face detection?
當我在檢測到人臉后嘗試根據邊界框裁剪視頻時遇到了一個問題,所以這是我的代碼的子部分,請查找它,如果有任何與我的問題類似的代碼,我希望你能幫助我請告訴我
嗨,我在檢測到人臉后嘗試根據邊界框裁剪視頻時遇到了一個問題,所以這是我的代碼的子部分,請查找它,如果有任何類似於我的問題的代碼,我希望你能幫助我請告訴我
for entry in words_data:
# Extract speech to text data
print('entry:', type(entry), entry)
s_sec, s_millisec = divmod(float(entry['start']), 1)
e_sec, e_millisec = divmod(float(entry['end']), 1)
s_min = 0
e_min = 0
s_millisec = s_millisec * 1000
e_millisec = e_millisec * 1000
print('s_sec, s_millisec:', s_sec, s_millisec)
if s_sec >= 60:
s_min = math.floor(s_sec / 60.0)
s_sec = s_sec % 60
if e_sec >= 60:
e_min = math.floor(e_sec / 60.0)
e_sec = e_sec % 60
# Determine video frames involved in stt entry
min_frame = s_min*fps*60 + (s_sec*fps)
max_frame = e_min*fps*60 + (e_sec*fps)
# go to min_frame
cap.set(cv2.CAP_PROP_POS_FRAMES, min_frame)
frame_count = min_frame
# read frames from min_frame to max_frame
num_people = 0
valid_video = True
bbx = []
bby = []
bbh = []
bbw = []
bbx1 = []
bby1 = []
bbx2 = []
bby2 = []
landmarks = []
angles = []
x = []
y = []
w = []
h = []
consecutive_frames_no_people = 0
while frame_count < max_frame:
if count == 0:
t = cv2.getTickCount()
# capture next frame
ret, frame = cap.read()
if not ret:
continue
#frame = cv2.resize(frame,(0, 0), fx=scale, fy=scale,interpolation=cv2.INTER_LINEAR)
#frame = cv2.resize(frame, (480, 640),interpolation=cv2.INTER_LINEAR)
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
num_people = hog_face_detector(gray,1)
# if it detects less than or more than 1 person
# go to next subtitle
if len(num_people) != 1:
consecutive_frames_no_people += 1
if consecutive_frames_no_people >= max_bad_frames:
print(consecutive_frames_no_people,
' frames without 1 person. Skiping to next subtitle')
valid_video = False
break
## if only one person in the scene
if len(num_people) == 1:
consecutive_frames_no_people = 0
rects = hog_face_detector(gray,1)
for (i, rect) in enumerate(rects):
# determine the facial landmarks for the face region, then
# convert the facial landmark (x, y)-coordinates to a NumPy
# array
shape = dlib_facelandmark(gray, rect)
shape = face_utils.shape_to_np(shape)
#bb = bounding_boxes[0]
#x1, y1 = int(rect.left()), int(rect.top())
#x2, y2 = int(rect.right()), int(rect.bottom())
#area = (x2 - x1) * (y2 - y1)
#if area < min_area:
# valid_video = False
# break
#save bounding box coordinates for final crop
#bbx1.append(bb[0])
#bby1.append(bb[1])
#bbx2.append(bb[2])
#bby2.append(bb[3])
# convert dlib's rectangle to a OpenCV-style bounding box
# [i.e., (x, y, w, h)], then draw the face bounding box
bb = face_utils.rect_to_bb(rect)
#(x, y, w, h) = face_utils.rect_to_bb(rect)
cv2.rectangle(frame, (bb[0], bb[1]), (bb[0] + bb[2], bb[1] + bb[3]), (0, 255, 0), 2)
#save bounding box coordinates for final crop
bbx1.append(bb[0])
bby1.append(bb[1])
bbx2.append(bb[2])
bby2.append(bb[3])
# Put fps at which we are processing camera feed on frame
cv2.putText(frame, "{0:.2f}-fps".format(fps_processing),
(50, height-50), cv2.FONT_HERSHEY_COMPLEX,
1, (0, 0, 255), 2)
# Display the image
cv2.imshow('Vid',frame)
# Read keyboard and exit if ESC was pressed
k = cv2.waitKey(1) & 0xFF
if k ==27:
exit()
elif k == ord('q'):
stop_videos = True
# increment frame counter
count = count + 1
# calculate fps at an interval of 100 frames
if (count == 30):
t = (cv2.getTickCount() - t)/cv2.getTickFrequency()
fps_processing = 30.0/t
count = 0
# if this was a valid video
#if valid_video and len(landmarks) > 0:
# num_output_video += 1
#entry['mouth3d'] = landmarks
#entry['angle'] = angles
if valid_video and len(bb) > 0:
num_output_video += 1
bbx1 = np.amin(bbx1)
bbx2 = np.amax(bbx2)
bby1 = np.amin(bby1)
bby2 = np.amax(bby2)
bbw = bbx2 - bbx1
bbh = bby2 - bby1
entry['bounding_box'] = [bb[0], bb[1], bb[2], bb[3]]
entry['landmark'] = bb
print('entry:', type(entry), entry)
if save_videos:
s_hr = 0
e_hr = 0
if s_min >= 60:
s_hr = math.floor(s_min / 60)
s_min = s_min % 60
if e_min >= 60:
e_hr = math.floor(e_min / 60)
e_min = e_min % 60
# cut and crop video
# ffmpeg -i input.mp4 -ss hh:mm:ss -filter:v crop=w:h:x:y -c:a copy -to hh:mm:ss output.mp4
ss = "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(
s_hr, s_min, int(s_sec), math.ceil(s_millisec))
es = "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(
e_hr, e_min, int(e_sec), math.ceil(e_millisec))
crop = "crop={0:1d}:{1:1d}:{2:1d}:{3:1d}".format(
bbw, bbh, bbx1, bby1)
out_name = os.path.join(output_dir, str(num_output_video))
subprocess.call(['ffmpeg', #'-hide_banner', '-loglevel', 'panic',
'-i', os.path.join(
videos_directory, vids_name, video_name),
'-ss', ss,
'-filter:v', crop, '-c:a', 'copy',
'-to', es, out_name +'.mp4'])
# save recognized speech
text_file = open(out_name +'.txt', "w")
text_file.write(entry['text'] + '\n')
text_file.write(str(entry['conf']))
text_file.close()
這是output,那么,我如何為視頻編寫最終的BB?
found 4 files
Processing video: health_news_1.mp4
video resolution: 608 x 1080
video framerate: 25.0
entry: <class 'dict'> {'link': 'build_Dataset', 'text': 'شانغهاي', 'conf': 0.58, 'start': 1.6, 'end': 2.24, 'bounding_box': []}
s_sec, s_millisec: 1.0 600.0000000000001
10 frames without 1 person. Skiping to next subtitle
entry: <class 'dict'> {'link': 'build_Dataset', 'text': 'تواجه', 'conf': 0.65, 'start': 2.24, 'end': 2.72, 'bounding_box': []}
s_sec, s_millisec: 2.0 240.00000000000023
Traceback (most recent call last):
File "extract_subvideos.py", line 467, in <module>
main(args)
File "extract_subvideos.py", line 315, in main
bbx1 = np.amin(bbx1)
File "<__array_function__ internals>", line 5, in amin
File "/Users/shaimaa/.local/lib/python3.8/site-packages/numpy/core/fromnumeric.py", line 2879, in amin
return _wrapreduction(a, np.minimum, 'min', axis, None, out,
File "/Users/shaimaa/.local/lib/python3.8/site-packages/numpy/core/fromnumeric.py", line 86, in _wrapreduction
return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
ValueError: zero-size array to reduction operation minimum which has no identity
錯誤表明您嘗試從空列表bbx1
獲取amin()
。
我無法測試它,但我認為問題是你清除bbx1 = []
但你沒有清除bb = []
以后它可能有前一個循環中的非空bb
和當前循環中的空bbx1
(因為當它找到超過 1 個人然后它不會將值從bb
添加到bbx1
)並且它可能會在if... len(bb) > 1:
中運行代碼並嘗試使用空bbx1
你應該檢查len(bbx1)
if valid_video and len(bb) > 0 and len(bbx1) > 0:
或者您將不得不使用一些 boolean 變量found_one_person
,您將在while
-loop 之前將其重置為False
,並在找到一個人時設置True
( if len(num_people) == 1:
),並使用它來寫入數據
if valid_video and found_one_person:
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.