How to crop a video based on the bounding box after face detection?

Hi, I ran into an issue when trying to crop a video according to the bounding box after detecting the face. Below is the relevant part of my code; please take a look, and if you know of any code similar to my problem I hope you can help me.

    for entry in words_data:
        # Extract speech to text data
        print('entry:', type(entry), entry)
        s_sec, s_millisec = divmod(float(entry['start']), 1)
        e_sec, e_millisec = divmod(float(entry['end']), 1)
        s_min = 0
        e_min = 0
        s_millisec = s_millisec * 1000
        e_millisec = e_millisec * 1000
        
        print('s_sec, s_millisec:', s_sec, s_millisec)

        if s_sec >= 60:
            s_min = math.floor(s_sec / 60.0)
            s_sec = s_sec % 60
        if e_sec >= 60:
            e_min = math.floor(e_sec / 60.0)
            e_sec = e_sec % 60

        # Determine video frames involved in stt entry
        min_frame = s_min*fps*60 + (s_sec*fps)
        max_frame = e_min*fps*60 + (e_sec*fps)


        # seek to the first frame covered by this entry
        cap.set(cv2.CAP_PROP_POS_FRAMES, min_frame)

        frame_count = min_frame
        # read frames from min_frame to max_frame
        
        valid_video = True
        bbx = []
        bby = []
        bbh = []
        bbw = []
        
        bbx1 = []
        bby1 = []
        bbx2 = []
        bby2 = []
        
        landmarks = []
        angles = []
        x = []
        y = []
        w = []
        h = []
        

        consecutive_frames_no_people = 0
        while frame_count < max_frame:    
            if count == 0:
                t = cv2.getTickCount()

            # capture next frame
            ret, frame = cap.read()
            
            if not ret:
                # end of stream or failed read: stop instead of looping forever
                break
                
                
            #frame = cv2.resize(frame,(0, 0), fx=scale, fy=scale,interpolation=cv2.INTER_LINEAR)
            #frame = cv2.resize(frame, (480, 640),interpolation=cv2.INTER_LINEAR)
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                   
            rects = hog_face_detector(gray, 1)  # one rectangle per detected face
            
          
            # if the detector finds anything other than exactly one person,
            # count this as a bad frame
            if len(rects) != 1:
                consecutive_frames_no_people += 1
                
            if consecutive_frames_no_people >= max_bad_frames:
                print(consecutive_frames_no_people,
                    ' frames without 1 person. Skipping to next subtitle')
                valid_video = False
                break
            
            # if exactly one person is in the scene
            if len(rects) == 1:
                consecutive_frames_no_people = 0
                
                for (i, rect) in enumerate(rects):
                    # determine the facial landmarks for the face region, then
                    # convert the facial landmark (x, y)-coordinates to a NumPy
                    # array
                    shape = dlib_facelandmark(gray, rect)
                    shape = face_utils.shape_to_np(shape)
                    #bb = bounding_boxes[0]
                    #x1, y1 = int(rect.left()), int(rect.top())
                    #x2, y2 = int(rect.right()), int(rect.bottom())

                    #area = (x2 - x1) * (y2 - y1)
                    #if area < min_area:
                    #    valid_video = False
                    #    break

                    #save bounding box coordinates for final crop
                    #bbx1.append(bb[0])
                    #bby1.append(bb[1])
                    #bbx2.append(bb[2])
                    #bby2.append(bb[3])
                 
                    # convert dlib's rectangle to an OpenCV-style bounding box
                    # [i.e., (x, y, w, h)], then draw the face bounding box
                    bb = face_utils.rect_to_bb(rect)
                    #(x, y, w, h) = face_utils.rect_to_bb(rect)
                    cv2.rectangle(frame, (bb[0], bb[1]), (bb[0] + bb[2], bb[1] + bb[3]), (0, 255, 0), 2)
                    
                    
                    # save bounding box corners for the final crop
                    # (rect_to_bb returns (x, y, w, h); store corner coordinates)
                    bbx1.append(bb[0])
                    bby1.append(bb[1])
                    bbx2.append(bb[0] + bb[2])
                    bby2.append(bb[1] + bb[3])

                    
                    # overlay the current processing fps on the frame
                    cv2.putText(frame, "{0:.2f}-fps".format(fps_processing),
                                    (50, height-50), cv2.FONT_HERSHEY_COMPLEX,
                                    1, (0, 0, 255), 2)
                    
                    
                    
            # Display the image
            cv2.imshow('Vid', frame)
        
            # Read keyboard and exit if ESC was pressed
            k = cv2.waitKey(1) & 0xFF
            if k == 27:
                exit()
            elif k == ord('q'):
                stop_videos = True

            # advance frame counters (without this, the while loop never ends)
            frame_count += 1
            count += 1
            # calculate processing fps at an interval of 30 frames
            if count == 30:
                t = (cv2.getTickCount() - t)/cv2.getTickFrequency()
                fps_processing = 30.0/t
                count = 0

        # if this was a valid video
        #if valid_video and len(landmarks) > 0:
        #    num_output_video += 1

            #entry['mouth3d'] = landmarks
            #entry['angle'] = angles

        if valid_video and len(bb) > 0:
            num_output_video += 1
            
            bbx1 = np.amin(bbx1)
            bbx2 = np.amax(bbx2)
            bby1 = np.amin(bby1)
            bby2 = np.amax(bby2)
            bbw = bbx2 - bbx1
            bbh = bby2 - bby1




            # store the crop box aggregated over all frames of this entry
            entry['bounding_box'] = [bbx1, bby1, bbw, bbh]
            entry['landmark'] = bb
            print('entry:', type(entry), entry)

            if save_videos:
                s_hr = 0
                e_hr = 0
                if s_min >= 60:
                    s_hr = math.floor(s_min / 60)
                    s_min = s_min % 60
                if e_min >= 60:
                    e_hr = math.floor(e_min / 60)
                    e_min = e_min % 60

                # cut and crop video
                # ffmpeg -i input.mp4 -ss hh:mm:ss -filter:v crop=w:h:x:y -c:a copy -to hh:mm:ss output.mp4
                ss = "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(
                    s_hr, s_min, int(s_sec), math.ceil(s_millisec))
                es = "{0:02d}:{1:02d}:{2:02d}.{3:03d}".format(
                    e_hr, e_min, int(e_sec), math.ceil(e_millisec))
                crop = "crop={0:1d}:{1:1d}:{2:1d}:{3:1d}".format(
                    bbw, bbh, bbx1, bby1)

                out_name = os.path.join(output_dir, str(num_output_video))

                subprocess.call(['ffmpeg', #'-hide_banner', '-loglevel', 'panic',
                                '-i', os.path.join(
                                videos_directory, vids_name, video_name),
                                '-ss', ss,
                                '-filter:v', crop, '-c:a', 'copy',
                                '-to', es, out_name +'.mp4'])
                # save recognized speech
                with open(out_name + '.txt', 'w') as text_file:
                    text_file.write(entry['text'] + '\n')
                    text_file.write(str(entry['conf']))
                

This is the output. So how can I write the final bounding box for the video?


found 4 files
Processing video: health_news_1.mp4
video resolution: 608  x  1080
video framerate: 25.0
entry: <class 'dict'> {'link': 'build_Dataset', 'text': 'شانغهاي', 'conf': 0.58, 'start': 1.6, 'end': 2.24, 'bounding_box': []}
s_sec, s_millisec: 1.0 600.0000000000001
10  frames without 1 person. Skipping to next subtitle
entry: <class 'dict'> {'link': 'build_Dataset', 'text': 'تواجه', 'conf': 0.65, 'start': 2.24, 'end': 2.72, 'bounding_box': []}
s_sec, s_millisec: 2.0 240.00000000000023
Traceback (most recent call last):
  File "extract_subvideos.py", line 467, in <module>
    main(args)
  File "extract_subvideos.py", line 315, in main
    bbx1 = np.amin(bbx1)
  File "<__array_function__ internals>", line 5, in amin
  File "/Users/shaimaa/.local/lib/python3.8/site-packages/numpy/core/fromnumeric.py", line 2879, in amin
    return _wrapreduction(a, np.minimum, 'min', axis, None, out,
  File "/Users/shaimaa/.local/lib/python3.8/site-packages/numpy/core/fromnumeric.py", line 86, in _wrapreduction
    return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
ValueError: zero-size array to reduction operation minimum which has no identity
                    

The error shows that you are trying to take amin() of the empty list bbx1.
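
That reduction has no identity element, so it fails on an empty input; this is easy to reproduce in isolation:

    import numpy as np

    np.amin([])  # ValueError: zero-size array to reduction operation
                 # minimum which has no identity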

I can't test it, but I think the problem is that you clear bbx1 = [] but you never clear bb = []. Later, bb may still hold a non-empty value from a previous loop while bbx1 is empty in the current loop (because when it finds anything other than exactly one person it never adds values from bb to bbx1), so it runs the code inside if ... len(bb) > 0: and tries to use the empty bbx1.

You should also check len(bbx1):

    if valid_video and len(bb) > 0 and len(bbx1) > 0:
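
For the same reason, it can help to reset bb together with the other per-entry lists at the top of the for entry in words_data: loop, so a stale box from a previous subtitle can never satisfy len(bb) > 0. A minimal sketch using the question's variable names:

    bb = []          # cleared per entry, like the lists below already are
    bbx1, bby1 = [], []
    bbx2, bby2 = [], []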

Or you could use a boolean variable found_one_person, which you reset to False before the while loop, set to True when exactly one person is found (if len(rects) == 1:), and then use to decide whether to write the data:

    if valid_video and found_one_person:
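
A runnable toy of that pattern (process_entry and detections_per_frame are illustrative names, not part of the original script):

    import numpy as np

    # Simulates handling one subtitle entry: each element of
    # detections_per_frame stands in for len(hog_face_detector(gray, 1))
    # on one frame, and 100 stands in for a real x-coordinate.
    def process_entry(detections_per_frame):
        bbx1 = []
        found_one_person = False           # reset before the frame loop
        for n in detections_per_frame:
            if n == 1:
                found_one_person = True    # at least one usable frame
                bbx1.append(100)
        # only reduce when this entry actually collected coordinates
        if found_one_person:
            return np.amin(bbx1)
        return None                        # nothing detected, skip this entry

    print(process_entry([1, 1, 2, 1]))     # 100
    print(process_entry([0, 2, 3]))        # None -- no ValueError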

