Smooth_FrameProcessing
parent 51d1ed8d2c
commit 6b42b731b4

197 src/poc_demo.py
@@ -208,34 +208,57 @@ class POCPredictor:
         self.logs = []
 
     def detect_objects(self, frame):
-        """YOLO object detection - optimized for POC."""
-        # Resize to square for YOLO
-        yolo_input = cv2.resize(frame, (640, 640))
-
-        # Convert HWC to CHW
-        yolo_input = yolo_input.transpose(2, 0, 1)
-        yolo_input = yolo_input[None].astype(np.float32) / 255.0
-
-        # Run inference
-        input_name = self.yolo_session.get_inputs()[0].name
-        outputs = self.yolo_session.run(None, {input_name: yolo_input})
-
-        # Parse YOLOv8 ONNX output: (1, 84, 8400)
-        output = outputs[0]
-        bboxes = output[0, :4, :].transpose()  # (8400, 4)
-        class_scores = output[0, 4:, :]  # (80, 8400)
-        classes = np.argmax(class_scores, axis=0)
-        confs = np.max(class_scores, axis=0)
-
-        # Filter by confidence and relevant classes (phone and person)
-        relevant_classes = [0, 67]  # person, cell phone
-        mask = (confs > CONFIG['conf_threshold']) & np.isin(classes, relevant_classes)
-
-        return {
-            'bboxes': bboxes[mask],
-            'confs': confs[mask],
-            'classes': classes[mask]
-        }
+        """YOLO object detection - optimized for POC with performance improvements."""
+        try:
+            # Resize to square for YOLO (use INTER_LINEAR for speed)
+            yolo_input = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_LINEAR)
+
+            # Convert HWC to CHW (optimized)
+            yolo_input = yolo_input.transpose(2, 0, 1)
+            yolo_input = np.ascontiguousarray(yolo_input[None].astype(np.float32) / 255.0)
+
+            # Run inference
+            input_name = self.yolo_session.get_inputs()[0].name
+            outputs = self.yolo_session.run(None, {input_name: yolo_input})
+
+            # Parse YOLOv8 ONNX output: (1, 84, 8400)
+            output = outputs[0]
+            bboxes = output[0, :4, :].transpose()  # (8400, 4)
+            class_scores = output[0, 4:, :]  # (80, 8400)
+            classes = np.argmax(class_scores, axis=0).astype(np.int32)  # Ensure int32
+            confs = np.max(class_scores, axis=0)
+
+            # Filter by confidence and relevant classes (phone and person)
+            relevant_classes = np.array([0, 67], dtype=np.int32)  # person, cell phone
+            conf_mask = confs > CONFIG['conf_threshold']
+            class_mask = np.isin(classes, relevant_classes)
+            mask = conf_mask & class_mask
+
+            # Ensure mask is boolean and arrays are properly indexed
+            mask = mask.astype(bool)
+
+            # Get indices where mask is True
+            valid_indices = np.where(mask)[0]
+
+            if len(valid_indices) > 0:
+                return {
+                    'bboxes': bboxes[valid_indices],
+                    'confs': confs[valid_indices],
+                    'classes': classes[valid_indices]
+                }
+            else:
+                return {
+                    'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                    'confs': np.array([], dtype=np.float32),
+                    'classes': np.array([], dtype=np.int32)
+                }
+        except Exception as e:
+            logger.error(f"Error in detect_objects: {e}")
+            return {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }
 
     def analyze_face(self, frame):
        """OpenCV face analysis - NO MediaPipe!"""
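
For reference, the detection path in the hunk above reduces to the following minimal numpy-only sketch. It is runnable without ONNX Runtime: the random tensor and the 0.25 threshold are illustrative stand-ins for a real session output and CONFIG['conf_threshold'], not values from this repo.

    import numpy as np

    def parse_yolov8_output(output, conf_threshold=0.25, relevant_classes=(0, 67)):
        """Parse a (1, 84, 8400) YOLOv8 ONNX tensor into filtered detections."""
        bboxes = output[0, :4, :].transpose()   # (8400, 4) box coordinates
        class_scores = output[0, 4:, :]         # (80, 8400) per-class scores
        classes = np.argmax(class_scores, axis=0).astype(np.int32)
        confs = np.max(class_scores, axis=0)
        # Keep only confident detections of the relevant classes
        mask = (confs > conf_threshold) & np.isin(classes, np.asarray(relevant_classes))
        idx = np.where(mask)[0]
        return {'bboxes': bboxes[idx], 'confs': confs[idx], 'classes': classes[idx]}

    # Stand-in for self.yolo_session.run(...)[0]
    fake_output = np.random.rand(1, 84, 8400).astype(np.float32)
    dets = parse_yolov8_output(fake_output)
    print(dets['bboxes'].shape, dets['classes'][:5])
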
@@ -286,17 +309,23 @@ class POCPredictor:
         return has_seatbelt, confidence
 
     def process_frame(self, frame, frame_idx, last_results=None):
-        """Process single frame - streamlined and optimized."""
+        """Process single frame - streamlined and optimized with smooth video support."""
 
         should_process = (frame_idx % CONFIG['inference_skip'] == 0)
 
-        # If not processing this frame, return last results
+        # Always use last results for smooth video (even if not processing this frame)
         if not should_process and last_results is not None:
             last_alerts = last_results[0]
-            last_face_data = last_results[1]
-            annotated = self.draw_detections(frame, {'bboxes': [], 'confs': [], 'classes': []},
-                                             last_face_data, last_alerts)
-            return last_alerts, annotated, False, last_face_data
+            last_face_data = last_results[5] if len(last_results) > 5 else {'present': False, 'perclos': 0, 'head_yaw': 0}
+            last_detections = last_results[6] if len(last_results) > 6 else {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }
+            # Draw last predictions on current frame for smooth video
+            annotated = self.draw_detections(frame, last_detections, last_face_data, last_alerts)
+            return last_alerts, annotated, False, last_results[2] if len(last_results) > 2 else False, \
+                last_results[3] if len(last_results) > 3 else 0.0, last_face_data, last_detections
 
         # Process this frame
        start_time = time.time()
@@ -306,7 +335,11 @@ class POCPredictor:
 
         if not face_data['present']:
             alerts = {'Driver Absent': True}
-            detections = {'bboxes': [], 'confs': [], 'classes': []}
+            detections = {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }
             seatbelt, belt_conf = False, 0.0
         else:
             # Run object detection
@@ -327,7 +360,14 @@ class POCPredictor:
         alerts['Drowsiness'] = face_data['perclos'] > CONFIG['perclos_threshold']
         alerts['Distraction'] = abs(face_data['head_yaw']) > (CONFIG['head_pose_threshold'] * 0.8)
         alerts['Driver Absent'] = not face_data['present']
-        alerts['Phone Detected'] = np.any(detections['classes'] == 67) if len(detections['classes']) > 0 else False
+        # Safe check for phone detection
+        phone_detected = False
+        if len(detections['classes']) > 0:
+            try:
+                phone_detected = np.any(detections['classes'] == 67)
+            except:
+                phone_detected = False
+        alerts['Phone Detected'] = phone_detected
         alerts['No Seatbelt'] = not seatbelt and belt_conf > 0.3
 
        # Update states with temporal smoothing (clear alerts when condition stops)
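
A tiny illustration of how the alert flags above are derived. The CONFIG values and face_data readings here are illustrative stand-ins, not the repo's real configuration.

    import numpy as np

    CONFIG = {'perclos_threshold': 0.3, 'head_pose_threshold': 25}
    face_data = {'present': True, 'perclos': 0.42, 'head_yaw': -8.0}
    classes = np.array([0, 67], dtype=np.int32)  # a person and a cell phone detected

    alerts = {
        'Drowsiness': face_data['perclos'] > CONFIG['perclos_threshold'],
        'Distraction': abs(face_data['head_yaw']) > CONFIG['head_pose_threshold'] * 0.8,
        'Driver Absent': not face_data['present'],
        'Phone Detected': bool(np.any(classes == 67)) if len(classes) > 0 else False,
    }
    print(alerts)  # Drowsiness and Phone Detected are True, the rest False
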
@@ -363,34 +403,48 @@ class POCPredictor:
         if len(self.logs) > CONFIG['max_logs']:
             self.logs = self.logs[-CONFIG['max_logs']:]
 
-        return alerts, annotated_frame, True, seatbelt, belt_conf, face_data
+        return alerts, annotated_frame, True, seatbelt, belt_conf, face_data, detections
 
     def draw_detections(self, frame, detections, face_data, alerts):
         """Draw detections and alerts on frame."""
         annotated = frame.copy()
         h, w = annotated.shape[:2]
 
-        # Draw bounding boxes
-        for i, (bbox, conf, cls) in enumerate(zip(detections['bboxes'], detections['confs'], detections['classes'])):
-            # Scale bbox from 640x640 to frame size
-            x1, y1, x2, y2 = bbox
-            x1, x2 = int(x1 * w / 640), int(x2 * w / 640)
-            y1, y2 = int(y1 * h / 640), int(y2 * h / 640)
-
-            # Color by class
-            if cls == 0:  # person
-                color = (0, 255, 0)  # Green
-                label = "Person"
-            elif cls == 67:  # phone
-                color = (255, 0, 255)  # Magenta
-                label = "Phone"
-            else:
-                color = (255, 255, 0)  # Cyan
-                label = "Object"
-
-            cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
-            cv2.putText(annotated, f"{label}: {conf:.2f}", (x1, y1-10),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
+        # Draw bounding boxes (safe iteration)
+        if len(detections['bboxes']) > 0 and len(detections['confs']) > 0 and len(detections['classes']) > 0:
+            try:
+                # Ensure all arrays have same length
+                min_len = min(len(detections['bboxes']), len(detections['confs']), len(detections['classes']))
+                for i in range(min_len):
+                    bbox = detections['bboxes'][i]
+                    conf = float(detections['confs'][i])
+                    cls = int(detections['classes'][i])
+
+                    # Scale bbox from 640x640 to frame size
+                    x1, y1, x2, y2 = bbox
+                    x1, x2 = int(x1 * w / 640), int(x2 * w / 640)
+                    y1, y2 = int(y1 * h / 640), int(y2 * h / 640)
+
+                    # Ensure coordinates are valid
+                    x1, x2 = max(0, min(x1, w)), max(0, min(x2, w))
+                    y1, y2 = max(0, min(y1, h)), max(0, min(y2, h))
+
+                    # Color by class
+                    if cls == 0:  # person
+                        color = (0, 255, 0)  # Green
+                        label = "Person"
+                    elif cls == 67:  # phone
+                        color = (255, 0, 255)  # Magenta
+                        label = "Phone"
+                    else:
+                        color = (255, 255, 0)  # Cyan
+                        label = "Object"
+
+                    cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
+                    cv2.putText(annotated, f"{label}: {conf:.2f}", (x1, max(y1-10, 10)),
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
+            except Exception as e:
+                logger.warning(f"Error drawing detections: {e}")
 
         # Draw face status
        if face_data['present']:
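
The smooth-video behavior of process_frame boils down to a frame-skip caching pattern: heavy inference runs every Nth frame, and the frames in between are re-annotated with the cached results. A minimal runnable sketch, where run_inference and draw are illustrative stand-ins rather than functions from this repo:

    INFERENCE_SKIP = 3  # stand-in for CONFIG['inference_skip']

    def run_inference(frame):
        return {'label': f'objects@{frame}'}   # pretend detection result

    def draw(frame, results):
        return (frame, results['label'])       # pretend annotated frame

    last_results = None
    for frame_idx in range(10):
        if frame_idx % INFERENCE_SKIP == 0 or last_results is None:
            last_results = run_inference(frame_idx)   # fresh, expensive pass
        annotated = draw(frame_idx, last_results)     # cheap redraw every frame
        print(frame_idx, annotated)

Every frame is displayed, so playback stays smooth, while the expensive pass runs at a third of the frame rate.
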
@@ -457,37 +511,54 @@ def video_capture_loop(predictor, frame_queue, video_source=None):
             time.sleep(0.1)
             continue
 
         # Always process frame (for smooth video - shows all frames with last predictions)
         try:
             results = predictor.process_frame(frame, frame_idx, last_results)
             alerts = results[0]
             processed_frame = results[1]
             was_processed = results[2]
 
             # Update last results if we got new predictions
             if was_processed:
                 last_results = results
             # If not processed, we still use last_results for drawing (already handled in process_frame)
 
         except Exception as e:
-            logger.error(f"Error processing frame: {e}")
-            processed_frame = frame
-            alerts = {}
-            was_processed = False
+            logger.error(f"Error processing frame: {e}", exc_info=True)
+            # On error, show raw frame with last predictions if available
+            if last_results:
+                try:
+                    last_alerts = last_results[0]
+                    last_face_data = last_results[5] if len(last_results) > 5 else {'present': False, 'perclos': 0, 'head_yaw': 0}
+                    last_detections = last_results[6] if len(last_results) > 6 else {'bboxes': np.array([]), 'confs': np.array([]), 'classes': np.array([])}
+                    processed_frame = predictor.draw_detections(frame, last_detections, last_face_data, last_alerts)
+                except:
+                    processed_frame = frame
+            else:
+                processed_frame = frame
 
         frame_idx += 1
 
         # Convert to RGB for display
         frame_rgb = cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB)
 
         # Always put frame in queue (smooth video - all frames shown)
         try:
             frame_queue.put_nowait(frame_rgb)
         except queue.Full:
             # If queue is full, replace oldest frame
             try:
                 frame_queue.get_nowait()
                 frame_queue.put_nowait(frame_rgb)
             except queue.Empty:
                 pass
 
         # Frame rate control
         if video_source is not None:
             fps = cap.get(cv2.CAP_PROP_FPS) or 30
             time.sleep(1.0 / fps)
         else:
             # For camera, target 30 FPS (smooth video)
             time.sleep(0.033)
 
    cap.release()
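
The queue handling above is a drop-oldest strategy: when the display queue is full, the capture loop evicts the stalest frame instead of blocking, so the UI always shows recent frames. A self-contained sketch using only the standard library (the frame integers stand in for RGB arrays; peeking at q.queue is a CPython-only convenience for the demo print):

    import queue

    frame_queue = queue.Queue(maxsize=2)

    def put_latest(q, item):
        try:
            q.put_nowait(item)
        except queue.Full:
            try:
                q.get_nowait()      # drop the oldest frame
                q.put_nowait(item)  # enqueue the newest one
            except queue.Empty:
                pass                # consumer emptied it first; just move on

    for frame in range(5):
        put_latest(frame_queue, frame)
    print(list(frame_queue.queue))  # -> [3, 4]
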
@@ -683,7 +754,7 @@ with col1:
     else:
         try:
             frame = frame_queue.get_nowait()
-            video_placeholder.image(frame, channels='RGB', use_container_width=True)
+            video_placeholder.image(frame, channels='RGB', width='stretch')
         except queue.Empty:
             video_placeholder.info("Waiting for camera feed...")