diff --git a/src/poc_demo.py b/src/poc_demo.py
index 32a4a7fe7..3d62af0aa 100644
--- a/src/poc_demo.py
+++ b/src/poc_demo.py
@@ -208,34 +208,57 @@ class POCPredictor:
         self.logs = []

     def detect_objects(self, frame):
-        """YOLO object detection - optimized for POC."""
-        # Resize to square for YOLO
-        yolo_input = cv2.resize(frame, (640, 640))
-
-        # Convert HWC to CHW
-        yolo_input = yolo_input.transpose(2, 0, 1)
-        yolo_input = yolo_input[None].astype(np.float32) / 255.0
-
-        # Run inference
-        input_name = self.yolo_session.get_inputs()[0].name
-        outputs = self.yolo_session.run(None, {input_name: yolo_input})
-
-        # Parse YOLOv8 ONNX output: (1, 84, 8400)
-        output = outputs[0]
-        bboxes = output[0, :4, :].transpose()  # (8400, 4)
-        class_scores = output[0, 4:, :]  # (80, 8400)
-        classes = np.argmax(class_scores, axis=0)
-        confs = np.max(class_scores, axis=0)
-
-        # Filter by confidence and relevant classes (phone and person)
-        relevant_classes = [0, 67]  # person, cell phone
-        mask = (confs > CONFIG['conf_threshold']) & np.isin(classes, relevant_classes)
-
-        return {
-            'bboxes': bboxes[mask],
-            'confs': confs[mask],
-            'classes': classes[mask]
-        }
+        """YOLO object detection - optimized for POC with performance improvements."""
+        try:
+            # Resize to square for YOLO (use INTER_LINEAR for speed)
+            yolo_input = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_LINEAR)
+
+            # Convert HWC to CHW (optimized)
+            yolo_input = yolo_input.transpose(2, 0, 1)
+            yolo_input = np.ascontiguousarray(yolo_input[None].astype(np.float32) / 255.0)
+
+            # Run inference
+            input_name = self.yolo_session.get_inputs()[0].name
+            outputs = self.yolo_session.run(None, {input_name: yolo_input})
+
+            # Parse YOLOv8 ONNX output: (1, 84, 8400)
+            output = outputs[0]
+            bboxes = output[0, :4, :].transpose()  # (8400, 4)
+            class_scores = output[0, 4:, :]  # (80, 8400)
+            classes = np.argmax(class_scores, axis=0).astype(np.int32)  # Ensure int32
+            confs = np.max(class_scores, axis=0)
+
+            # Filter by confidence and relevant classes (phone and person)
+            relevant_classes = np.array([0, 67], dtype=np.int32)  # person, cell phone
+            conf_mask = confs > CONFIG['conf_threshold']
+            class_mask = np.isin(classes, relevant_classes)
+            mask = conf_mask & class_mask
+
+            # Ensure mask is boolean and arrays are properly indexed
+            mask = mask.astype(bool)
+
+            # Get indices where mask is True
+            valid_indices = np.where(mask)[0]
+
+            if len(valid_indices) > 0:
+                return {
+                    'bboxes': bboxes[valid_indices],
+                    'confs': confs[valid_indices],
+                    'classes': classes[valid_indices]
+                }
+            else:
+                return {
+                    'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                    'confs': np.array([], dtype=np.float32),
+                    'classes': np.array([], dtype=np.int32)
+                }
+        except Exception as e:
+            logger.error(f"Error in detect_objects: {e}")
+            return {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }

     def analyze_face(self, frame):
         """OpenCV face analysis - NO MediaPipe!"""
@@ -286,17 +309,23 @@ class POCPredictor:
         return has_seatbelt, confidence

     def process_frame(self, frame, frame_idx, last_results=None):
-        """Process single frame - streamlined and optimized."""
+        """Process single frame - streamlined and optimized with smooth video support."""
         should_process = (frame_idx % CONFIG['inference_skip'] == 0)

-        # If not processing this frame, return last results
+        # Always use last results for smooth video (even if not processing this frame)
         if not should_process and last_results is not None:
             last_alerts = last_results[0]
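+            # Result tuple layout (matches the return at the end of this method):
+            # (alerts, annotated_frame, was_processed, seatbelt, belt_conf, face_data, detections)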
-            last_face_data = last_results[1]
-            annotated = self.draw_detections(frame, {'bboxes': [], 'confs': [], 'classes': []},
-                                             last_face_data, last_alerts)
-            return last_alerts, annotated, False, last_face_data
+            last_face_data = last_results[5] if len(last_results) > 5 else {'present': False, 'perclos': 0, 'head_yaw': 0}
+            last_detections = last_results[6] if len(last_results) > 6 else {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }
+            # Draw last predictions on current frame for smooth video
+            annotated = self.draw_detections(frame, last_detections, last_face_data, last_alerts)
+            return last_alerts, annotated, False, last_results[3] if len(last_results) > 3 else False, \
+                last_results[4] if len(last_results) > 4 else 0.0, last_face_data, last_detections

         # Process this frame
         start_time = time.time()
@@ -306,7 +335,11 @@ class POCPredictor:

         if not face_data['present']:
             alerts = {'Driver Absent': True}
-            detections = {'bboxes': [], 'confs': [], 'classes': []}
+            detections = {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }
             seatbelt, belt_conf = False, 0.0
         else:
             # Run object detection
@@ -327,7 +360,14 @@ class POCPredictor:
         alerts['Drowsiness'] = face_data['perclos'] > CONFIG['perclos_threshold']
         alerts['Distraction'] = abs(face_data['head_yaw']) > (CONFIG['head_pose_threshold'] * 0.8)
         alerts['Driver Absent'] = not face_data['present']
-        alerts['Phone Detected'] = np.any(detections['classes'] == 67) if len(detections['classes']) > 0 else False
+        # Safe check for phone detection
+        phone_detected = False
+        if len(detections['classes']) > 0:
+            try:
+                phone_detected = np.any(detections['classes'] == 67)
+            except Exception:
+                phone_detected = False
+        alerts['Phone Detected'] = phone_detected
         alerts['No Seatbelt'] = not seatbelt and belt_conf > 0.3

         # Update states with temporal smoothing (clear alerts when condition stops)
@@ -363,34 +403,48 @@ class POCPredictor:
         if len(self.logs) > CONFIG['max_logs']:
             self.logs = self.logs[-CONFIG['max_logs']:]

-        return alerts, annotated_frame, True, seatbelt, belt_conf, face_data
+        return alerts, annotated_frame, True, seatbelt, belt_conf, face_data, detections

     def draw_detections(self, frame, detections, face_data, alerts):
         """Draw detections and alerts on frame."""
         annotated = frame.copy()
         h, w = annotated.shape[:2]

-        # Draw bounding boxes
-        for i, (bbox, conf, cls) in enumerate(zip(detections['bboxes'], detections['confs'], detections['classes'])):
-            # Scale bbox from 640x640 to frame size
-            x1, y1, x2, y2 = bbox
-            x1, x2 = int(x1 * w / 640), int(x2 * w / 640)
-            y1, y2 = int(y1 * h / 640), int(y2 * h / 640)
-
-            # Color by class
-            if cls == 0:  # person
-                color = (0, 255, 0)  # Green
-                label = "Person"
-            elif cls == 67:  # phone
-                color = (255, 0, 255)  # Magenta
-                label = "Phone"
-            else:
-                color = (255, 255, 0)  # Cyan
-                label = "Object"
-
-            cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
-            cv2.putText(annotated, f"{label}: {conf:.2f}", (x1, y1-10),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
+        # Draw bounding boxes (safe iteration)
+        if len(detections['bboxes']) > 0 and len(detections['confs']) > 0 and len(detections['classes']) > 0:
+            try:
+                # Ensure all arrays have same length
+                min_len = min(len(detections['bboxes']), len(detections['confs']), len(detections['classes']))
+                for i in range(min_len):
+                    bbox = detections['bboxes'][i]
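+                    # Cast NumPy scalars to plain Python float/int before drawing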
+                    conf = float(detections['confs'][i])
+                    cls = int(detections['classes'][i])
+
+                    # Scale bbox from 640x640 to frame size
+                    x1, y1, x2, y2 = bbox
+                    x1, x2 = int(x1 * w / 640), int(x2 * w / 640)
+                    y1, y2 = int(y1 * h / 640), int(y2 * h / 640)
+
+                    # Ensure coordinates are valid
+                    x1, x2 = max(0, min(x1, w)), max(0, min(x2, w))
+                    y1, y2 = max(0, min(y1, h)), max(0, min(y2, h))
+
+                    # Color by class
+                    if cls == 0:  # person
+                        color = (0, 255, 0)  # Green
+                        label = "Person"
+                    elif cls == 67:  # phone
+                        color = (255, 0, 255)  # Magenta
+                        label = "Phone"
+                    else:
+                        color = (255, 255, 0)  # Cyan
+                        label = "Object"
+
+                    cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
+                    cv2.putText(annotated, f"{label}: {conf:.2f}", (x1, max(y1-10, 10)),
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
+            except Exception as e:
+                logger.warning(f"Error drawing detections: {e}")

         # Draw face status
         if face_data['present']:
@@ -457,37 +511,54 @@ def video_capture_loop(predictor, frame_queue, video_source=None):
                 time.sleep(0.1)
                 continue

+        # Always process frame (for smooth video - shows all frames with last predictions)
         try:
             results = predictor.process_frame(frame, frame_idx, last_results)
             alerts = results[0]
             processed_frame = results[1]
             was_processed = results[2]

+            # Update last results if we got new predictions
             if was_processed:
                 last_results = results
+            # If not processed, we still use last_results for drawing (already handled in process_frame)
+
         except Exception as e:
-            logger.error(f"Error processing frame: {e}")
-            processed_frame = frame
-            alerts = {}
-            was_processed = False
+            logger.error(f"Error processing frame: {e}", exc_info=True)
+            # On error, show raw frame with last predictions if available
+            alerts = last_results[0] if last_results else {}
+            if last_results:
+                try:
+                    last_alerts = last_results[0]
+                    last_face_data = last_results[5] if len(last_results) > 5 else {'present': False, 'perclos': 0, 'head_yaw': 0}
+                    last_detections = last_results[6] if len(last_results) > 6 else {'bboxes': np.array([]), 'confs': np.array([]), 'classes': np.array([])}
+                    processed_frame = predictor.draw_detections(frame, last_detections, last_face_data, last_alerts)
+                except Exception:
+                    processed_frame = frame
+            else:
+                processed_frame = frame

         frame_idx += 1

+        # Convert to RGB for display
         frame_rgb = cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB)

+        # Always put frame in queue (smooth video - all frames shown)
         try:
             frame_queue.put_nowait(frame_rgb)
         except queue.Full:
+            # If queue is full, replace oldest frame
             try:
                 frame_queue.get_nowait()
                 frame_queue.put_nowait(frame_rgb)
             except queue.Empty:
                 pass

+        # Frame rate control
         if video_source is not None:
             fps = cap.get(cv2.CAP_PROP_FPS) or 30
             time.sleep(1.0 / fps)
         else:
+            # For camera, target 30 FPS (smooth video)
             time.sleep(0.033)

     cap.release()
@@ -683,7 +754,7 @@ with col1:
         else:
             try:
                 frame = frame_queue.get_nowait()
-                video_placeholder.image(frame, channels='RGB', use_container_width=True)
+                video_placeholder.image(frame, channels='RGB', width='stretch')
             except queue.Empty:
                 video_placeholder.info("Waiting for camera feed...")
diff --git a/src/poc_demo_rpi.py b/src/poc_demo_rpi.py
index c2f17eea3..3d62af0aa 100644
--- a/src/poc_demo_rpi.py
+++ b/src/poc_demo_rpi.py
@@ -209,33 +209,56 @@ class POCPredictor:

     def detect_objects(self, frame):
         """YOLO object detection - optimized for POC with performance improvements."""
-        # Resize to square for YOLO (use INTER_LINEAR for speed)
-        yolo_input = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_LINEAR)
-
-        # Convert HWC to CHW (optimized)
-        yolo_input = yolo_input.transpose(2, 0, 1)
-        yolo_input = np.ascontiguousarray(yolo_input[None].astype(np.float32) / 255.0)
-
-        # Run inference
-        input_name = self.yolo_session.get_inputs()[0].name
-        outputs = self.yolo_session.run(None, {input_name: yolo_input})
-
-        # Parse YOLOv8 ONNX output: (1, 84, 8400)
-        output = outputs[0]
-        bboxes = output[0, :4, :].transpose()  # (8400, 4)
-        class_scores = output[0, 4:, :]  # (80, 8400)
-        classes = np.argmax(class_scores, axis=0)
-        confs = np.max(class_scores, axis=0)
-
-        # Filter by confidence and relevant classes (phone and person)
-        relevant_classes = [0, 67]  # person, cell phone
-        mask = (confs > CONFIG['conf_threshold']) & np.isin(classes, relevant_classes)
-
-        return {
-            'bboxes': bboxes[mask],
-            'confs': confs[mask],
-            'classes': classes[mask]
-        }
+        try:
+            # Resize to square for YOLO (use INTER_LINEAR for speed)
+            yolo_input = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_LINEAR)
+
+            # Convert HWC to CHW (optimized)
+            yolo_input = yolo_input.transpose(2, 0, 1)
+            yolo_input = np.ascontiguousarray(yolo_input[None].astype(np.float32) / 255.0)
+
+            # Run inference
+            input_name = self.yolo_session.get_inputs()[0].name
+            outputs = self.yolo_session.run(None, {input_name: yolo_input})
+
+            # Parse YOLOv8 ONNX output: (1, 84, 8400)
+            output = outputs[0]
+            bboxes = output[0, :4, :].transpose()  # (8400, 4)
+            class_scores = output[0, 4:, :]  # (80, 8400)
+            classes = np.argmax(class_scores, axis=0).astype(np.int32)  # Ensure int32
+            confs = np.max(class_scores, axis=0)
+
+            # Filter by confidence and relevant classes (phone and person)
+            relevant_classes = np.array([0, 67], dtype=np.int32)  # person, cell phone
+            conf_mask = confs > CONFIG['conf_threshold']
+            class_mask = np.isin(classes, relevant_classes)
+            mask = conf_mask & class_mask
+
+            # Ensure mask is boolean and arrays are properly indexed
+            mask = mask.astype(bool)
+
+            # Get indices where mask is True
+            valid_indices = np.where(mask)[0]
+
+            if len(valid_indices) > 0:
+                return {
+                    'bboxes': bboxes[valid_indices],
+                    'confs': confs[valid_indices],
+                    'classes': classes[valid_indices]
+                }
+            else:
+                return {
+                    'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                    'confs': np.array([], dtype=np.float32),
+                    'classes': np.array([], dtype=np.int32)
+                }
+        except Exception as e:
+            logger.error(f"Error in detect_objects: {e}")
+            return {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }

     def analyze_face(self, frame):
         """OpenCV face analysis - NO MediaPipe!"""
@@ -286,17 +309,23 @@ class POCPredictor:
         return has_seatbelt, confidence

     def process_frame(self, frame, frame_idx, last_results=None):
-        """Process single frame - streamlined and optimized."""
+        """Process single frame - streamlined and optimized with smooth video support."""
         should_process = (frame_idx % CONFIG['inference_skip'] == 0)

-        # If not processing this frame, return last results
+        # Always use last results for smooth video (even if not processing this frame)
         if not should_process and last_results is not None:
             last_alerts = last_results[0]
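+            # Result tuple layout (matches the return at the end of this method):
+            # (alerts, annotated_frame, was_processed, seatbelt, belt_conf, face_data, detections)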
-            last_face_data = last_results[1]
-            annotated = self.draw_detections(frame, {'bboxes': [], 'confs': [], 'classes': []},
-                                             last_face_data, last_alerts)
-            return last_alerts, annotated, False, last_face_data
+            last_face_data = last_results[5] if len(last_results) > 5 else {'present': False, 'perclos': 0, 'head_yaw': 0}
+            last_detections = last_results[6] if len(last_results) > 6 else {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }
+            # Draw last predictions on current frame for smooth video
+            annotated = self.draw_detections(frame, last_detections, last_face_data, last_alerts)
+            return last_alerts, annotated, False, last_results[3] if len(last_results) > 3 else False, \
+                last_results[4] if len(last_results) > 4 else 0.0, last_face_data, last_detections

         # Process this frame
         start_time = time.time()
@@ -306,7 +335,11 @@ class POCPredictor:

         if not face_data['present']:
             alerts = {'Driver Absent': True}
-            detections = {'bboxes': [], 'confs': [], 'classes': []}
+            detections = {
+                'bboxes': np.array([], dtype=np.float32).reshape(0, 4),
+                'confs': np.array([], dtype=np.float32),
+                'classes': np.array([], dtype=np.int32)
+            }
             seatbelt, belt_conf = False, 0.0
         else:
             # Run object detection
@@ -327,7 +360,14 @@ class POCPredictor:
         alerts['Drowsiness'] = face_data['perclos'] > CONFIG['perclos_threshold']
         alerts['Distraction'] = abs(face_data['head_yaw']) > (CONFIG['head_pose_threshold'] * 0.8)
         alerts['Driver Absent'] = not face_data['present']
-        alerts['Phone Detected'] = np.any(detections['classes'] == 67) if len(detections['classes']) > 0 else False
+        # Safe check for phone detection
+        phone_detected = False
+        if len(detections['classes']) > 0:
+            try:
+                phone_detected = np.any(detections['classes'] == 67)
+            except Exception:
+                phone_detected = False
+        alerts['Phone Detected'] = phone_detected
         alerts['No Seatbelt'] = not seatbelt and belt_conf > 0.3

         # Update states with temporal smoothing (clear alerts when condition stops)
@@ -363,34 +403,48 @@ class POCPredictor:
         if len(self.logs) > CONFIG['max_logs']:
             self.logs = self.logs[-CONFIG['max_logs']:]

-        return alerts, annotated_frame, True, seatbelt, belt_conf, face_data
+        return alerts, annotated_frame, True, seatbelt, belt_conf, face_data, detections

     def draw_detections(self, frame, detections, face_data, alerts):
         """Draw detections and alerts on frame."""
         annotated = frame.copy()
         h, w = annotated.shape[:2]

-        # Draw bounding boxes
-        for i, (bbox, conf, cls) in enumerate(zip(detections['bboxes'], detections['confs'], detections['classes'])):
-            # Scale bbox from 640x640 to frame size
-            x1, y1, x2, y2 = bbox
-            x1, x2 = int(x1 * w / 640), int(x2 * w / 640)
-            y1, y2 = int(y1 * h / 640), int(y2 * h / 640)
-
-            # Color by class
-            if cls == 0:  # person
-                color = (0, 255, 0)  # Green
-                label = "Person"
-            elif cls == 67:  # phone
-                color = (255, 0, 255)  # Magenta
-                label = "Phone"
-            else:
-                color = (255, 255, 0)  # Cyan
-                label = "Object"
-
-            cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
-            cv2.putText(annotated, f"{label}: {conf:.2f}", (x1, y1-10),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
+        # Draw bounding boxes (safe iteration)
+        if len(detections['bboxes']) > 0 and len(detections['confs']) > 0 and len(detections['classes']) > 0:
+            try:
+                # Ensure all arrays have same length
+                min_len = min(len(detections['bboxes']), len(detections['confs']), len(detections['classes']))
+                for i in range(min_len):
+                    bbox = detections['bboxes'][i]
+                    conf = float(detections['confs'][i])
+                    cls = int(detections['classes'][i])
+
+                    # Scale bbox from 640x640 to frame size
+                    x1, y1, x2, y2 = bbox
+                    x1, x2 = int(x1 * w / 640), int(x2 * w / 640)
+                    y1, y2 = int(y1 * h / 640), int(y2 * h / 640)
+
+                    # Ensure coordinates are valid
+                    x1, x2 = max(0, min(x1, w)), max(0, min(x2, w))
+                    y1, y2 = max(0, min(y1, h)), max(0, min(y2, h))
+
+                    # Color by class
+                    if cls == 0:  # person
+                        color = (0, 255, 0)  # Green
+                        label = "Person"
+                    elif cls == 67:  # phone
+                        color = (255, 0, 255)  # Magenta
+                        label = "Phone"
+                    else:
+                        color = (255, 255, 0)  # Cyan
+                        label = "Object"
+
+                    cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
+                    cv2.putText(annotated, f"{label}: {conf:.2f}", (x1, max(y1-10, 10)),
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
+            except Exception as e:
+                logger.warning(f"Error drawing detections: {e}")

         # Draw face status
         if face_data['present']:
@@ -457,37 +511,54 @@ def video_capture_loop(predictor, frame_queue, video_source=None):
                 time.sleep(0.1)
                 continue

+        # Always process frame (for smooth video - shows all frames with last predictions)
         try:
             results = predictor.process_frame(frame, frame_idx, last_results)
             alerts = results[0]
             processed_frame = results[1]
             was_processed = results[2]

+            # Update last results if we got new predictions
             if was_processed:
                 last_results = results
+            # If not processed, we still use last_results for drawing (already handled in process_frame)
+
         except Exception as e:
-            logger.error(f"Error processing frame: {e}")
-            processed_frame = frame
-            alerts = {}
-            was_processed = False
+            logger.error(f"Error processing frame: {e}", exc_info=True)
+            # On error, show raw frame with last predictions if available
+            alerts = last_results[0] if last_results else {}
+            if last_results:
+                try:
+                    last_alerts = last_results[0]
+                    last_face_data = last_results[5] if len(last_results) > 5 else {'present': False, 'perclos': 0, 'head_yaw': 0}
+                    last_detections = last_results[6] if len(last_results) > 6 else {'bboxes': np.array([]), 'confs': np.array([]), 'classes': np.array([])}
+                    processed_frame = predictor.draw_detections(frame, last_detections, last_face_data, last_alerts)
+                except Exception:
+                    processed_frame = frame
+            else:
+                processed_frame = frame

         frame_idx += 1

+        # Convert to RGB for display
         frame_rgb = cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB)

+        # Always put frame in queue (smooth video - all frames shown)
         try:
             frame_queue.put_nowait(frame_rgb)
         except queue.Full:
+            # If queue is full, replace oldest frame
             try:
                 frame_queue.get_nowait()
                 frame_queue.put_nowait(frame_rgb)
             except queue.Empty:
                 pass

+        # Frame rate control
         if video_source is not None:
             fps = cap.get(cv2.CAP_PROP_FPS) or 30
             time.sleep(1.0 / fps)
         else:
+            # For camera, target 30 FPS (smooth video)
             time.sleep(0.033)

     cap.release()
@@ -683,7 +754,7 @@ with col1:
         else:
             try:
                 frame = frame_queue.get_nowait()
-                video_placeholder.image(frame, channels='RGB', use_container_width=True)
+                video_placeholder.image(frame, channels='RGB', width='stretch')
             except queue.Empty:
                 video_placeholder.info("Waiting for camera feed...")