import streamlit as st
import cv2
import numpy as np
import threading
import time
import logging
import os
import queue
from datetime import datetime
import yaml
from ultralytics import YOLO
import mediapipe as mp
from roboflow import Roboflow
from sklearn.ensemble import IsolationForest
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
import torch
import onnxruntime as ort  # For quantized inference

# Setup logging for traceability
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[logging.FileHandler('predictions.log'), logging.StreamHandler()])
logger = logging.getLogger(__name__)

# Config (save as config.yaml or inline)
CONFIG = {
    'yolo_base': 'yolov8n.pt',  # COCO pretrained
    'conf_threshold': 0.7,
    'perclos_threshold': 0.35,
    'distraction_duration': 3,  # seconds
    'ttc_threshold': 2.5,  # for FCW
    'speed_limit': 60,  # km/h sim
    'min_tailgate_dist': 5,  # meters est
    'roboflow_api_key': 'gwfyWZIBeb6RIQfbU4ha',  # Replace
    'videomae_model': 'MCG-NJU/videomae-base',
    'inference_skip': 3,  # Frames between inferences
}
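
# Optional: override the inline defaults from config.yaml, as the comment above suggests.
# This is a minimal sketch, assuming a flat config.yaml with the same keys as CONFIG
# (the file name and merge behaviour are assumptions, not part of the original setup).
if os.path.exists('config.yaml'):
    with open('config.yaml') as f:
        CONFIG.update(yaml.safe_load(f) or {})
    logger.info("Loaded config overrides from config.yaml")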


@st.cache_resource
def load_models():
    """Load all pre-trained models efficiently."""
    # YOLO Base (vehicles, peds, phones)
    yolo_base = YOLO(CONFIG['yolo_base'])
    # Export to ONNX only if the file doesn't exist (int8 quantization is not supported by the Ultralytics ONNX export)
    onnx_path = 'yolov8n.onnx'
    if not os.path.exists(onnx_path):
        yolo_base.export(format='onnx', simplify=True)  # Simplify for faster inference
        logger.info(f"Exported YOLO to {onnx_path}")
    yolo_session = ort.InferenceSession(onnx_path)

    # Seatbelt (Roboflow pretrained)
    rf = Roboflow(api_key=CONFIG['roboflow_api_key'])
    seatbelt_project = rf.workspace('karan-panja').project('seat-belt-detection-uhqwa')
    seatbelt_model = seatbelt_project.version(1).model

    # VideoMAE for actions (zero-shot) - DISABLED: too heavy for low-spec/Raspberry Pi.
    # JIT scripting fails with transformers, and the model is too large for edge devices.
    # TODO: Replace with lightweight MediaPipe Pose-based action detection
    processor = None
    videomae = None
    logger.warning("VideoMAE disabled - too heavy for low-spec CPUs. Action recognition will use face analysis only.")

    # MediaPipe for face/PERCLOS
    mp_face_mesh = mp.solutions.face_mesh
    face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, refine_landmarks=True)

    # Isolation Forest for anomalies - trained with dummy data for now
    # TODO: Replace with real training data from normal driving scenarios
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    # Train with dummy "normal" data (3 features: perclos, phone_action, avg_confidence)
    # Normal values: low perclos (<0.3), no phone (0), good confidence (>0.5)
    dummy_normal_data = np.random.rand(100, 3) * np.array([0.3, 0.1, 0.3]) + np.array([0.0, 0.0, 0.5])
    iso_forest.fit(dummy_normal_data)
    logger.info("Isolation Forest trained with dummy data (replace with real training data)")

    return yolo_session, seatbelt_model, (processor, videomae), face_mesh, iso_forest


class RealTimePredictor:
    def __init__(self):
        self.yolo_session, self.seatbelt_model, self.videomae, self.face_mesh, self.iso_forest = load_models()
        self.frame_buffer = []  # For temporal (last 10 frames)
        self.alert_states = {alert: False for alert in [
            'Drowsiness', 'Distraction', 'Smoking', 'No Seatbelt', 'Driver Absent',
            'FCW', 'LDW', 'Pedestrian', 'Hard Braking', 'Hard Acceleration',
            'Tailgating', 'Overspeed'
        ]}
        self.last_inference = 0
        self.logs = []

    def preprocess_frame(self, frame):
        """Resize and normalize for speed."""
        frame = cv2.resize(frame, (640, 480))
        return frame

    def detect_objects(self, frame):
        """YOLO for vehicles, peds, phones via ONNX Runtime."""
        # YOLO expects a square 640x640 input in BCHW format (batch, channels, height, width);
        # the incoming frame is HWC (480, 640, 3) after preprocess_frame
        yolo_input = cv2.resize(frame, (640, 640))
        # Convert HWC to CHW: (640, 640, 3) -> (3, 640, 640)
        yolo_input = yolo_input.transpose(2, 0, 1)
        # Add batch dimension and normalize: (3, 640, 640) -> (1, 3, 640, 640)
        yolo_input = yolo_input[None].astype(np.float32) / 255.0
        input_name = self.yolo_session.get_inputs()[0].name
        outputs = self.yolo_session.run(None, {input_name: yolo_input})
        # YOLOv8 ONNX output format: (1, 84, 8400) = (batch, features, detections)
        # Features: 4 (bbox cx, cy, w, h) + 80 (COCO class scores) = 84; 8400 anchor points
        output = outputs[0]
        # Convert boxes from center format to (x1, y1, x2, y2) and scale from 640x640
        # back to the preprocessed frame size so overlays line up
        xywh = output[0, :4, :].transpose()  # (8400, 4)
        bboxes = np.empty_like(xywh)
        bboxes[:, 0] = xywh[:, 0] - xywh[:, 2] / 2
        bboxes[:, 1] = xywh[:, 1] - xywh[:, 3] / 2
        bboxes[:, 2] = xywh[:, 0] + xywh[:, 2] / 2
        bboxes[:, 3] = xywh[:, 1] + xywh[:, 3] / 2
        h, w = frame.shape[:2]
        bboxes = bboxes * np.array([w / 640.0, h / 640.0, w / 640.0, h / 640.0])
        # Class scores: features 4:84 -> (80, 8400)
        class_scores = output[0, 4:, :]
        classes = np.argmax(class_scores, axis=0)  # (8400,) class indices
        confs = np.max(class_scores, axis=0)  # (8400,) confidence scores
        # Filter by confidence threshold (note: no NMS is applied, so overlapping boxes may remain)
        high_conf = confs > CONFIG['conf_threshold']
        return {'bboxes': bboxes[high_conf], 'confs': confs[high_conf], 'classes': classes[high_conf]}

    def detect_seatbelt(self, frame):
        """Roboflow seatbelt."""
        # Roboflow's predict() takes confidence on a 0-100 scale
        predictions = self.seatbelt_model.predict(frame, confidence=int(CONFIG['conf_threshold'] * 100)).json()
        has_belt = any(p['class'] == 'with_mask' for p in predictions['predictions'])  # Adapt class name to the dataset
        conf = predictions['predictions'][0]['confidence'] if predictions['predictions'] else 0
        return has_belt, conf

    def analyze_face(self, frame):
        """MediaPipe PERCLOS, head pose, absence."""
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.face_mesh.process(rgb)
        if not results.multi_face_landmarks:
            return {'perclos': 0, 'head_pose': [0, 0, 0], 'absent': True, 'conf': 0}
        landmarks = results.multi_face_landmarks[0].landmark
        # PERCLOS proxy: vertical lid gap over horizontal eye width (EAR-style ratio).
        # The landmark indices and the 0.25 "fully open" scale are rough values needing calibration.
        left_open = abs(landmarks[159].y - landmarks[145].y) / (abs(landmarks[33].x - landmarks[133].x) + 1e-6)
        right_open = abs(landmarks[386].y - landmarks[374].y) / (abs(landmarks[362].x - landmarks[263].x) + 1e-6)
        perclos = float(np.clip(1 - ((left_open + right_open) / 2) / 0.25, 0, 1))  # 1 = eyes closed
        # Head pose (simplified yaw for looking away)
        yaw = (landmarks[454].x - landmarks[323].x) * 100  # Rough estimate
        return {'perclos': perclos, 'head_pose': [0, yaw, 0], 'absent': False, 'conf': 0.9}

    def recognize_actions(self, buffer):
        """Action recognition - VideoMAE disabled, using a placeholder for now."""
        # TODO: Implement lightweight action detection using MediaPipe Pose
        # For now, return zeros (actions are inferred via face analysis in validate_alerts)
        return {'yawn': 0, 'phone': 0, 'look_away': 0}
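
    # The TODO above calls for a lightweight MediaPipe Pose-based replacement for VideoMAE.
    # The helper below is a minimal, unwired sketch of one way that could look (the method name,
    # the model_complexity setting, and the 0.15 wrist-to-ear threshold are assumptions, not part
    # of the original pipeline): a wrist landing near an ear is treated as likely phone use, which
    # could feed actions['phone'] instead of the zero placeholder returned by recognize_actions.
    def estimate_phone_use_from_pose(self, frame):
        """Hypothetical helper: rough phone-use score from MediaPipe Pose landmarks."""
        mp_pose = mp.solutions.pose
        # For a real integration, construct the Pose object once in __init__ instead of per call
        with mp_pose.Pose(static_image_mode=False, model_complexity=0) as pose:
            results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if not results.pose_landmarks:
            return 0.0
        lm = results.pose_landmarks.landmark
        # Normalized-coordinate distance between each wrist and the ear on the same side
        pairs = [(mp_pose.PoseLandmark.LEFT_WRIST, mp_pose.PoseLandmark.LEFT_EAR),
                 (mp_pose.PoseLandmark.RIGHT_WRIST, mp_pose.PoseLandmark.RIGHT_EAR)]
        dists = [np.hypot(lm[w].x - lm[e].x, lm[w].y - lm[e].y) for w, e in pairs]
        return 1.0 if min(dists) < 0.15 else 0.0  # Threshold is a guess; calibrate per camera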

    def optical_flow(self, prev_frame, curr_frame):
        """OpenCV dense optical flow for speed, braking, accel estimation."""
        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
        curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
        # Farneback dense optical flow (full-frame flow field)
        flow = cv2.calcOpticalFlowFarneback(prev_gray, curr_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
        # Mean magnitude of the flow vectors
        magnitude = np.sqrt(flow[..., 0] ** 2 + flow[..., 1] ** 2)
        return np.mean(magnitude)  # High = accel/braking; est. speed ~ magnitude * scale (needs calibration)

    def estimate_distance(self, bboxes):
        """Simple bbox size for tailgating/FCW distance estimate (calibration needed)."""
        if len(bboxes) == 0:
            return float('inf')
        areas = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
        return 10 / np.sqrt(np.max(areas))  # Inverse sqrt for distance (rough)

    def detect_anomaly(self, features):
        """Flag unusual feature combinations (low confidence situations)."""
        pred = self.iso_forest.predict(features.reshape(1, -1))[0]
        return 1 if pred == -1 else 0

    def validate_alerts(self, frame, prev_frame, detections, face_data, actions, seatbelt, flow_mag, buffer):
        """Rule-based validation for all alerts."""
        features = np.array([face_data['perclos'], actions['phone'],
                             detections['confs'].mean() if len(detections['confs']) else 0])
        anomaly = self.detect_anomaly(features)
        results = {}
        timestamp = datetime.now().isoformat()

        # DSMS
        drowsy = (face_data['perclos'] > CONFIG['perclos_threshold']) and (actions['yawn'] > CONFIG['conf_threshold'])
        results['Drowsiness'] = drowsy and not anomaly
        distraction = (actions['phone'] > CONFIG['conf_threshold']) or (abs(face_data['head_pose'][1]) > 20)
        results['Distraction'] = distraction and not anomaly
        # COCO has no cigarette class; class 67 (cell phone) is used as a rough hand-to-mouth proxy
        # until a custom smoking detector is trained
        smoke = 67 in detections['classes']
        results['Smoking'] = smoke and detections['confs'][detections['classes'] == 67].max() > CONFIG['conf_threshold']
        results['No Seatbelt'] = not seatbelt[0] and seatbelt[1] > CONFIG['conf_threshold']
        results['Driver Absent'] = face_data['absent']

        # ADAS (heuristics)
        vehicles = sum(1 for c in detections['classes'] if c == 2)  # COCO car class
        peds = sum(1 for c in detections['classes'] if c == 0)  # COCO person class
        dist_est = self.estimate_distance(detections['bboxes'][detections['classes'] == 2])
        ttc = dist_est / (flow_mag + 1e-5) if flow_mag > 0 else float('inf')  # Rough TTC
        results['FCW'] = (ttc < CONFIG['ttc_threshold']) and vehicles > 0
        results['Tailgating'] = (dist_est < CONFIG['min_tailgate_dist']) and vehicles > 0
        results['Pedestrian'] = peds > 0 and detections['confs'][detections['classes'] == 0].max() > CONFIG['conf_threshold']

        # LDW: simple edge/line detection for lanes (OpenCV)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 50, 150)
        lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 100, minLineLength=100)
        in_lane = len(lines) > 2 if lines is not None else False  # Basic: many lines = on lane
        results['LDW'] = not in_lane

        # Braking/Accel/Overspeed via flow magnitude
        # Note: flow_mag is a scalar (mean magnitude); direction detection needs the full flow array.
        # For now, use magnitude thresholds - TODO: add direction analysis for better detection
        speed_est = flow_mag * 0.1  # km/h proxy (needs calibration)
        braking = flow_mag > 15  # High magnitude suggests a sudden change
        accel = 12 < flow_mag < 15  # Moderate-high magnitude
        results['Hard Braking'] = braking
        results['Hard Acceleration'] = accel
        results['Overspeed'] = speed_est > CONFIG['speed_limit']

        # Log everything for traceability
        log_entry = f"{timestamp} | Features: {features} | Anomaly: {anomaly} | Alerts: {results}"
        logger.info(log_entry)
        self.logs.append(log_entry[-100:])  # Last 100 chars for display

        # Update states (sustain while true)
        for alert, triggered in results.items():
            if triggered:
                self.alert_states[alert] = True
            elif time.time() - self.last_inference > CONFIG['distraction_duration']:
                self.alert_states[alert] = False
        return results

    def run_inference(self, frame, prev_frame, buffer, frame_idx):
        """Full pipeline every N frames."""
        if frame_idx % CONFIG['inference_skip'] != 0:
            return {}, frame
        start = time.time()
        frame = self.preprocess_frame(frame)
        detections = self.detect_objects(frame)
        seatbelt = self.detect_seatbelt(frame)
        face_data = self.analyze_face(frame)
        buffer.append(frame)
        del buffer[:-10]  # Trim in place so the caller's buffer keeps only the last 10 frames
        actions = self.recognize_actions(buffer)
        flow_mag = self.optical_flow(prev_frame, frame) if prev_frame is not None else 0
        alerts = self.validate_alerts(frame, prev_frame, detections, face_data, actions, seatbelt, flow_mag, buffer)
        self.last_inference = time.time()

        # Overlay detections
        for i, bbox in enumerate(detections['bboxes']):
            x1, y1, x2, y2 = map(int, bbox)
            label = f"{detections['classes'][i]}:{detections['confs'][i]:.2f}"
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Overlay active alerts
        for alert, active in self.alert_states.items():
            if active:
                cv2.putText(frame, f"ALERT: {alert}",
                            (10, 30 + list(self.alert_states.keys()).index(alert) * 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
        logger.info(f"Inference time: {time.time() - start:.2f}s")
        return alerts, frame


def video_loop(predictor, frame_queue):
    """Threaded capture - puts frames in a queue for the main thread to display."""
    cap = cv2.VideoCapture(0)  # Webcam; on RPi/Jetson use a GStreamer pipeline such as:
    # 'nvarguscamerasrc ! video/x-raw(memory:NVMM), width=640, height=480, framerate=30/1 !
    #  nvvidconv ! video/x-raw, format=BGRx ! videoconvert ! video/x-raw, format=BGR ! appsink'
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    cap.set(cv2.CAP_PROP_FPS, 30)
    prev_frame = None
    buffer = []
    frame_idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            time.sleep(0.1)
            continue
        alerts, frame = predictor.run_inference(frame, prev_frame, buffer, frame_idx)
        prev_frame = frame.copy()
        frame_idx += 1
        # BGR to RGB for Streamlit
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Put the frame in the queue (non-blocking; drop the oldest frame if the queue is full)
        try:
            frame_queue.put_nowait(frame_rgb)
        except queue.Full:
            try:
                frame_queue.get_nowait()
                frame_queue.put_nowait(frame_rgb)
            except queue.Empty:
                pass
        time.sleep(0.033)  # ~30 FPS cap


# Streamlit UI
st.title("🚗 Real-Time DSMS/ADAS Validator")
st.sidebar.title("Active Alerts")

# Initialize predictor
if 'predictor' not in st.session_state:
    st.session_state.predictor = RealTimePredictor()
    st.session_state.frame_queue = queue.Queue(maxsize=2)  # Small queue to avoid lag
    st.session_state.video_thread = None

predictor = st.session_state.predictor
frame_queue = st.session_state.frame_queue

# Start the video thread if it is not running
if st.session_state.video_thread is None or not st.session_state.video_thread.is_alive():
    st.session_state.video_thread = threading.Thread(
        target=video_loop, args=(predictor, frame_queue), daemon=True
    )
    st.session_state.video_thread.start()

# Main video display
video_placeholder = st.empty()

# Get the latest frame from the queue and display it
try:
    frame = frame_queue.get_nowait()
    video_placeholder.image(frame, channels='RGB', use_container_width=True)
except queue.Empty:
    # No frame available yet, show a placeholder
    video_placeholder.info("Waiting for camera feed...")

# Sidebar: Alerts & Logs
with st.sidebar:
    st.subheader("Alerts")
    for alert, active in predictor.alert_states.items():
        st.write(f"{'🔴' if active else '🟢'} {alert}")
    st.subheader("Recent Logs (Traceable)")
    for log in predictor.logs[-10:]:
        st.text(log)

st.info("👆 Alerts trigger only on high conf + rules. Check `predictions.log` for full traces. "
        "Calibrate distances/speeds for your setup.")

# Auto-refresh to update the video feed
time.sleep(0.033)  # ~30 FPS
st.rerun()