YouTube Live Stream Audio Describer

Real-time AI-powered audio descriptions for YouTube live streams to assist blind users

Size

17.0 KB

Version

1.1.2

Created

Mar 12, 2026

Updated

about 1 month ago

1// ==UserScript==
2// @name		YouTube Live Stream Audio Describer
3// @description		Real-time AI-powered audio descriptions for YouTube live streams to assist blind users
4// @version		1.1.2
5// @match		https://*.youtube.com/*
6// @icon		https://www.youtube.com/s/desktop/38f8912f/img/favicon_32x32.png
7// ==/UserScript==
(function() {
    'use strict';

    // Configuration — captureInterval and speechRate are tunable at runtime
    // via the control panel sliders; the rest are fixed defaults.
    const CONFIG = {
        captureInterval: 8000, // Capture frame every 8 seconds
        maxDescriptionLength: 150, // Word cap inserted into the AI prompt
        speechRate: 1.0,   // Web Speech API rate (panel slider allows 0.5–2)
        speechVolume: 1.0, // Web Speech API volume
        speechPitch: 1.0   // Web Speech API pitch
    };

    // State management — shared closure state for the whole script
    let isDescribing = false;     // true while the describe loop is running
    let captureIntervalId = null; // setInterval handle for the describe loop
    let lastDescription = '';     // last spoken text, used to skip duplicates
    let videoElement = null;      // the <video> currently being described
    let isSpeaking = false;       // true while the speech queue is draining
    let descriptionQueue = [];    // descriptions waiting to be spoken
    let currentVideoUrl = '';     // last seen URL, for SPA navigation detection
29    // Utility: Debounce function
30    function debounce(func, wait) {
31        let timeout;
32        return function executedFunction(...args) {
33            const later = () => {
34                clearTimeout(timeout);
35                func(...args);
36            };
37            clearTimeout(timeout);
38            timeout = setTimeout(later, wait);
39        };
40    }
41
42    // Capture video frame as base64 image
43    function captureVideoFrame(video) {
44        try {
45            const canvas = document.createElement('canvas');
46            canvas.width = video.videoWidth || 640;
47            canvas.height = video.videoHeight || 360;
48            
49            const ctx = canvas.getContext('2d');
50            ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
51            
52            // Convert to base64 with reduced quality for faster processing
53            const dataUrl = canvas.toDataURL('image/jpeg', 0.7);
54            console.log('Frame captured successfully, size:', canvas.width, 'x', canvas.height);
55            return dataUrl;
56        } catch (error) {
57            console.error('Error capturing video frame:', error);
58            return null;
59        }
60    }
61
62    // Analyze frame with AI
63    async function analyzeFrame(frameDataUrl) {
64        try {
65            console.log('Sending frame to AI for analysis...');
66            
67            const prompt = `You are an audio describer for a blind person watching a YouTube video. Describe ONLY what you see happening in this specific video frame. Be concise and focus on:
68
691. What ACTION is currently happening (people talking, moving, demonstrating something)
702. Any TEXT visible on screen (titles, captions, graphics)
713. The main SUBJECT or focus of the frame
724. Any CHANGES in the scene (new person, location change, object being shown)
73
74Keep it under ${CONFIG.maxDescriptionLength} words. Be direct and specific about what's ACTUALLY in the frame right now.
75
76Analyze this video frame: ${frameDataUrl}`;
77
78            const description = await RM.aiCall(prompt);
79            console.log('AI analysis complete:', description);
80            return description;
81        } catch (error) {
82            console.error('Error analyzing frame with AI:', error);
83            return null;
84        }
85    }
86
87    // Speak description using Web Speech API
88    function speakDescription(text) {
89        if (!text || text === lastDescription) {
90            console.log('Skipping duplicate or empty description');
91            return;
92        }
93
94        // Add to queue
95        descriptionQueue.push(text);
96        lastDescription = text;
97        
98        // Process queue if not already speaking
99        if (!isSpeaking) {
100            processDescriptionQueue();
101        }
102    }
103
104    // Process description queue
105    function processDescriptionQueue() {
106        if (descriptionQueue.length === 0) {
107            isSpeaking = false;
108            return;
109        }
110
111        isSpeaking = true;
112        const text = descriptionQueue.shift();
113
114        if ('speechSynthesis' in window) {
115            // Cancel any ongoing speech
116            window.speechSynthesis.cancel();
117
118            const utterance = new SpeechSynthesisUtterance(text);
119            utterance.rate = CONFIG.speechRate;
120            utterance.volume = CONFIG.speechVolume;
121            utterance.pitch = CONFIG.speechPitch;
122            utterance.lang = 'en-US';
123
124            utterance.onend = () => {
125                console.log('Finished speaking description');
126                // Process next in queue after a short delay
127                setTimeout(() => processDescriptionQueue(), 500);
128            };
129
130            utterance.onerror = (event) => {
131                console.error('Speech synthesis error:', event);
132                isSpeaking = false;
133                processDescriptionQueue();
134            };
135
136            console.log('Speaking:', text);
137            window.speechSynthesis.speak(utterance);
138            
139            // Update status in control panel
140            updateStatus('Describing...');
141        } else {
142            console.error('Speech synthesis not supported');
143            isSpeaking = false;
144        }
145    }
146
147    // Main capture and describe loop
148    async function captureAndDescribe() {
149        if (!isDescribing || !videoElement) {
150            return;
151        }
152
153        console.log('Starting frame capture and analysis...');
154        updateStatus('Capturing frame...');
155
156        const frameData = captureVideoFrame(videoElement);
157        if (!frameData) {
158            console.error('Failed to capture frame');
159            updateStatus('Error: Failed to capture frame');
160            return;
161        }
162
163        updateStatus('Analyzing with AI...');
164        const description = await analyzeFrame(frameData);
165        
166        if (description) {
167            speakDescription(description);
168        } else {
169            updateStatus('Error: AI analysis failed');
170        }
171    }
172
173    // Start describing
174    async function startDescribing() {
175        if (isDescribing) {
176            console.log('Already describing');
177            return;
178        }
179
180        videoElement = document.querySelector('video.html5-main-video');
181        if (!videoElement) {
182            alert('No video found on this page. Please navigate to a YouTube video or live stream.');
183            return;
184        }
185
186        console.log('Starting audio descriptions...');
187        isDescribing = true;
188        updateControlPanel();
189        
190        // Initial description
191        await captureAndDescribe();
192        
193        // Set up interval for continuous descriptions
194        captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
195        
196        updateStatus('Active - Describing every ' + (CONFIG.captureInterval / 1000) + ' seconds');
197    }
198
199    // Stop describing
200    function stopDescribing() {
201        console.log('Stopping audio descriptions...');
202        isDescribing = false;
203        
204        if (captureIntervalId) {
205            clearInterval(captureIntervalId);
206            captureIntervalId = null;
207        }
208
209        // Stop any ongoing speech
210        if ('speechSynthesis' in window) {
211            window.speechSynthesis.cancel();
212        }
213        
214        // Clear queue
215        descriptionQueue = [];
216        isSpeaking = false;
217        lastDescription = '';
218        
219        updateControlPanel();
220        updateStatus('Stopped');
221    }
222
223    // Update status display
224    function updateStatus(message) {
225        const statusElement = document.getElementById('audio-describer-status');
226        if (statusElement) {
227            statusElement.textContent = message;
228        }
229    }
230
    // Create control panel UI: a fixed, draggable overlay with a start/stop
    // toggle, an interval slider (5–15 s), and a speech-speed slider
    // (0.5x–2x). Safe to call repeatedly — any existing panel is replaced.
    function createControlPanel() {
        // Remove existing panel if any
        const existing = document.getElementById('audio-describer-panel');
        if (existing) {
            existing.remove();
        }

        const panel = document.createElement('div');
        panel.id = 'audio-describer-panel';
        panel.style.cssText = `
            position: fixed;
            top: 80px;
            right: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 12px;
            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
            z-index: 10000;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            min-width: 280px;
            backdrop-filter: blur(10px);
            cursor: move;
        `;

        // Static markup; the only interpolations are numeric CONFIG values,
        // so this innerHTML assignment carries no untrusted content.
        panel.innerHTML = `
            <div style="margin-bottom: 15px;">
                <h3 style="margin: 0 0 5px 0; font-size: 16px; font-weight: 600; cursor: move;" id="audio-describer-header">🎙️ Audio Describer (Drag to Move)</h3>
                <div id="audio-describer-status" style="font-size: 12px; opacity: 0.9; margin-top: 5px;">Ready</div>
            </div>
            
            <button id="audio-describer-toggle" style="
                width: 100%;
                padding: 12px;
                background: white;
                color: #667eea;
                border: none;
                border-radius: 8px;
                font-size: 14px;
                font-weight: 600;
                cursor: pointer;
                margin-bottom: 12px;
                transition: all 0.3s ease;
            ">Start Describing</button>
            
            <div style="margin-top: 15px; padding-top: 15px; border-top: 1px solid rgba(255,255,255,0.2);">
                <label style="display: block; font-size: 12px; margin-bottom: 8px; opacity: 0.9;">
                    Description Interval: <span id="interval-value">${CONFIG.captureInterval / 1000}s</span>
                </label>
                <input type="range" id="interval-slider" min="5" max="15" value="${CONFIG.captureInterval / 1000}" 
                    style="width: 100%; cursor: pointer;">
                
                <label style="display: block; font-size: 12px; margin: 12px 0 8px 0; opacity: 0.9;">
                    Speech Speed: <span id="speed-value">${CONFIG.speechRate}x</span>
                </label>
                <input type="range" id="speed-slider" min="0.5" max="2" step="0.1" value="${CONFIG.speechRate}" 
                    style="width: 100%; cursor: pointer;">
            </div>
            
            <div style="margin-top: 15px; font-size: 11px; opacity: 0.7; line-height: 1.4;">
                This extension captures video frames and provides AI-powered audio descriptions for accessibility.
            </div>
        `;

        document.body.appendChild(panel);

        // Make panel draggable
        makeDraggable(panel);

        // Add event listeners
        const toggleButton = document.getElementById('audio-describer-toggle');
        toggleButton.addEventListener('click', () => {
            if (isDescribing) {
                stopDescribing();
            } else {
                startDescribing();
            }
        });

        // Interval slider: updates CONFIG.captureInterval live and, if the
        // describe loop is running, restarts it so the new cadence takes
        // effect immediately.
        const intervalSlider = document.getElementById('interval-slider');
        const intervalValue = document.getElementById('interval-value');
        intervalSlider.addEventListener('input', (e) => {
            const value = parseInt(e.target.value);
            CONFIG.captureInterval = value * 1000;
            intervalValue.textContent = value + 's';
            
            // Restart interval if currently describing
            if (isDescribing && captureIntervalId) {
                clearInterval(captureIntervalId);
                captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
                updateStatus('Active - Describing every ' + value + ' seconds');
            }
        });

        // Speed slider: takes effect on the next utterance (rate is read when
        // each SpeechSynthesisUtterance is created).
        const speedSlider = document.getElementById('speed-slider');
        const speedValue = document.getElementById('speed-value');
        speedSlider.addEventListener('input', (e) => {
            const value = parseFloat(e.target.value);
            CONFIG.speechRate = value;
            speedValue.textContent = value.toFixed(1) + 'x';
        });

        console.log('Control panel created');
    }
338
339    // Make element draggable
340    function makeDraggable(element) {
341        let isDragging = false;
342        let currentX;
343        let currentY;
344        let initialX;
345        let initialY;
346        let xOffset = 0;
347        let yOffset = 0;
348
349        const header = element.querySelector('#audio-describer-header');
350        
351        header.addEventListener('mousedown', dragStart);
352        document.addEventListener('mousemove', drag);
353        document.addEventListener('mouseup', dragEnd);
354
355        function dragStart(e) {
356            // Don't drag if clicking on buttons or inputs
357            if (e.target.tagName === 'BUTTON' || e.target.tagName === 'INPUT') {
358                return;
359            }
360
361            initialX = e.clientX - xOffset;
362            initialY = e.clientY - yOffset;
363
364            if (e.target === header || e.target.parentElement === header) {
365                isDragging = true;
366            }
367        }
368
369        function drag(e) {
370            if (isDragging) {
371                e.preventDefault();
372                
373                currentX = e.clientX - initialX;
374                currentY = e.clientY - initialY;
375
376                xOffset = currentX;
377                yOffset = currentY;
378
379                setTranslate(currentX, currentY, element);
380            }
381        }
382
383        function dragEnd() {
384            initialX = currentX;
385            initialY = currentY;
386            isDragging = false;
387        }
388
389        function setTranslate(xPos, yPos, el) {
390            el.style.transform = `translate3d(${xPos}px, ${yPos}px, 0)`;
391        }
392    }
393
394    // Update control panel state
395    function updateControlPanel() {
396        const toggleButton = document.getElementById('audio-describer-toggle');
397        if (toggleButton) {
398            if (isDescribing) {
399                toggleButton.textContent = 'Stop Describing';
400                toggleButton.style.background = '#ff4757';
401                toggleButton.style.color = 'white';
402            } else {
403                toggleButton.textContent = 'Start Describing';
404                toggleButton.style.background = 'white';
405                toggleButton.style.color = '#667eea';
406            }
407        }
408    }
409
410    // Detect navigation and reset video element
411    function handleNavigation() {
412        const newUrl = window.location.href;
413        
414        // Check if URL changed (navigation to new video)
415        if (newUrl !== currentVideoUrl) {
416            console.log('Navigation detected, URL changed from', currentVideoUrl, 'to', newUrl);
417            currentVideoUrl = newUrl;
418            
419            // If currently describing, stop and restart with new video
420            if (isDescribing) {
421                console.log('Restarting descriptions for new video...');
422                stopDescribing();
423                
424                // Wait for new video element to load
425                setTimeout(() => {
426                    videoElement = document.querySelector('video.html5-main-video');
427                    if (videoElement) {
428                        console.log('New video element found, restarting descriptions');
429                        startDescribing();
430                    } else {
431                        console.log('No video element found after navigation');
432                        updateStatus('No video found - navigate to a video');
433                    }
434                }, 2000);
435            } else {
436                // Just reset the video element reference
437                videoElement = null;
438            }
439        }
440    }
441
442    // Initialize extension
443    function init() {
444        console.log('YouTube Live Stream Audio Describer initialized');
445        
446        // Store initial URL
447        currentVideoUrl = window.location.href;
448        
449        // Wait for page to be ready
450        if (document.readyState === 'loading') {
451            document.addEventListener('DOMContentLoaded', createControlPanel);
452        } else {
453            createControlPanel();
454        }
455
456        // Re-create panel on navigation (YouTube is a SPA)
457        const observer = new MutationObserver(debounce(() => {
458            if (!document.getElementById('audio-describer-panel')) {
459                createControlPanel();
460            }
461        }, 1000));
462
463        observer.observe(document.body, {
464            childList: true,
465            subtree: true
466        });
467
468        // Detect URL changes for navigation (YouTube SPA)
469        let lastUrl = window.location.href;
470        new MutationObserver(() => {
471            const currentUrl = window.location.href;
472            if (currentUrl !== lastUrl) {
473                lastUrl = currentUrl;
474                handleNavigation();
475            }
476        }).observe(document.querySelector('title'), {
477            childList: true,
478            subtree: true
479        });
480
481        // Also listen to popstate for back/forward navigation
482        window.addEventListener('popstate', handleNavigation);
483
484        // Keyboard shortcut: Alt+D to toggle
485        document.addEventListener('keydown', (e) => {
486            if (e.altKey && e.key === 'd') {
487                e.preventDefault();
488                if (isDescribing) {
489                    stopDescribing();
490                } else {
491                    startDescribing();
492                }
493            }
494        });
495
496        console.log('Keyboard shortcut: Alt+D to toggle audio descriptions');
497    }
498
499    // Start the extension
500    init();
501})();