YouTube Live Stream Audio Describer

Real-time AI-powered audio descriptions for YouTube live streams to assist blind users

Size

17.7 KB

Version

1.1.3

Created

Mar 12, 2026

Updated

about 1 month ago

// ==UserScript==
// @name		YouTube Live Stream Audio Describer
// @description		Real-time AI-powered audio descriptions for YouTube live streams to assist blind users
// @version		1.1.3
// @match		https://*.youtube.com/*
// @icon		https://www.youtube.com/s/desktop/38f8912f/img/favicon_32x32.png
// ==/UserScript==
(function() {
    'use strict';

    // Tunable settings. captureInterval and speechRate are changed live by the
    // control-panel sliders (see createControlPanel), so this object is
    // intentionally mutable even though the binding is const.
    const CONFIG = {
        captureInterval: 8000, // ms between capture/describe cycles (slider: 5–15 s)
        maxDescriptionLength: 150, // word budget quoted in the AI prompt
        speechRate: 1.0,   // SpeechSynthesisUtterance.rate (slider: 0.5–2.0)
        speechVolume: 1.0, // SpeechSynthesisUtterance.volume
        speechPitch: 1.0   // SpeechSynthesisUtterance.pitch
    };

    // Shared mutable state for the describe loop and speech queue.
    let isDescribing = false;     // true while the describe loop is running
    let captureIntervalId = null; // setInterval handle for the loop, or null
    let lastDescription = '';     // last queued text, used to skip duplicates
    let videoElement = null;      // the page's <video> element being described
    let isSpeaking = false;       // true while the speech queue is being drained
    let descriptionQueue = [];    // descriptions waiting to be spoken
28    // Utility: Debounce function
29    function debounce(func, wait) {
30        let timeout;
31        return function executedFunction(...args) {
32            const later = () => {
33                clearTimeout(timeout);
34                func(...args);
35            };
36            clearTimeout(timeout);
37            timeout = setTimeout(later, wait);
38        };
39    }
40
41    // Capture video frame as base64 image
42    function captureVideoFrame(video) {
43        try {
44            const canvas = document.createElement('canvas');
45            canvas.width = video.videoWidth || 640;
46            canvas.height = video.videoHeight || 360;
47            
48            const ctx = canvas.getContext('2d');
49            ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
50            
51            // Convert to base64 with reduced quality for faster processing
52            const dataUrl = canvas.toDataURL('image/jpeg', 0.7);
53            console.log('Frame captured successfully, size:', canvas.width, 'x', canvas.height);
54            return dataUrl;
55        } catch (error) {
56            console.error('Error capturing video frame:', error);
57            return null;
58        }
59    }
60
    // Ask the AI for a short spoken description built from on-page context:
    // video title, channel, view count, live badge, playback time, current
    // caption text, and (for live streams) the last few chat messages.
    //
    // NOTE(review): `frameDataUrl` — the frame captured by captureVideoFrame —
    // is never used here. The prompt is text-only, so descriptions are
    // inferred from page metadata rather than the actual picture. Confirm
    // whether RM.aiCall supports image input and wire the frame through if so.
    //
    // @param {string} frameDataUrl - base64 JPEG data URL (currently unused)
    // @returns {Promise<string|null>} AI description, a metadata-based
    //   fallback on error, or null when no fallback is available
    async function analyzeFrame(frameDataUrl) {
        try {
            console.log('Analyzing video context...');
            
            // Get video and page context (selectors cover both old and new
            // YouTube watch-page layouts; all fall back to safe defaults).
            const videoTitle = document.querySelector('h1.ytd-video-primary-info-renderer, h1.style-scope.ytd-watch-metadata')?.textContent?.trim() || 'Unknown video';
            const channelName = document.querySelector('ytd-channel-name a, #channel-name a')?.textContent?.trim() || 'Unknown channel';
            const viewCount = document.querySelector('.view-count, .ytd-video-view-count-renderer')?.textContent?.trim() || '';
            const isLive = document.querySelector('.ytp-live-badge, .badge-style-type-live-now') !== null;
            
            // Current playback position, formatted "m:ss of m:ss"; empty when
            // duration is not a positive number.
            const currentTime = videoElement?.currentTime || 0;
            const duration = videoElement?.duration || 0;
            const timeInfo = duration > 0 ? `${Math.floor(currentTime / 60)}:${Math.floor(currentTime % 60).toString().padStart(2, '0')} of ${Math.floor(duration / 60)}:${Math.floor(duration % 60).toString().padStart(2, '0')}` : '';
            
            // Try to get captions/subtitles if available
            const captionText = document.querySelector('.ytp-caption-segment')?.textContent?.trim() || '';
            
            // For live streams, include the last three chat messages as
            // "author: message" pairs joined with '. '.
            let chatMessages = '';
            if (isLive) {
                const chatItems = Array.from(document.querySelectorAll('yt-live-chat-text-message-renderer')).slice(-3);
                if (chatItems.length > 0) {
                    chatMessages = chatItems.map(item => {
                        const author = item.querySelector('#author-name')?.textContent?.trim() || '';
                        const message = item.querySelector('#message')?.textContent?.trim() || '';
                        return `${author}: ${message}`;
                    }).join('. ');
                }
            }
            
            console.log('Video context:', { videoTitle, channelName, viewCount, isLive, timeInfo, captionText, chatMessages });
            
            // Assemble the text-only prompt; empty context lines collapse to
            // blank interpolations.
            let prompt = `You are providing real-time audio descriptions for a YouTube ${isLive ? 'live stream' : 'video'} to assist a blind viewer.

Video Title: "${videoTitle}"
Channel: "${channelName}"
${viewCount ? `Views: ${viewCount}` : ''}
Status: ${isLive ? 'LIVE STREAM' : 'Recorded video'}
${timeInfo ? `Time: ${timeInfo}` : ''}
${captionText ? `Current caption: "${captionText}"` : ''}
${chatMessages ? `Recent chat: ${chatMessages}` : ''}

Provide a brief, natural audio description (under ${CONFIG.maxDescriptionLength} words) that helps a blind person understand what's happening right now. ${isLive ? 'Focus on the live nature of the content and any chat activity.' : 'Focus on the current moment in the video.'} ${captionText ? 'Use the caption to provide context about what is being said or shown.' : ''} Make it conversational and helpful for text-to-speech. Vary your descriptions to keep them fresh and informative.`;

            console.log('Sending prompt to AI...');
            // RM.aiCall is an external helper (not defined in this file);
            // presumably provided by the userscript host environment — verify.
            const description = await RM.aiCall(prompt);
            console.log('AI analysis complete:', description);
            return description;
        } catch (error) {
            console.error('Error analyzing with AI:', error);
            console.error('Error details:', error.message, error.stack);
            
            // Provide a fallback description based on available metadata:
            // prefer the live caption text, then the video title, else null.
            const videoTitle = document.querySelector('h1.ytd-video-primary-info-renderer, h1.style-scope.ytd-watch-metadata')?.textContent?.trim();
            const captionText = document.querySelector('.ytp-caption-segment')?.textContent?.trim();
            
            if (captionText) {
                return `Caption: ${captionText}`;
            } else if (videoTitle) {
                return `Currently watching: ${videoTitle}`;
            }
            return null;
        }
    }
127
128    // Speak description using Web Speech API
129    function speakDescription(text) {
130        if (!text || text === lastDescription) {
131            console.log('Skipping duplicate or empty description');
132            return;
133        }
134
135        // Add to queue
136        descriptionQueue.push(text);
137        lastDescription = text;
138        
139        // Process queue if not already speaking
140        if (!isSpeaking) {
141            processDescriptionQueue();
142        }
143    }
144
145    // Process description queue
146    function processDescriptionQueue() {
147        if (descriptionQueue.length === 0) {
148            isSpeaking = false;
149            return;
150        }
151
152        isSpeaking = true;
153        const text = descriptionQueue.shift();
154
155        if ('speechSynthesis' in window) {
156            // Cancel any ongoing speech
157            window.speechSynthesis.cancel();
158
159            const utterance = new SpeechSynthesisUtterance(text);
160            utterance.rate = CONFIG.speechRate;
161            utterance.volume = CONFIG.speechVolume;
162            utterance.pitch = CONFIG.speechPitch;
163            utterance.lang = 'en-US';
164
165            utterance.onend = () => {
166                console.log('Finished speaking description');
167                // Process next in queue after a short delay
168                setTimeout(() => processDescriptionQueue(), 500);
169            };
170
171            utterance.onerror = (event) => {
172                console.error('Speech synthesis error:', event);
173                isSpeaking = false;
174                processDescriptionQueue();
175            };
176
177            console.log('Speaking:', text);
178            window.speechSynthesis.speak(utterance);
179            
180            // Update status in control panel
181            updateStatus('Describing...');
182        } else {
183            console.error('Speech synthesis not supported');
184            isSpeaking = false;
185        }
186    }
187
188    // Main capture and describe loop
189    async function captureAndDescribe() {
190        if (!isDescribing || !videoElement) {
191            return;
192        }
193
194        console.log('Starting frame capture and analysis...');
195        updateStatus('Capturing frame...');
196
197        const frameData = captureVideoFrame(videoElement);
198        if (!frameData) {
199            console.error('Failed to capture frame');
200            updateStatus('Error: Failed to capture frame');
201            return;
202        }
203
204        updateStatus('Analyzing with AI...');
205        const description = await analyzeFrame(frameData);
206        
207        if (description) {
208            speakDescription(description);
209        } else {
210            updateStatus('Error: AI analysis failed');
211        }
212    }
213
214    // Start describing
215    async function startDescribing() {
216        if (isDescribing) {
217            console.log('Already describing');
218            return;
219        }
220
221        videoElement = document.querySelector('video.html5-main-video');
222        if (!videoElement) {
223            alert('No video found on this page. Please navigate to a YouTube video or live stream.');
224            return;
225        }
226
227        console.log('Starting audio descriptions...');
228        isDescribing = true;
229        updateControlPanel();
230        
231        // Initial description
232        await captureAndDescribe();
233        
234        // Set up interval for continuous descriptions
235        captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
236        
237        updateStatus('Active - Describing every ' + (CONFIG.captureInterval / 1000) + ' seconds');
238    }
239
240    // Stop describing
241    function stopDescribing() {
242        console.log('Stopping audio descriptions...');
243        isDescribing = false;
244        
245        if (captureIntervalId) {
246            clearInterval(captureIntervalId);
247            captureIntervalId = null;
248        }
249
250        // Stop any ongoing speech
251        if ('speechSynthesis' in window) {
252            window.speechSynthesis.cancel();
253        }
254        
255        // Clear queue
256        descriptionQueue = [];
257        isSpeaking = false;
258        
259        updateControlPanel();
260        updateStatus('Stopped');
261    }
262
263    // Update status display
264    function updateStatus(message) {
265        const statusElement = document.getElementById('audio-describer-status');
266        if (statusElement) {
267            statusElement.textContent = message;
268        }
269    }
270
271    // Create control panel UI
272    function createControlPanel() {
273        // Remove existing panel if any
274        const existing = document.getElementById('audio-describer-panel');
275        if (existing) {
276            existing.remove();
277        }
278
279        const panel = document.createElement('div');
280        panel.id = 'audio-describer-panel';
281        panel.style.cssText = `
282            position: fixed;
283            top: 80px;
284            right: 20px;
285            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
286            color: white;
287            padding: 20px;
288            border-radius: 12px;
289            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
290            z-index: 10000;
291            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
292            min-width: 280px;
293            backdrop-filter: blur(10px);
294            cursor: move;
295        `;
296
297        panel.innerHTML = `
298            <div style="margin-bottom: 15px;">
299                <h3 style="margin: 0 0 5px 0; font-size: 16px; font-weight: 600;">🎙️ Audio Describer</h3>
300                <div id="audio-describer-status" style="font-size: 12px; opacity: 0.9; margin-top: 5px;">Ready</div>
301            </div>
302            
303            <button id="audio-describer-toggle" style="
304                width: 100%;
305                padding: 12px;
306                background: white;
307                color: #667eea;
308                border: none;
309                border-radius: 8px;
310                font-size: 14px;
311                font-weight: 600;
312                cursor: pointer;
313                margin-bottom: 12px;
314                transition: all 0.3s ease;
315            ">Start Describing</button>
316            
317            <div style="margin-top: 15px; padding-top: 15px; border-top: 1px solid rgba(255,255,255,0.2);">
318                <label style="display: block; font-size: 12px; margin-bottom: 8px; opacity: 0.9;">
319                    Description Interval: <span id="interval-value">${CONFIG.captureInterval / 1000}s</span>
320                </label>
321                <input type="range" id="interval-slider" min="5" max="15" value="${CONFIG.captureInterval / 1000}" 
322                    style="width: 100%; cursor: pointer;">
323                
324                <label style="display: block; font-size: 12px; margin: 12px 0 8px 0; opacity: 0.9;">
325                    Speech Speed: <span id="speed-value">${CONFIG.speechRate}x</span>
326                </label>
327                <input type="range" id="speed-slider" min="0.5" max="2" step="0.1" value="${CONFIG.speechRate}" 
328                    style="width: 100%; cursor: pointer;">
329            </div>
330            
331            <div style="margin-top: 15px; font-size: 11px; opacity: 0.7; line-height: 1.4;">
332                This extension captures video frames and provides AI-powered audio descriptions for accessibility.
333            </div>
334        `;
335
336        document.body.appendChild(panel);
337
338        // Make panel draggable
339        let isDragging = false;
340        let currentX;
341        let currentY;
342        let initialX;
343        let initialY;
344        let xOffset = 0;
345        let yOffset = 0;
346
347        panel.addEventListener('mousedown', dragStart);
348        document.addEventListener('mousemove', drag);
349        document.addEventListener('mouseup', dragEnd);
350
351        function dragStart(e) {
352            // Don't drag if clicking on interactive elements
353            if (e.target.tagName === 'BUTTON' || e.target.tagName === 'INPUT') {
354                return;
355            }
356
357            initialX = e.clientX - xOffset;
358            initialY = e.clientY - yOffset;
359
360            if (e.target === panel || e.target.closest('#audio-describer-panel')) {
361                isDragging = true;
362            }
363        }
364
365        function drag(e) {
366            if (isDragging) {
367                e.preventDefault();
368                
369                currentX = e.clientX - initialX;
370                currentY = e.clientY - initialY;
371
372                xOffset = currentX;
373                yOffset = currentY;
374
375                setTranslate(currentX, currentY, panel);
376            }
377        }
378
379        function dragEnd(e) {
380            initialX = currentX;
381            initialY = currentY;
382            isDragging = false;
383        }
384
385        function setTranslate(xPos, yPos, el) {
386            el.style.transform = `translate3d(${xPos}px, ${yPos}px, 0)`;
387        }
388
389        // Add event listeners
390        const toggleButton = document.getElementById('audio-describer-toggle');
391        toggleButton.addEventListener('click', () => {
392            if (isDescribing) {
393                stopDescribing();
394            } else {
395                startDescribing();
396            }
397        });
398
399        // Interval slider
400        const intervalSlider = document.getElementById('interval-slider');
401        const intervalValue = document.getElementById('interval-value');
402        intervalSlider.addEventListener('input', (e) => {
403            const value = parseInt(e.target.value);
404            CONFIG.captureInterval = value * 1000;
405            intervalValue.textContent = value + 's';
406            
407            // Restart interval if currently describing
408            if (isDescribing && captureIntervalId) {
409                clearInterval(captureIntervalId);
410                captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
411                updateStatus('Active - Describing every ' + value + ' seconds');
412            }
413        });
414
415        // Speed slider
416        const speedSlider = document.getElementById('speed-slider');
417        const speedValue = document.getElementById('speed-value');
418        speedSlider.addEventListener('input', (e) => {
419            const value = parseFloat(e.target.value);
420            CONFIG.speechRate = value;
421            speedValue.textContent = value.toFixed(1) + 'x';
422        });
423
424        console.log('Control panel created');
425    }
426
427    // Update control panel state
428    function updateControlPanel() {
429        const toggleButton = document.getElementById('audio-describer-toggle');
430        if (toggleButton) {
431            if (isDescribing) {
432                toggleButton.textContent = 'Stop Describing';
433                toggleButton.style.background = '#ff4757';
434                toggleButton.style.color = 'white';
435            } else {
436                toggleButton.textContent = 'Start Describing';
437                toggleButton.style.background = 'white';
438                toggleButton.style.color = '#667eea';
439            }
440        }
441    }
442
443    // Initialize extension
444    function init() {
445        console.log('YouTube Live Stream Audio Describer initialized');
446        
447        // Wait for page to be ready
448        if (document.readyState === 'loading') {
449            document.addEventListener('DOMContentLoaded', createControlPanel);
450        } else {
451            createControlPanel();
452        }
453
454        // Re-create panel on navigation (YouTube is a SPA)
455        const observer = new MutationObserver(debounce(() => {
456            if (!document.getElementById('audio-describer-panel')) {
457                createControlPanel();
458            }
459        }, 1000));
460
461        observer.observe(document.body, {
462            childList: true,
463            subtree: true
464        });
465
466        // Keyboard shortcut: Alt+D to toggle
467        document.addEventListener('keydown', (e) => {
468            if (e.altKey && e.key === 'd') {
469                e.preventDefault();
470                if (isDescribing) {
471                    stopDescribing();
472                } else {
473                    startDescribing();
474                }
475            }
476        });
477
478        console.log('Keyboard shortcut: Alt+D to toggle audio descriptions');
479    }
480
481    // Start the extension
482    init();
483})();