Real-time AI-powered audio descriptions for YouTube live streams to assist blind users
Size
17.7 KB
Version
1.1.3
Created
Mar 12, 2026
Updated
about 1 month ago
1// ==UserScript==
2// @name YouTube Live Stream Audio Describer
3// @description Real-time AI-powered audio descriptions for YouTube live streams to assist blind users
4// @version 1.1.3
5// @match https://*.youtube.com/*
6// @icon https://www.youtube.com/s/desktop/38f8912f/img/favicon_32x32.png
7// ==/UserScript==
8(function() {
9 'use strict';
10
    // Configuration — mutable at runtime: the panel sliders in
    // createControlPanel() write captureInterval and speechRate directly.
    const CONFIG = {
        captureInterval: 8000, // Capture frame every 8 seconds
        maxDescriptionLength: 150, // Word cap passed into the AI prompt
        speechRate: 1.0,   // SpeechSynthesisUtterance.rate (slider: 0.5–2.0)
        speechVolume: 1.0, // SpeechSynthesisUtterance.volume
        speechPitch: 1.0   // SpeechSynthesisUtterance.pitch
    };

    // State management — module-level mutable state shared by all functions below.
    let isDescribing = false;      // true while the describe loop is active
    let captureIntervalId = null;  // setInterval handle for captureAndDescribe
    let lastDescription = '';      // last spoken text; used to skip duplicates
    let videoElement = null;       // <video.html5-main-video> found on start
    let isSpeaking = false;        // true while the speech queue is draining
    let descriptionQueue = [];     // FIFO of descriptions awaiting speech
27
28 // Utility: Debounce function
29 function debounce(func, wait) {
30 let timeout;
31 return function executedFunction(...args) {
32 const later = () => {
33 clearTimeout(timeout);
34 func(...args);
35 };
36 clearTimeout(timeout);
37 timeout = setTimeout(later, wait);
38 };
39 }
40
41 // Capture video frame as base64 image
42 function captureVideoFrame(video) {
43 try {
44 const canvas = document.createElement('canvas');
45 canvas.width = video.videoWidth || 640;
46 canvas.height = video.videoHeight || 360;
47
48 const ctx = canvas.getContext('2d');
49 ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
50
51 // Convert to base64 with reduced quality for faster processing
52 const dataUrl = canvas.toDataURL('image/jpeg', 0.7);
53 console.log('Frame captured successfully, size:', canvas.width, 'x', canvas.height);
54 return dataUrl;
55 } catch (error) {
56 console.error('Error capturing video frame:', error);
57 return null;
58 }
59 }
60
    // Analyze frame with AI.
    //
    // Builds a text prompt from on-page context (title, channel, live badge,
    // playback time, visible captions, recent live-chat messages) and sends
    // it to RM.aiCall, returning the generated description string, or a
    // metadata-based fallback string / null on failure.
    //
    // NOTE(review): the `frameDataUrl` parameter is never used — despite the
    // name, no image data is sent to the AI; the description is derived
    // entirely from page metadata. Confirm whether RM.aiCall can accept the
    // frame, or rename/drop the parameter.
    // NOTE(review): `RM` is not defined in this file — presumably an API
    // injected by the userscript host environment; verify it is in scope.
    async function analyzeFrame(frameDataUrl) {
        try {
            console.log('Analyzing video context...');

            // Get video and page context (selectors cover both old and new
            // YouTube layouts; each falls back to a placeholder string).
            const videoTitle = document.querySelector('h1.ytd-video-primary-info-renderer, h1.style-scope.ytd-watch-metadata')?.textContent?.trim() || 'Unknown video';
            const channelName = document.querySelector('ytd-channel-name a, #channel-name a')?.textContent?.trim() || 'Unknown channel';
            const viewCount = document.querySelector('.view-count, .ytd-video-view-count-renderer')?.textContent?.trim() || '';
            const isLive = document.querySelector('.ytp-live-badge, .badge-style-type-live-now') !== null;

            // Get current video time and duration for context.
            // timeInfo is "M:SS of M:SS", or '' for live streams (duration 0/NaN).
            const currentTime = videoElement?.currentTime || 0;
            const duration = videoElement?.duration || 0;
            const timeInfo = duration > 0 ? `${Math.floor(currentTime / 60)}:${Math.floor(currentTime % 60).toString().padStart(2, '0')} of ${Math.floor(duration / 60)}:${Math.floor(duration % 60).toString().padStart(2, '0')}` : '';

            // Try to get captions/subtitles if available (only present when
            // the viewer has captions switched on in the player).
            const captionText = document.querySelector('.ytp-caption-segment')?.textContent?.trim() || '';

            // Try to get live chat messages for live streams (last 3 only,
            // joined as "author: message" sentences).
            let chatMessages = '';
            if (isLive) {
                const chatItems = Array.from(document.querySelectorAll('yt-live-chat-text-message-renderer')).slice(-3);
                if (chatItems.length > 0) {
                    chatMessages = chatItems.map(item => {
                        const author = item.querySelector('#author-name')?.textContent?.trim() || '';
                        const message = item.querySelector('#message')?.textContent?.trim() || '';
                        return `${author}: ${message}`;
                    }).join('. ');
                }
            }

            console.log('Video context:', { videoTitle, channelName, viewCount, isLive, timeInfo, captionText, chatMessages });

            // Prompt lines are intentionally flush-left inside the template
            // literal so the AI receives clean, unindented text.
            let prompt = `You are providing real-time audio descriptions for a YouTube ${isLive ? 'live stream' : 'video'} to assist a blind viewer.

Video Title: "${videoTitle}"
Channel: "${channelName}"
${viewCount ? `Views: ${viewCount}` : ''}
Status: ${isLive ? 'LIVE STREAM' : 'Recorded video'}
${timeInfo ? `Time: ${timeInfo}` : ''}
${captionText ? `Current caption: "${captionText}"` : ''}
${chatMessages ? `Recent chat: ${chatMessages}` : ''}

Provide a brief, natural audio description (under ${CONFIG.maxDescriptionLength} words) that helps a blind person understand what's happening right now. ${isLive ? 'Focus on the live nature of the content and any chat activity.' : 'Focus on the current moment in the video.'} ${captionText ? 'Use the caption to provide context about what is being said or shown.' : ''} Make it conversational and helpful for text-to-speech. Vary your descriptions to keep them fresh and informative.`;

            console.log('Sending prompt to AI...');
            const description = await RM.aiCall(prompt);
            console.log('AI analysis complete:', description);
            return description;
        } catch (error) {
            console.error('Error analyzing with AI:', error);
            console.error('Error details:', error.message, error.stack);

            // Provide a fallback description based on available metadata:
            // prefer the live caption, then the video title, else null.
            const videoTitle = document.querySelector('h1.ytd-video-primary-info-renderer, h1.style-scope.ytd-watch-metadata')?.textContent?.trim();
            const captionText = document.querySelector('.ytp-caption-segment')?.textContent?.trim();

            if (captionText) {
                return `Caption: ${captionText}`;
            } else if (videoTitle) {
                return `Currently watching: ${videoTitle}`;
            }
            return null;
        }
    }
127
128 // Speak description using Web Speech API
129 function speakDescription(text) {
130 if (!text || text === lastDescription) {
131 console.log('Skipping duplicate or empty description');
132 return;
133 }
134
135 // Add to queue
136 descriptionQueue.push(text);
137 lastDescription = text;
138
139 // Process queue if not already speaking
140 if (!isSpeaking) {
141 processDescriptionQueue();
142 }
143 }
144
    // Process description queue.
    //
    // Pops one description and speaks it via the Web Speech API. The function
    // re-invokes itself from the utterance's onend/onerror callbacks, so the
    // queue drains one item at a time; `isSpeaking` gates re-entry from
    // speakDescription() while a drain is in progress.
    function processDescriptionQueue() {
        // Queue empty: release the gate so the next speakDescription() call
        // restarts the drain.
        if (descriptionQueue.length === 0) {
            isSpeaking = false;
            return;
        }

        isSpeaking = true;
        const text = descriptionQueue.shift();

        if ('speechSynthesis' in window) {
            // Cancel any ongoing speech so the new description starts cleanly.
            window.speechSynthesis.cancel();

            const utterance = new SpeechSynthesisUtterance(text);
            utterance.rate = CONFIG.speechRate;
            utterance.volume = CONFIG.speechVolume;
            utterance.pitch = CONFIG.speechPitch;
            utterance.lang = 'en-US';

            utterance.onend = () => {
                console.log('Finished speaking description');
                // Process next in queue after a short delay (gives the
                // listener a brief pause between descriptions).
                setTimeout(() => processDescriptionQueue(), 500);
            };

            utterance.onerror = (event) => {
                console.error('Speech synthesis error:', event);
                // Clear the gate, then immediately try the next item.
                isSpeaking = false;
                processDescriptionQueue();
            };

            console.log('Speaking:', text);
            window.speechSynthesis.speak(utterance);

            // Update status in control panel
            updateStatus('Describing...');
        } else {
            console.error('Speech synthesis not supported');
            isSpeaking = false;
        }
    }
187
188 // Main capture and describe loop
189 async function captureAndDescribe() {
190 if (!isDescribing || !videoElement) {
191 return;
192 }
193
194 console.log('Starting frame capture and analysis...');
195 updateStatus('Capturing frame...');
196
197 const frameData = captureVideoFrame(videoElement);
198 if (!frameData) {
199 console.error('Failed to capture frame');
200 updateStatus('Error: Failed to capture frame');
201 return;
202 }
203
204 updateStatus('Analyzing with AI...');
205 const description = await analyzeFrame(frameData);
206
207 if (description) {
208 speakDescription(description);
209 } else {
210 updateStatus('Error: AI analysis failed');
211 }
212 }
213
214 // Start describing
215 async function startDescribing() {
216 if (isDescribing) {
217 console.log('Already describing');
218 return;
219 }
220
221 videoElement = document.querySelector('video.html5-main-video');
222 if (!videoElement) {
223 alert('No video found on this page. Please navigate to a YouTube video or live stream.');
224 return;
225 }
226
227 console.log('Starting audio descriptions...');
228 isDescribing = true;
229 updateControlPanel();
230
231 // Initial description
232 await captureAndDescribe();
233
234 // Set up interval for continuous descriptions
235 captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
236
237 updateStatus('Active - Describing every ' + (CONFIG.captureInterval / 1000) + ' seconds');
238 }
239
240 // Stop describing
241 function stopDescribing() {
242 console.log('Stopping audio descriptions...');
243 isDescribing = false;
244
245 if (captureIntervalId) {
246 clearInterval(captureIntervalId);
247 captureIntervalId = null;
248 }
249
250 // Stop any ongoing speech
251 if ('speechSynthesis' in window) {
252 window.speechSynthesis.cancel();
253 }
254
255 // Clear queue
256 descriptionQueue = [];
257 isSpeaking = false;
258
259 updateControlPanel();
260 updateStatus('Stopped');
261 }
262
263 // Update status display
264 function updateStatus(message) {
265 const statusElement = document.getElementById('audio-describer-status');
266 if (statusElement) {
267 statusElement.textContent = message;
268 }
269 }
270
271 // Create control panel UI
272 function createControlPanel() {
273 // Remove existing panel if any
274 const existing = document.getElementById('audio-describer-panel');
275 if (existing) {
276 existing.remove();
277 }
278
279 const panel = document.createElement('div');
280 panel.id = 'audio-describer-panel';
281 panel.style.cssText = `
282 position: fixed;
283 top: 80px;
284 right: 20px;
285 background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
286 color: white;
287 padding: 20px;
288 border-radius: 12px;
289 box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
290 z-index: 10000;
291 font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
292 min-width: 280px;
293 backdrop-filter: blur(10px);
294 cursor: move;
295 `;
296
297 panel.innerHTML = `
298 <div style="margin-bottom: 15px;">
299 <h3 style="margin: 0 0 5px 0; font-size: 16px; font-weight: 600;">🎙️ Audio Describer</h3>
300 <div id="audio-describer-status" style="font-size: 12px; opacity: 0.9; margin-top: 5px;">Ready</div>
301 </div>
302
303 <button id="audio-describer-toggle" style="
304 width: 100%;
305 padding: 12px;
306 background: white;
307 color: #667eea;
308 border: none;
309 border-radius: 8px;
310 font-size: 14px;
311 font-weight: 600;
312 cursor: pointer;
313 margin-bottom: 12px;
314 transition: all 0.3s ease;
315 ">Start Describing</button>
316
317 <div style="margin-top: 15px; padding-top: 15px; border-top: 1px solid rgba(255,255,255,0.2);">
318 <label style="display: block; font-size: 12px; margin-bottom: 8px; opacity: 0.9;">
319 Description Interval: <span id="interval-value">${CONFIG.captureInterval / 1000}s</span>
320 </label>
321 <input type="range" id="interval-slider" min="5" max="15" value="${CONFIG.captureInterval / 1000}"
322 style="width: 100%; cursor: pointer;">
323
324 <label style="display: block; font-size: 12px; margin: 12px 0 8px 0; opacity: 0.9;">
325 Speech Speed: <span id="speed-value">${CONFIG.speechRate}x</span>
326 </label>
327 <input type="range" id="speed-slider" min="0.5" max="2" step="0.1" value="${CONFIG.speechRate}"
328 style="width: 100%; cursor: pointer;">
329 </div>
330
331 <div style="margin-top: 15px; font-size: 11px; opacity: 0.7; line-height: 1.4;">
332 This extension captures video frames and provides AI-powered audio descriptions for accessibility.
333 </div>
334 `;
335
336 document.body.appendChild(panel);
337
338 // Make panel draggable
339 let isDragging = false;
340 let currentX;
341 let currentY;
342 let initialX;
343 let initialY;
344 let xOffset = 0;
345 let yOffset = 0;
346
347 panel.addEventListener('mousedown', dragStart);
348 document.addEventListener('mousemove', drag);
349 document.addEventListener('mouseup', dragEnd);
350
351 function dragStart(e) {
352 // Don't drag if clicking on interactive elements
353 if (e.target.tagName === 'BUTTON' || e.target.tagName === 'INPUT') {
354 return;
355 }
356
357 initialX = e.clientX - xOffset;
358 initialY = e.clientY - yOffset;
359
360 if (e.target === panel || e.target.closest('#audio-describer-panel')) {
361 isDragging = true;
362 }
363 }
364
365 function drag(e) {
366 if (isDragging) {
367 e.preventDefault();
368
369 currentX = e.clientX - initialX;
370 currentY = e.clientY - initialY;
371
372 xOffset = currentX;
373 yOffset = currentY;
374
375 setTranslate(currentX, currentY, panel);
376 }
377 }
378
379 function dragEnd(e) {
380 initialX = currentX;
381 initialY = currentY;
382 isDragging = false;
383 }
384
385 function setTranslate(xPos, yPos, el) {
386 el.style.transform = `translate3d(${xPos}px, ${yPos}px, 0)`;
387 }
388
389 // Add event listeners
390 const toggleButton = document.getElementById('audio-describer-toggle');
391 toggleButton.addEventListener('click', () => {
392 if (isDescribing) {
393 stopDescribing();
394 } else {
395 startDescribing();
396 }
397 });
398
399 // Interval slider
400 const intervalSlider = document.getElementById('interval-slider');
401 const intervalValue = document.getElementById('interval-value');
402 intervalSlider.addEventListener('input', (e) => {
403 const value = parseInt(e.target.value);
404 CONFIG.captureInterval = value * 1000;
405 intervalValue.textContent = value + 's';
406
407 // Restart interval if currently describing
408 if (isDescribing && captureIntervalId) {
409 clearInterval(captureIntervalId);
410 captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
411 updateStatus('Active - Describing every ' + value + ' seconds');
412 }
413 });
414
415 // Speed slider
416 const speedSlider = document.getElementById('speed-slider');
417 const speedValue = document.getElementById('speed-value');
418 speedSlider.addEventListener('input', (e) => {
419 const value = parseFloat(e.target.value);
420 CONFIG.speechRate = value;
421 speedValue.textContent = value.toFixed(1) + 'x';
422 });
423
424 console.log('Control panel created');
425 }
426
427 // Update control panel state
428 function updateControlPanel() {
429 const toggleButton = document.getElementById('audio-describer-toggle');
430 if (toggleButton) {
431 if (isDescribing) {
432 toggleButton.textContent = 'Stop Describing';
433 toggleButton.style.background = '#ff4757';
434 toggleButton.style.color = 'white';
435 } else {
436 toggleButton.textContent = 'Start Describing';
437 toggleButton.style.background = 'white';
438 toggleButton.style.color = '#667eea';
439 }
440 }
441 }
442
443 // Initialize extension
444 function init() {
445 console.log('YouTube Live Stream Audio Describer initialized');
446
447 // Wait for page to be ready
448 if (document.readyState === 'loading') {
449 document.addEventListener('DOMContentLoaded', createControlPanel);
450 } else {
451 createControlPanel();
452 }
453
454 // Re-create panel on navigation (YouTube is a SPA)
455 const observer = new MutationObserver(debounce(() => {
456 if (!document.getElementById('audio-describer-panel')) {
457 createControlPanel();
458 }
459 }, 1000));
460
461 observer.observe(document.body, {
462 childList: true,
463 subtree: true
464 });
465
466 // Keyboard shortcut: Alt+D to toggle
467 document.addEventListener('keydown', (e) => {
468 if (e.altKey && e.key === 'd') {
469 e.preventDefault();
470 if (isDescribing) {
471 stopDescribing();
472 } else {
473 startDescribing();
474 }
475 }
476 });
477
478 console.log('Keyboard shortcut: Alt+D to toggle audio descriptions');
479 }
480
    // Start the extension as soon as the userscript body is evaluated.
    init();
483})();