Real-time AI-powered audio descriptions for YouTube live streams to assist blind users
Size
17.0 KB
Version
1.1.2
Created
Mar 12, 2026
Updated
about 1 month ago
1// ==UserScript==
2// @name YouTube Live Stream Audio Describer
3// @description Real-time AI-powered audio descriptions for YouTube live streams to assist blind users
4// @version 1.1.2
5// @match https://*.youtube.com/*
6// @icon https://www.youtube.com/s/desktop/38f8912f/img/favicon_32x32.png
7// ==/UserScript==
(function() {
    'use strict';

    // Configuration — captureInterval and speechRate are mutated live by
    // the control-panel sliders in createControlPanel().
    const CONFIG = {
        captureInterval: 8000, // Capture frame every 8 seconds
        maxDescriptionLength: 150, // Word cap inserted into the AI prompt
        speechRate: 1.0,
        speechVolume: 1.0,
        speechPitch: 1.0
    };

    // State management (module-level, shared by all functions below)
    let isDescribing = false;      // true while the periodic describe loop runs
    let captureIntervalId = null;  // setInterval handle for the describe loop
    let lastDescription = '';      // last spoken text, used to drop duplicates
    let videoElement = null;       // the <video> currently being described
    let isSpeaking = false;        // true while the speech queue is draining
    let descriptionQueue = [];     // descriptions waiting to be spoken
    let currentVideoUrl = '';      // last seen URL, for SPA navigation detection
28
29 // Utility: Debounce function
30 function debounce(func, wait) {
31 let timeout;
32 return function executedFunction(...args) {
33 const later = () => {
34 clearTimeout(timeout);
35 func(...args);
36 };
37 clearTimeout(timeout);
38 timeout = setTimeout(later, wait);
39 };
40 }
41
42 // Capture video frame as base64 image
43 function captureVideoFrame(video) {
44 try {
45 const canvas = document.createElement('canvas');
46 canvas.width = video.videoWidth || 640;
47 canvas.height = video.videoHeight || 360;
48
49 const ctx = canvas.getContext('2d');
50 ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
51
52 // Convert to base64 with reduced quality for faster processing
53 const dataUrl = canvas.toDataURL('image/jpeg', 0.7);
54 console.log('Frame captured successfully, size:', canvas.width, 'x', canvas.height);
55 return dataUrl;
56 } catch (error) {
57 console.error('Error capturing video frame:', error);
58 return null;
59 }
60 }
61
    // Analyze frame with AI.
    // Embeds the captured frame (a JPEG data URL) in a text prompt and asks
    // the AI for a short description; returns the description string, or
    // null on failure.
    // NOTE(review): `RM` is an external global not defined in this file —
    // presumably injected by the userscript host, exposing an async
    // aiCall(prompt) -> Promise<string> API. Confirm against the host docs.
    async function analyzeFrame(frameDataUrl) {
        try {
            console.log('Sending frame to AI for analysis...');

            const prompt = `You are an audio describer for a blind person watching a YouTube video. Describe ONLY what you see happening in this specific video frame. Be concise and focus on:

1. What ACTION is currently happening (people talking, moving, demonstrating something)
2. Any TEXT visible on screen (titles, captions, graphics)
3. The main SUBJECT or focus of the frame
4. Any CHANGES in the scene (new person, location change, object being shown)

Keep it under ${CONFIG.maxDescriptionLength} words. Be direct and specific about what's ACTUALLY in the frame right now.

Analyze this video frame: ${frameDataUrl}`;

            const description = await RM.aiCall(prompt);
            console.log('AI analysis complete:', description);
            return description;
        } catch (error) {
            // Swallow and signal failure via null; the caller reports it
            // through the status panel.
            console.error('Error analyzing frame with AI:', error);
            return null;
        }
    }
86
87 // Speak description using Web Speech API
88 function speakDescription(text) {
89 if (!text || text === lastDescription) {
90 console.log('Skipping duplicate or empty description');
91 return;
92 }
93
94 // Add to queue
95 descriptionQueue.push(text);
96 lastDescription = text;
97
98 // Process queue if not already speaking
99 if (!isSpeaking) {
100 processDescriptionQueue();
101 }
102 }
103
    // Process description queue.
    // Pops the next queued description and speaks it via the Web Speech
    // API. The queue keeps draining from the utterance's onend/onerror
    // callbacks; `isSpeaking` stops speakDescription() from starting a
    // second concurrent drain.
    function processDescriptionQueue() {
        if (descriptionQueue.length === 0) {
            isSpeaking = false;
            return;
        }

        isSpeaking = true;
        const text = descriptionQueue.shift();

        if ('speechSynthesis' in window) {
            // Cancel any ongoing speech so the newest description is not
            // queued behind a stale one.
            window.speechSynthesis.cancel();

            const utterance = new SpeechSynthesisUtterance(text);
            utterance.rate = CONFIG.speechRate;
            utterance.volume = CONFIG.speechVolume;
            utterance.pitch = CONFIG.speechPitch;
            utterance.lang = 'en-US';

            utterance.onend = () => {
                console.log('Finished speaking description');
                // Process next in queue after a short delay
                setTimeout(() => processDescriptionQueue(), 500);
            };

            utterance.onerror = (event) => {
                console.error('Speech synthesis error:', event);
                isSpeaking = false;
                processDescriptionQueue();
            };

            console.log('Speaking:', text);
            window.speechSynthesis.speak(utterance);

            // Update status in control panel
            updateStatus('Describing...');
        } else {
            console.error('Speech synthesis not supported');
            isSpeaking = false;
        }
    }
146
147 // Main capture and describe loop
148 async function captureAndDescribe() {
149 if (!isDescribing || !videoElement) {
150 return;
151 }
152
153 console.log('Starting frame capture and analysis...');
154 updateStatus('Capturing frame...');
155
156 const frameData = captureVideoFrame(videoElement);
157 if (!frameData) {
158 console.error('Failed to capture frame');
159 updateStatus('Error: Failed to capture frame');
160 return;
161 }
162
163 updateStatus('Analyzing with AI...');
164 const description = await analyzeFrame(frameData);
165
166 if (description) {
167 speakDescription(description);
168 } else {
169 updateStatus('Error: AI analysis failed');
170 }
171 }
172
173 // Start describing
174 async function startDescribing() {
175 if (isDescribing) {
176 console.log('Already describing');
177 return;
178 }
179
180 videoElement = document.querySelector('video.html5-main-video');
181 if (!videoElement) {
182 alert('No video found on this page. Please navigate to a YouTube video or live stream.');
183 return;
184 }
185
186 console.log('Starting audio descriptions...');
187 isDescribing = true;
188 updateControlPanel();
189
190 // Initial description
191 await captureAndDescribe();
192
193 // Set up interval for continuous descriptions
194 captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
195
196 updateStatus('Active - Describing every ' + (CONFIG.captureInterval / 1000) + ' seconds');
197 }
198
199 // Stop describing
200 function stopDescribing() {
201 console.log('Stopping audio descriptions...');
202 isDescribing = false;
203
204 if (captureIntervalId) {
205 clearInterval(captureIntervalId);
206 captureIntervalId = null;
207 }
208
209 // Stop any ongoing speech
210 if ('speechSynthesis' in window) {
211 window.speechSynthesis.cancel();
212 }
213
214 // Clear queue
215 descriptionQueue = [];
216 isSpeaking = false;
217 lastDescription = '';
218
219 updateControlPanel();
220 updateStatus('Stopped');
221 }
222
223 // Update status display
224 function updateStatus(message) {
225 const statusElement = document.getElementById('audio-describer-status');
226 if (statusElement) {
227 statusElement.textContent = message;
228 }
229 }
230
231 // Create control panel UI
232 function createControlPanel() {
233 // Remove existing panel if any
234 const existing = document.getElementById('audio-describer-panel');
235 if (existing) {
236 existing.remove();
237 }
238
239 const panel = document.createElement('div');
240 panel.id = 'audio-describer-panel';
241 panel.style.cssText = `
242 position: fixed;
243 top: 80px;
244 right: 20px;
245 background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
246 color: white;
247 padding: 20px;
248 border-radius: 12px;
249 box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
250 z-index: 10000;
251 font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
252 min-width: 280px;
253 backdrop-filter: blur(10px);
254 cursor: move;
255 `;
256
257 panel.innerHTML = `
258 <div style="margin-bottom: 15px;">
259 <h3 style="margin: 0 0 5px 0; font-size: 16px; font-weight: 600; cursor: move;" id="audio-describer-header">🎙️ Audio Describer (Drag to Move)</h3>
260 <div id="audio-describer-status" style="font-size: 12px; opacity: 0.9; margin-top: 5px;">Ready</div>
261 </div>
262
263 <button id="audio-describer-toggle" style="
264 width: 100%;
265 padding: 12px;
266 background: white;
267 color: #667eea;
268 border: none;
269 border-radius: 8px;
270 font-size: 14px;
271 font-weight: 600;
272 cursor: pointer;
273 margin-bottom: 12px;
274 transition: all 0.3s ease;
275 ">Start Describing</button>
276
277 <div style="margin-top: 15px; padding-top: 15px; border-top: 1px solid rgba(255,255,255,0.2);">
278 <label style="display: block; font-size: 12px; margin-bottom: 8px; opacity: 0.9;">
279 Description Interval: <span id="interval-value">${CONFIG.captureInterval / 1000}s</span>
280 </label>
281 <input type="range" id="interval-slider" min="5" max="15" value="${CONFIG.captureInterval / 1000}"
282 style="width: 100%; cursor: pointer;">
283
284 <label style="display: block; font-size: 12px; margin: 12px 0 8px 0; opacity: 0.9;">
285 Speech Speed: <span id="speed-value">${CONFIG.speechRate}x</span>
286 </label>
287 <input type="range" id="speed-slider" min="0.5" max="2" step="0.1" value="${CONFIG.speechRate}"
288 style="width: 100%; cursor: pointer;">
289 </div>
290
291 <div style="margin-top: 15px; font-size: 11px; opacity: 0.7; line-height: 1.4;">
292 This extension captures video frames and provides AI-powered audio descriptions for accessibility.
293 </div>
294 `;
295
296 document.body.appendChild(panel);
297
298 // Make panel draggable
299 makeDraggable(panel);
300
301 // Add event listeners
302 const toggleButton = document.getElementById('audio-describer-toggle');
303 toggleButton.addEventListener('click', () => {
304 if (isDescribing) {
305 stopDescribing();
306 } else {
307 startDescribing();
308 }
309 });
310
311 // Interval slider
312 const intervalSlider = document.getElementById('interval-slider');
313 const intervalValue = document.getElementById('interval-value');
314 intervalSlider.addEventListener('input', (e) => {
315 const value = parseInt(e.target.value);
316 CONFIG.captureInterval = value * 1000;
317 intervalValue.textContent = value + 's';
318
319 // Restart interval if currently describing
320 if (isDescribing && captureIntervalId) {
321 clearInterval(captureIntervalId);
322 captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
323 updateStatus('Active - Describing every ' + value + ' seconds');
324 }
325 });
326
327 // Speed slider
328 const speedSlider = document.getElementById('speed-slider');
329 const speedValue = document.getElementById('speed-value');
330 speedSlider.addEventListener('input', (e) => {
331 const value = parseFloat(e.target.value);
332 CONFIG.speechRate = value;
333 speedValue.textContent = value.toFixed(1) + 'x';
334 });
335
336 console.log('Control panel created');
337 }
338
339 // Make element draggable
340 function makeDraggable(element) {
341 let isDragging = false;
342 let currentX;
343 let currentY;
344 let initialX;
345 let initialY;
346 let xOffset = 0;
347 let yOffset = 0;
348
349 const header = element.querySelector('#audio-describer-header');
350
351 header.addEventListener('mousedown', dragStart);
352 document.addEventListener('mousemove', drag);
353 document.addEventListener('mouseup', dragEnd);
354
355 function dragStart(e) {
356 // Don't drag if clicking on buttons or inputs
357 if (e.target.tagName === 'BUTTON' || e.target.tagName === 'INPUT') {
358 return;
359 }
360
361 initialX = e.clientX - xOffset;
362 initialY = e.clientY - yOffset;
363
364 if (e.target === header || e.target.parentElement === header) {
365 isDragging = true;
366 }
367 }
368
369 function drag(e) {
370 if (isDragging) {
371 e.preventDefault();
372
373 currentX = e.clientX - initialX;
374 currentY = e.clientY - initialY;
375
376 xOffset = currentX;
377 yOffset = currentY;
378
379 setTranslate(currentX, currentY, element);
380 }
381 }
382
383 function dragEnd() {
384 initialX = currentX;
385 initialY = currentY;
386 isDragging = false;
387 }
388
389 function setTranslate(xPos, yPos, el) {
390 el.style.transform = `translate3d(${xPos}px, ${yPos}px, 0)`;
391 }
392 }
393
394 // Update control panel state
395 function updateControlPanel() {
396 const toggleButton = document.getElementById('audio-describer-toggle');
397 if (toggleButton) {
398 if (isDescribing) {
399 toggleButton.textContent = 'Stop Describing';
400 toggleButton.style.background = '#ff4757';
401 toggleButton.style.color = 'white';
402 } else {
403 toggleButton.textContent = 'Start Describing';
404 toggleButton.style.background = 'white';
405 toggleButton.style.color = '#667eea';
406 }
407 }
408 }
409
410 // Detect navigation and reset video element
411 function handleNavigation() {
412 const newUrl = window.location.href;
413
414 // Check if URL changed (navigation to new video)
415 if (newUrl !== currentVideoUrl) {
416 console.log('Navigation detected, URL changed from', currentVideoUrl, 'to', newUrl);
417 currentVideoUrl = newUrl;
418
419 // If currently describing, stop and restart with new video
420 if (isDescribing) {
421 console.log('Restarting descriptions for new video...');
422 stopDescribing();
423
424 // Wait for new video element to load
425 setTimeout(() => {
426 videoElement = document.querySelector('video.html5-main-video');
427 if (videoElement) {
428 console.log('New video element found, restarting descriptions');
429 startDescribing();
430 } else {
431 console.log('No video element found after navigation');
432 updateStatus('No video found - navigate to a video');
433 }
434 }, 2000);
435 } else {
436 // Just reset the video element reference
437 videoElement = null;
438 }
439 }
440 }
441
442 // Initialize extension
443 function init() {
444 console.log('YouTube Live Stream Audio Describer initialized');
445
446 // Store initial URL
447 currentVideoUrl = window.location.href;
448
449 // Wait for page to be ready
450 if (document.readyState === 'loading') {
451 document.addEventListener('DOMContentLoaded', createControlPanel);
452 } else {
453 createControlPanel();
454 }
455
456 // Re-create panel on navigation (YouTube is a SPA)
457 const observer = new MutationObserver(debounce(() => {
458 if (!document.getElementById('audio-describer-panel')) {
459 createControlPanel();
460 }
461 }, 1000));
462
463 observer.observe(document.body, {
464 childList: true,
465 subtree: true
466 });
467
468 // Detect URL changes for navigation (YouTube SPA)
469 let lastUrl = window.location.href;
470 new MutationObserver(() => {
471 const currentUrl = window.location.href;
472 if (currentUrl !== lastUrl) {
473 lastUrl = currentUrl;
474 handleNavigation();
475 }
476 }).observe(document.querySelector('title'), {
477 childList: true,
478 subtree: true
479 });
480
481 // Also listen to popstate for back/forward navigation
482 window.addEventListener('popstate', handleNavigation);
483
484 // Keyboard shortcut: Alt+D to toggle
485 document.addEventListener('keydown', (e) => {
486 if (e.altKey && e.key === 'd') {
487 e.preventDefault();
488 if (isDescribing) {
489 stopDescribing();
490 } else {
491 startDescribing();
492 }
493 }
494 });
495
496 console.log('Keyboard shortcut: Alt+D to toggle audio descriptions');
497 }
498
    // Start the extension immediately; the userscript @match rule limits
    // execution to youtube.com pages.
    init();
})();