Real-time AI-powered audio descriptions for YouTube live streams to assist blind users
Size
13.3 KB
Version
1.0.1
Created
Mar 12, 2026
Updated
about 1 month ago
1// ==UserScript==
2// @name YouTube Live Stream Audio Describer
3// @description Real-time AI-powered audio descriptions for YouTube live streams to assist blind users
4// @version 1.0.1
5// @match https://*.youtube.com/*
6// @icon https://www.youtube.com/s/desktop/38f8912f/img/favicon_32x32.png
7// ==/UserScript==
(function() {
    'use strict';

    // Configuration: tunables for capture cadence and speech output.
    // Mutated at runtime by the control-panel sliders (interval and speed).
    const CONFIG = {
        captureInterval: 8000, // ms between frame captures (slider range: 5-15 s)
        maxDescriptionLength: 150, // soft word cap inserted into the AI prompt
        speechRate: 1.0,   // SpeechSynthesisUtterance.rate (slider range: 0.5-2)
        speechVolume: 1.0, // SpeechSynthesisUtterance.volume
        speechPitch: 1.0   // SpeechSynthesisUtterance.pitch
    };

    // State management: module-wide mutable state shared by the functions below.
    let isDescribing = false;     // true while the capture/describe loop is active
    let captureIntervalId = null; // setInterval handle for the capture loop
    let lastDescription = '';     // last queued text, used to suppress duplicates
    let videoElement = null;      // the <video> currently being described
    let controlPanel = null;      // root element of the floating control panel
    let isSpeaking = false;       // true while the speech queue is being drained
    let descriptionQueue = [];    // FIFO of descriptions waiting to be spoken
28
29 // Utility: Debounce function
30 function debounce(func, wait) {
31 let timeout;
32 return function executedFunction(...args) {
33 const later = () => {
34 clearTimeout(timeout);
35 func(...args);
36 };
37 clearTimeout(timeout);
38 timeout = setTimeout(later, wait);
39 };
40 }
41
42 // Capture video frame as base64 image
43 function captureVideoFrame(video) {
44 try {
45 const canvas = document.createElement('canvas');
46 canvas.width = video.videoWidth || 640;
47 canvas.height = video.videoHeight || 360;
48
49 const ctx = canvas.getContext('2d');
50 ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
51
52 // Convert to base64 with reduced quality for faster processing
53 const dataUrl = canvas.toDataURL('image/jpeg', 0.7);
54 console.log('Frame captured successfully, size:', canvas.width, 'x', canvas.height);
55 return dataUrl;
56 } catch (error) {
57 console.error('Error capturing video frame:', error);
58 return null;
59 }
60 }
61
62 // Analyze frame with AI
63 async function analyzeFrame(frameDataUrl) {
64 try {
65 console.log('Sending frame to AI for analysis...');
66
67 const prompt = `You are describing a live video stream for a blind person. Analyze this video frame and provide a concise, clear description of what's happening. Focus on:
68- Main subjects or people in the frame
69- Their actions or activities
70- Important visual elements or text
71- Scene setting or environment
72- Any significant changes from typical content
73
74Keep the description under ${CONFIG.maxDescriptionLength} words and make it natural for text-to-speech. Be specific and helpful.
75
76Image data: ${frameDataUrl}`;
77
78 const description = await RM.aiCall(prompt);
79 console.log('AI analysis complete:', description);
80 return description;
81 } catch (error) {
82 console.error('Error analyzing frame with AI:', error);
83 return null;
84 }
85 }
86
87 // Speak description using Web Speech API
88 function speakDescription(text) {
89 if (!text || text === lastDescription) {
90 console.log('Skipping duplicate or empty description');
91 return;
92 }
93
94 // Add to queue
95 descriptionQueue.push(text);
96 lastDescription = text;
97
98 // Process queue if not already speaking
99 if (!isSpeaking) {
100 processDescriptionQueue();
101 }
102 }
103
    // Process description queue: drains descriptionQueue one utterance at a
    // time via the Web Speech API. Invariant: isSpeaking is true exactly while
    // a queued utterance is pending or playing; speakDescription only calls
    // this when isSpeaking is false, so at most one drain loop runs at a time.
    function processDescriptionQueue() {
        if (descriptionQueue.length === 0) {
            isSpeaking = false;
            return;
        }

        isSpeaking = true;
        const text = descriptionQueue.shift();

        if ('speechSynthesis' in window) {
            // Cancel any ongoing speech so the new description starts at once.
            // NOTE(review): cancel() may fire the previous utterance's onerror
            // ("interrupted"), re-entering this function — the empty-queue
            // guard above makes that re-entry safe. Confirm per-browser.
            window.speechSynthesis.cancel();

            const utterance = new SpeechSynthesisUtterance(text);
            utterance.rate = CONFIG.speechRate;     // live values from the panel sliders
            utterance.volume = CONFIG.speechVolume;
            utterance.pitch = CONFIG.speechPitch;
            utterance.lang = 'en-US';

            utterance.onend = () => {
                console.log('Finished speaking description');
                // Process next in queue after a short delay so consecutive
                // descriptions don't run together.
                setTimeout(() => processDescriptionQueue(), 500);
            };

            utterance.onerror = (event) => {
                console.error('Speech synthesis error:', event);
                isSpeaking = false;
                processDescriptionQueue();
            };

            console.log('Speaking:', text);
            window.speechSynthesis.speak(utterance);

            // Update status in control panel
            updateStatus('Describing...');
        } else {
            // No speech support: drop this item and stop draining; remaining
            // entries stay queued but will not play in this environment.
            console.error('Speech synthesis not supported');
            isSpeaking = false;
        }
    }
146
147 // Main capture and describe loop
148 async function captureAndDescribe() {
149 if (!isDescribing || !videoElement) {
150 return;
151 }
152
153 console.log('Starting frame capture and analysis...');
154 updateStatus('Capturing frame...');
155
156 const frameData = captureVideoFrame(videoElement);
157 if (!frameData) {
158 console.error('Failed to capture frame');
159 updateStatus('Error: Failed to capture frame');
160 return;
161 }
162
163 updateStatus('Analyzing with AI...');
164 const description = await analyzeFrame(frameData);
165
166 if (description) {
167 speakDescription(description);
168 } else {
169 updateStatus('Error: AI analysis failed');
170 }
171 }
172
173 // Start describing
174 async function startDescribing() {
175 if (isDescribing) {
176 console.log('Already describing');
177 return;
178 }
179
180 videoElement = document.querySelector('video.html5-main-video');
181 if (!videoElement) {
182 alert('No video found on this page. Please navigate to a YouTube video or live stream.');
183 return;
184 }
185
186 console.log('Starting audio descriptions...');
187 isDescribing = true;
188 updateControlPanel();
189
190 // Initial description
191 await captureAndDescribe();
192
193 // Set up interval for continuous descriptions
194 captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
195
196 updateStatus('Active - Describing every ' + (CONFIG.captureInterval / 1000) + ' seconds');
197 }
198
199 // Stop describing
200 function stopDescribing() {
201 console.log('Stopping audio descriptions...');
202 isDescribing = false;
203
204 if (captureIntervalId) {
205 clearInterval(captureIntervalId);
206 captureIntervalId = null;
207 }
208
209 // Stop any ongoing speech
210 if ('speechSynthesis' in window) {
211 window.speechSynthesis.cancel();
212 }
213
214 // Clear queue
215 descriptionQueue = [];
216 isSpeaking = false;
217
218 updateControlPanel();
219 updateStatus('Stopped');
220 }
221
222 // Update status display
223 function updateStatus(message) {
224 const statusElement = document.getElementById('audio-describer-status');
225 if (statusElement) {
226 statusElement.textContent = message;
227 }
228 }
229
    // Create control panel UI: builds the floating panel (status line,
    // start/stop button, interval and speech-speed sliders) and wires its
    // event handlers. Safe to call repeatedly — any existing panel is removed
    // first, which the SPA-navigation observer in init() relies on.
    function createControlPanel() {
        // Remove existing panel if any
        const existing = document.getElementById('audio-describer-panel');
        if (existing) {
            existing.remove();
        }

        const panel = document.createElement('div');
        panel.id = 'audio-describer-panel';
        panel.style.cssText = `
            position: fixed;
            top: 80px;
            right: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 12px;
            box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
            z-index: 10000;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            min-width: 280px;
            backdrop-filter: blur(10px);
        `;

        // Slider values are interpolated once at build time; the listeners
        // below keep the visible labels in sync afterwards.
        panel.innerHTML = `
            <div style="margin-bottom: 15px;">
                <h3 style="margin: 0 0 5px 0; font-size: 16px; font-weight: 600;">🎙️ Audio Describer</h3>
                <div id="audio-describer-status" style="font-size: 12px; opacity: 0.9; margin-top: 5px;">Ready</div>
            </div>

            <button id="audio-describer-toggle" style="
                width: 100%;
                padding: 12px;
                background: white;
                color: #667eea;
                border: none;
                border-radius: 8px;
                font-size: 14px;
                font-weight: 600;
                cursor: pointer;
                margin-bottom: 12px;
                transition: all 0.3s ease;
            ">Start Describing</button>

            <div style="margin-top: 15px; padding-top: 15px; border-top: 1px solid rgba(255,255,255,0.2);">
                <label style="display: block; font-size: 12px; margin-bottom: 8px; opacity: 0.9;">
                    Description Interval: <span id="interval-value">${CONFIG.captureInterval / 1000}s</span>
                </label>
                <input type="range" id="interval-slider" min="5" max="15" value="${CONFIG.captureInterval / 1000}"
                    style="width: 100%; cursor: pointer;">

                <label style="display: block; font-size: 12px; margin: 12px 0 8px 0; opacity: 0.9;">
                    Speech Speed: <span id="speed-value">${CONFIG.speechRate}x</span>
                </label>
                <input type="range" id="speed-slider" min="0.5" max="2" step="0.1" value="${CONFIG.speechRate}"
                    style="width: 100%; cursor: pointer;">
            </div>

            <div style="margin-top: 15px; font-size: 11px; opacity: 0.7; line-height: 1.4;">
                This extension captures video frames and provides AI-powered audio descriptions for accessibility.
            </div>
        `;

        document.body.appendChild(panel);
        controlPanel = panel;

        // Start/stop toggle button.
        const toggleButton = document.getElementById('audio-describer-toggle');
        toggleButton.addEventListener('click', () => {
            if (isDescribing) {
                stopDescribing();
            } else {
                startDescribing();
            }
        });

        // Interval slider: updates CONFIG and, if a session is running,
        // restarts the capture timer so the new cadence applies immediately.
        const intervalSlider = document.getElementById('interval-slider');
        const intervalValue = document.getElementById('interval-value');
        intervalSlider.addEventListener('input', (e) => {
            const value = parseInt(e.target.value);
            CONFIG.captureInterval = value * 1000;
            intervalValue.textContent = value + 's';

            // Restart interval if currently describing
            if (isDescribing && captureIntervalId) {
                clearInterval(captureIntervalId);
                captureIntervalId = setInterval(captureAndDescribe, CONFIG.captureInterval);
                updateStatus('Active - Describing every ' + value + ' seconds');
            }
        });

        // Speed slider: takes effect on the next utterance (rate is read when
        // each SpeechSynthesisUtterance is created).
        const speedSlider = document.getElementById('speed-slider');
        const speedValue = document.getElementById('speed-value');
        speedSlider.addEventListener('input', (e) => {
            const value = parseFloat(e.target.value);
            CONFIG.speechRate = value;
            speedValue.textContent = value.toFixed(1) + 'x';
        });

        console.log('Control panel created');
    }
334
335 // Update control panel state
336 function updateControlPanel() {
337 const toggleButton = document.getElementById('audio-describer-toggle');
338 if (toggleButton) {
339 if (isDescribing) {
340 toggleButton.textContent = 'Stop Describing';
341 toggleButton.style.background = '#ff4757';
342 toggleButton.style.color = 'white';
343 } else {
344 toggleButton.textContent = 'Start Describing';
345 toggleButton.style.background = 'white';
346 toggleButton.style.color = '#667eea';
347 }
348 }
349 }
350
351 // Initialize extension
352 function init() {
353 console.log('YouTube Live Stream Audio Describer initialized');
354
355 // Wait for page to be ready
356 if (document.readyState === 'loading') {
357 document.addEventListener('DOMContentLoaded', createControlPanel);
358 } else {
359 createControlPanel();
360 }
361
362 // Re-create panel on navigation (YouTube is a SPA)
363 const observer = new MutationObserver(debounce(() => {
364 if (!document.getElementById('audio-describer-panel')) {
365 createControlPanel();
366 }
367 }, 1000));
368
369 observer.observe(document.body, {
370 childList: true,
371 subtree: true
372 });
373
374 // Keyboard shortcut: Alt+D to toggle
375 document.addEventListener('keydown', (e) => {
376 if (e.altKey && e.key === 'd') {
377 e.preventDefault();
378 if (isDescribing) {
379 stopDescribing();
380 } else {
381 startDescribing();
382 }
383 }
384 });
385
386 console.log('Keyboard shortcut: Alt+D to toggle audio descriptions');
387 }
388
    // Start the extension immediately at script-injection time.
    init();
})();