Export structured website content to popular vector databases (Pinecone, Weaviate, Qdrant, Chroma)
Size
26.8 KB
Version
1.0.1
Created
Jan 3, 2026
Updated
18 days ago
// ==UserScript==
// @name Vector Database Content Exporter
// @description Export structured website content to popular vector databases (Pinecone, Weaviate, Qdrant, Chroma)
// @version 1.0.1
// @match https://*/*
// @match http://*/*
// @icon https://robomonkey.io/favicon.ico
// @grant GM.getValue
// @grant GM.setValue
// @grant GM.xmlhttpRequest
// @grant GM.openInTab
// ==/UserScript==
12(function() {
13 'use strict';
14
// Registry of the supported vector databases, keyed by the value used in the
// database <select>.
//   name      - display label shown in status messages.
//   endpoint  - URL template describing the request target. It is descriptive
//               only: the exporter functions build their own URLs.
//   requires* - which connection settings the backend needs.
const VECTOR_DATABASES = {
    pinecone: {
        name: 'Pinecone',
        endpoint: 'https://{index-name}-{project-id}.svc.{environment}.pinecone.io/vectors/upsert',
        requiresApiKey: true,
        requiresIndex: true,
        // The upsert URL embeds the project id, so it is required as well.
        requiresProjectId: true,
        requiresEnvironment: true
    },
    weaviate: {
        name: 'Weaviate',
        // Kept in sync with exportToWeaviate, which posts to the batch endpoint.
        endpoint: '{host}/v1/batch/objects',
        requiresApiKey: false,
        requiresHost: true,
        requiresClass: true
    },
    qdrant: {
        name: 'Qdrant',
        endpoint: '{host}/collections/{collection}/points',
        requiresApiKey: true,
        requiresHost: true,
        requiresCollection: true
    },
    chroma: {
        name: 'Chroma',
        endpoint: '{host}/api/v1/collections/{collection}/add',
        requiresApiKey: false,
        requiresHost: true,
        requiresCollection: true
    }
};
46
/**
 * Produce a deterministic placeholder embedding for a piece of text.
 *
 * This is a demo stand-in, NOT a semantic embedding: the vector is derived
 * from a hash of the text, so identical text always yields the identical
 * vector, but similar texts do not yield similar vectors. Replace it with a
 * call to a real embedding API before relying on similarity search.
 *
 * @param {string} text - Text to embed; null/undefined is treated as ''.
 * @param {number} [dimensions=1536] - Length of the returned vector.
 * @returns {Promise<number[]>} `dimensions` numbers, each in [-1, 1).
 */
async function generateEmbedding(text, dimensions = 1536) {
    const input = text == null ? '' : String(text);

    // 32-bit FNV-1a hash of the text; used as the seed for the generator below.
    let seed = 0x811c9dc5;
    for (let i = 0; i < input.length; i++) {
        seed ^= input.charCodeAt(i);
        seed = Math.imul(seed, 0x01000193);
    }

    // mulberry32: a small seeded PRNG, so the output depends only on the text.
    let state = seed >>> 0;
    const nextUnit = () => {
        state = (state + 0x6d2b79f5) | 0;
        let t = Math.imul(state ^ (state >>> 15), 1 | state);
        t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
        return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
    };

    // Map [0, 1) onto [-1, 1), the same value range the component had before.
    return Array.from({ length: dimensions }, () => nextUnit() * 2 - 1);
}
61
/**
 * Split the current page into text chunks.
 *
 * The content root is the first match among <main>, <article>, `.content`,
 * `#content` and <body>, tried in that order. If the root contains headings,
 * one chunk is built per heading from the heading text plus the text of the
 * sibling elements that follow it, up to the next heading sibling. Otherwise
 * each <p> becomes a chunk. Chunks of 50 characters or fewer are dropped.
 *
 * @returns {Promise<{url: string, title: string, timestamp: string, chunks: Array<{id: string, text: string, metadata: Object}>}>}
 * @throws {Error} If no content root can be found.
 */
async function extractPageContent() {
    console.log('Extracting page content...');

    const content = {
        url: window.location.href,
        title: document.title,
        timestamp: new Date().toISOString(),
        chunks: []
    };

    // A single querySelector('main, article, ..., body') returns the first
    // match in document order, which is always <body>. Probe the selectors
    // one at a time so the more specific containers actually win.
    const candidateSelectors = ['main', 'article', '.content', '#content', 'body'];
    let mainContent = null;
    for (const selector of candidateSelectors) {
        mainContent = document.querySelector(selector);
        if (mainContent) break;
    }
    if (!mainContent) {
        throw new Error('Could not find main content on page');
    }

    // One timestamp per extraction keeps all ids of a run on the same prefix.
    const idPrefix = Date.now();
    const headings = Array.from(mainContent.querySelectorAll('h1, h2, h3, h4, h5, h6'));

    if (headings.length > 0) {
        headings.forEach((heading, i) => {
            const headingText = heading.textContent.trim();
            let chunkText = headingText + '\n\n';

            // Only following siblings are collected; stop at the next heading.
            // The /i flag also covers documents that report lowercase tag names.
            for (let el = heading.nextElementSibling; el; el = el.nextElementSibling) {
                if (/^H[1-6]$/i.test(el.tagName)) break;
                const text = el.textContent.trim();
                if (text) {
                    chunkText += text + '\n';
                }
            }

            if (chunkText.length > 50) {
                content.chunks.push({
                    id: `${idPrefix}_${i}`,
                    text: chunkText.trim(),
                    metadata: {
                        heading: headingText,
                        level: heading.tagName,
                        url: content.url,
                        title: content.title
                    }
                });
            }
        });
    } else {
        // No headings: fall back to one chunk per paragraph.
        Array.from(mainContent.querySelectorAll('p')).forEach((p, i) => {
            const text = p.textContent.trim();
            if (text.length > 50) {
                content.chunks.push({
                    id: `${idPrefix}_${i}`,
                    text: text,
                    metadata: {
                        url: content.url,
                        title: content.title,
                        type: 'paragraph'
                    }
                });
            }
        });
    }

    console.log(`Extracted ${content.chunks.length} content chunks`);
    return content;
}
133
/**
 * Upsert every chunk of `content` into a Pinecone index.
 *
 * Settings are accepted under two spellings: the camelCase names
 * (`apiKey`, `indexName`, `projectId`, `environment`) and the keys the
 * settings form produces from its input ids (`apikey`, `index`, `project`,
 * `env`). Previously only the camelCase names were read, so form-supplied
 * values were lost and the request URL contained "undefined".
 *
 * @param {{chunks: Array<{id: string, text: string, metadata: Object}>}} content
 * @param {Object} config - Pinecone connection settings (see above).
 * @returns {Promise<*>} Whatever GM.xmlhttpRequest resolves with.
 * @throws {Error} If a required setting is missing or blank.
 */
async function exportToPinecone(content, config) {
    const settings = config || {};
    // First non-blank string among the given keys, trimmed; '' if none.
    const pick = (...keys) => {
        for (const key of keys) {
            const value = settings[key];
            if (typeof value === 'string' && value.trim() !== '') return value.trim();
        }
        return '';
    };

    const apiKey = pick('apiKey', 'apikey');
    const indexName = pick('indexName', 'index');
    const projectId = pick('projectId', 'project');
    const environment = pick('environment', 'env');

    const missing = [
        ['API key', apiKey],
        ['index name', indexName],
        ['project ID', projectId],
        ['environment', environment]
    ].filter(([, value]) => !value).map(([label]) => label);
    if (missing.length > 0) {
        throw new Error(`Missing Pinecone settings: ${missing.join(', ')}`);
    }

    const endpoint = `https://${indexName}-${projectId}.svc.${environment}.pinecone.io/vectors/upsert`;

    // Embeddings are generated one chunk at a time, in chunk order.
    const vectors = [];
    for (const chunk of content.chunks) {
        const embedding = await generateEmbedding(chunk.text);
        vectors.push({
            id: chunk.id,
            values: embedding,
            // The chunk text travels in the metadata so it can be read back.
            metadata: {
                text: chunk.text,
                ...chunk.metadata
            }
        });
    }

    return await GM.xmlhttpRequest({
        method: 'POST',
        url: endpoint,
        headers: {
            'Api-Key': apiKey,
            'Content-Type': 'application/json'
        },
        data: JSON.stringify({ vectors })
    });
}
162
/**
 * Send every chunk of `content` to Weaviate's batch-objects endpoint.
 *
 * No vector is attached to the objects; only text properties are sent.
 * NOTE(review): this presumably relies on a vectorizer configured on the
 * Weaviate class - confirm against the target instance.
 *
 * Settings are accepted under two spellings: `host`, `className`, `apiKey`,
 * and the keys the settings form produces from its input ids (`class`,
 * `apikey`). Previously the form-supplied class name and API key were lost.
 *
 * @param {{timestamp: string, chunks: Array<{text: string, metadata: Object}>}} content
 * @param {Object} config - Weaviate connection settings (see above).
 * @returns {Promise<*>} Whatever GM.xmlhttpRequest resolves with.
 * @throws {Error} If the host or class name is missing or blank.
 */
async function exportToWeaviate(content, config) {
    const settings = config || {};
    // First non-blank string among the given keys, trimmed; '' if none.
    const pick = (...keys) => {
        for (const key of keys) {
            const value = settings[key];
            if (typeof value === 'string' && value.trim() !== '') return value.trim();
        }
        return '';
    };

    // Drop trailing slashes so "http://host/" does not produce "//v1/...".
    const host = pick('host').replace(/\/+$/, '');
    const className = pick('className', 'class');
    const apiKey = pick('apiKey', 'apikey');

    const missing = [
        ['host URL', host],
        ['class name', className]
    ].filter(([, value]) => !value).map(([label]) => label);
    if (missing.length > 0) {
        throw new Error(`Missing Weaviate settings: ${missing.join(', ')}`);
    }

    const endpoint = `${host}/v1/batch/objects`;

    const objects = content.chunks.map((chunk) => ({
        class: className,
        properties: {
            text: chunk.text,
            url: chunk.metadata.url,
            title: chunk.metadata.title,
            // Paragraph chunks carry no heading; send an empty string instead.
            heading: chunk.metadata.heading || '',
            timestamp: content.timestamp
        }
    }));

    const headers = {
        'Content-Type': 'application/json'
    };
    // The API key is optional; only authenticate when one was supplied.
    if (apiKey) {
        headers['Authorization'] = `Bearer ${apiKey}`;
    }

    return await GM.xmlhttpRequest({
        method: 'POST',
        url: endpoint,
        headers: headers,
        data: JSON.stringify({ objects })
    });
}
196
// Matches a hyphenated UUID string, which Qdrant accepts as a point id.
const QDRANT_UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

/**
 * Convert a chunk id into a point id Qdrant accepts (unsigned integer or UUID).
 *
 * Non-negative safe integers and UUID strings pass through unchanged. Any
 * other value is hashed into a UUID-shaped string; the mapping is
 * deterministic, so the same chunk id always yields the same point id.
 */
function toQdrantPointId(chunkId) {
    if (Number.isSafeInteger(chunkId) && chunkId >= 0) return chunkId;

    const raw = String(chunkId);
    if (QDRANT_UUID_PATTERN.test(raw)) return raw;

    // Four differently-seeded 32-bit FNV-1a passes give 128 bits of hex.
    let hex = '';
    for (let seed = 0; seed < 4; seed++) {
        let hash = (0x811c9dc5 ^ Math.imul(seed, 0x9e3779b1)) >>> 0;
        for (let i = 0; i < raw.length; i++) {
            hash ^= raw.charCodeAt(i);
            hash = Math.imul(hash, 0x01000193) >>> 0;
        }
        hex += hash.toString(16).padStart(8, '0');
    }

    // Force the version (4) and variant (8) nibbles so the result is a
    // well-formed UUID.
    return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-4${hex.slice(13, 16)}-8${hex.slice(17, 20)}-${hex.slice(20, 32)}`;
}

/**
 * Upsert every chunk of `content` into a Qdrant collection.
 *
 * Chunk ids of the form "<timestamp>_<index>" are not valid Qdrant point ids,
 * so each id is converted with toQdrantPointId() before sending.
 *
 * Settings are accepted as `host`, `collection`, and `apiKey` or `apikey`
 * (the latter is the key the settings form produces from its input id). The
 * `api-key` header is only sent when a key was supplied.
 *
 * @param {{chunks: Array<{id: (string|number), text: string, metadata: Object}>}} content
 * @param {Object} config - Qdrant connection settings (see above).
 * @returns {Promise<*>} Whatever GM.xmlhttpRequest resolves with.
 * @throws {Error} If the host or collection name is missing or blank.
 */
async function exportToQdrant(content, config) {
    const settings = config || {};
    // First non-blank string among the given keys, trimmed; '' if none.
    const pick = (...keys) => {
        for (const key of keys) {
            const value = settings[key];
            if (typeof value === 'string' && value.trim() !== '') return value.trim();
        }
        return '';
    };

    // Drop trailing slashes so "http://host/" does not produce "//collections".
    const host = pick('host').replace(/\/+$/, '');
    const collection = pick('collection');
    const apiKey = pick('apiKey', 'apikey');

    const missing = [
        ['host URL', host],
        ['collection name', collection]
    ].filter(([, value]) => !value).map(([label]) => label);
    if (missing.length > 0) {
        throw new Error(`Missing Qdrant settings: ${missing.join(', ')}`);
    }

    const endpoint = `${host}/collections/${collection}/points`;

    // Embeddings are generated one chunk at a time, in chunk order.
    const points = [];
    for (const chunk of content.chunks) {
        const embedding = await generateEmbedding(chunk.text);
        points.push({
            id: toQdrantPointId(chunk.id),
            vector: embedding,
            payload: {
                text: chunk.text,
                ...chunk.metadata
            }
        });
    }

    const headers = {
        'Content-Type': 'application/json'
    };
    if (apiKey) {
        headers['api-key'] = apiKey;
    }

    return await GM.xmlhttpRequest({
        method: 'PUT',
        url: endpoint,
        headers: headers,
        data: JSON.stringify({ points })
    });
}
225
/**
 * Add every chunk of `content` to a Chroma collection.
 *
 * The request body holds four parallel arrays (ids, documents, metadatas,
 * embeddings) whose entries line up by chunk position.
 * NOTE(review): confirm whether the target server expects a collection name
 * or a collection id in this URL path.
 *
 * @param {{chunks: Array<{id: string, text: string, metadata: Object}>}} content
 * @param {{host: string, collection: string}} config
 * @returns {Promise<*>} Whatever GM.xmlhttpRequest resolves with.
 */
async function exportToChroma(content, config) {
    const { host, collection } = config;
    const targetUrl = `${host}/api/v1/collections/${collection}/add`;

    // Property order here is the order the keys appear in the JSON body.
    const batch = {
        ids: [],
        documents: [],
        metadatas: [],
        embeddings: []
    };

    // Sequential on purpose: one embedding at a time, in chunk order.
    for (const piece of content.chunks) {
        const vector = await generateEmbedding(piece.text);
        batch.ids.push(piece.id);
        batch.documents.push(piece.text);
        batch.metadatas.push(piece.metadata);
        batch.embeddings.push(vector);
    }

    const request = {
        method: 'POST',
        url: targetUrl,
        headers: {
            'Content-Type': 'application/json'
        },
        data: JSON.stringify(batch)
    };
    return await GM.xmlhttpRequest(request);
}
258
/**
 * Extract the current page and export it to the chosen vector database.
 *
 * @param {string} database - One of 'pinecone', 'weaviate', 'qdrant', 'chroma'.
 * @param {Object} config - Connection settings, passed through to the exporter.
 * @returns {Promise<{success: boolean, chunksExported: number}>}
 * @throws {Error} If the database is unsupported, the page yields no chunks,
 *   the exporter fails, or the server answers with a non-2xx HTTP status.
 *   Every failure is logged and then rethrown.
 */
async function exportToVectorDB(database, config) {
    try {
        console.log(`Starting export to ${database}...`);

        const exporters = new Map([
            ['pinecone', exportToPinecone],
            ['weaviate', exportToWeaviate],
            ['qdrant', exportToQdrant],
            ['chroma', exportToChroma]
        ]);
        // Reject unknown targets before doing the work of scraping the page.
        const exporter = exporters.get(database);
        if (!exporter) {
            throw new Error('Unsupported database');
        }

        const content = await extractPageContent();
        // An empty batch would be rejected by the server or reported as a
        // meaningless "0 chunks exported" success.
        if (content.chunks.length === 0) {
            throw new Error('No content chunks found on this page');
        }

        const response = await exporter(content, config);

        // A completed request is not necessarily a successful one: surface
        // HTTP errors (bad key, missing collection, ...) as failures.
        // NOTE(review): assumes the resolved response exposes a numeric
        // `status`; when it does not, the check is skipped.
        const status = response ? response.status : undefined;
        if (typeof status === 'number' && (status < 200 || status >= 300)) {
            const detail = response.statusText || response.responseText || '';
            const suffix = detail ? `: ${String(detail).slice(0, 200)}` : '';
            throw new Error(`Server responded with HTTP ${status}${suffix}`);
        }

        console.log('Export successful:', response);
        return { success: true, chunksExported: content.chunks.length };
    } catch (error) {
        console.error('Export failed:', error);
        throw error;
    }
}
290
// Markup for the floating export panel and its toggle button. The ids of the
// settings inputs follow the pattern "<database>-<key>"; the click and change
// handlers in createUI() rely on that pattern.
const VDB_PANEL_HTML = `
    <div id="vdb-panel" style="display: none;">
        <div id="vdb-header">
            <h3>Export to Vector Database</h3>
            <button id="vdb-close">×</button>
        </div>
        <div id="vdb-content">
            <div class="vdb-section">
                <label>Select Database:</label>
                <select id="vdb-database">
                    <option value="">-- Choose Database --</option>
                    <option value="pinecone">Pinecone</option>
                    <option value="weaviate">Weaviate</option>
                    <option value="qdrant">Qdrant</option>
                    <option value="chroma">Chroma</option>
                </select>
            </div>

            <div id="vdb-config" style="display: none;">
                <!-- Pinecone Config -->
                <div id="config-pinecone" class="db-config" style="display: none;">
                    <div class="vdb-section">
                        <label>API Key:</label>
                        <input type="password" id="pinecone-apikey" placeholder="Enter Pinecone API Key">
                    </div>
                    <div class="vdb-section">
                        <label>Index Name:</label>
                        <input type="text" id="pinecone-index" placeholder="my-index">
                    </div>
                    <div class="vdb-section">
                        <label>Project ID:</label>
                        <input type="text" id="pinecone-project" placeholder="abc123">
                    </div>
                    <div class="vdb-section">
                        <label>Environment:</label>
                        <input type="text" id="pinecone-env" placeholder="us-east-1-aws">
                    </div>
                </div>

                <!-- Weaviate Config -->
                <div id="config-weaviate" class="db-config" style="display: none;">
                    <div class="vdb-section">
                        <label>Host URL:</label>
                        <input type="text" id="weaviate-host" placeholder="http://localhost:8080">
                    </div>
                    <div class="vdb-section">
                        <label>Class Name:</label>
                        <input type="text" id="weaviate-class" placeholder="Article">
                    </div>
                    <div class="vdb-section">
                        <label>API Key (Optional):</label>
                        <input type="password" id="weaviate-apikey" placeholder="Optional API Key">
                    </div>
                </div>

                <!-- Qdrant Config -->
                <div id="config-qdrant" class="db-config" style="display: none;">
                    <div class="vdb-section">
                        <label>Host URL:</label>
                        <input type="text" id="qdrant-host" placeholder="http://localhost:6333">
                    </div>
                    <div class="vdb-section">
                        <label>Collection Name:</label>
                        <input type="text" id="qdrant-collection" placeholder="my-collection">
                    </div>
                    <div class="vdb-section">
                        <label>API Key:</label>
                        <input type="password" id="qdrant-apikey" placeholder="Enter Qdrant API Key">
                    </div>
                </div>

                <!-- Chroma Config -->
                <div id="config-chroma" class="db-config" style="display: none;">
                    <div class="vdb-section">
                        <label>Host URL:</label>
                        <input type="text" id="chroma-host" placeholder="http://localhost:8000">
                    </div>
                    <div class="vdb-section">
                        <label>Collection Name:</label>
                        <input type="text" id="chroma-collection" placeholder="my-collection">
                    </div>
                </div>
            </div>

            <div id="vdb-status" style="display: none;"></div>

            <div class="vdb-actions">
                <button id="vdb-export" disabled>Export Content</button>
                <button id="vdb-preview">Preview Content</button>
            </div>
        </div>
    </div>
    <button id="vdb-toggle">📤 Export to Vector DB</button>
`;

// Stylesheet for the panel, injected once into the page.
const VDB_PANEL_CSS = `
    #vector-db-exporter {
        position: fixed;
        bottom: 20px;
        right: 20px;
        z-index: 999999;
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
    }

    #vdb-toggle {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border: none;
        padding: 12px 20px;
        border-radius: 8px;
        cursor: pointer;
        font-size: 14px;
        font-weight: 600;
        box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4);
        transition: all 0.3s ease;
    }

    #vdb-toggle:hover {
        transform: translateY(-2px);
        box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6);
    }

    #vdb-panel {
        position: fixed;
        bottom: 80px;
        right: 20px;
        width: 400px;
        max-height: 600px;
        background: white;
        border-radius: 12px;
        box-shadow: 0 10px 40px rgba(0, 0, 0, 0.2);
        overflow: hidden;
        animation: slideUp 0.3s ease;
    }

    @keyframes slideUp {
        from {
            opacity: 0;
            transform: translateY(20px);
        }
        to {
            opacity: 1;
            transform: translateY(0);
        }
    }

    #vdb-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 16px 20px;
        display: flex;
        justify-content: space-between;
        align-items: center;
    }

    #vdb-header h3 {
        margin: 0;
        font-size: 16px;
        font-weight: 600;
    }

    #vdb-close {
        background: rgba(255, 255, 255, 0.2);
        border: none;
        color: white;
        font-size: 24px;
        width: 32px;
        height: 32px;
        border-radius: 6px;
        cursor: pointer;
        display: flex;
        align-items: center;
        justify-content: center;
        transition: background 0.2s;
    }

    #vdb-close:hover {
        background: rgba(255, 255, 255, 0.3);
    }

    #vdb-content {
        padding: 20px;
        max-height: 500px;
        overflow-y: auto;
    }

    .vdb-section {
        margin-bottom: 16px;
    }

    .vdb-section label {
        display: block;
        margin-bottom: 6px;
        font-size: 13px;
        font-weight: 600;
        color: #333;
    }

    .vdb-section input,
    .vdb-section select {
        width: 100%;
        padding: 10px 12px;
        border: 2px solid #e0e0e0;
        border-radius: 6px;
        font-size: 14px;
        transition: border-color 0.2s;
        box-sizing: border-box;
    }

    .vdb-section input:focus,
    .vdb-section select:focus {
        outline: none;
        border-color: #667eea;
    }

    .vdb-actions {
        display: flex;
        gap: 10px;
        margin-top: 20px;
    }

    .vdb-actions button {
        flex: 1;
        padding: 12px;
        border: none;
        border-radius: 6px;
        font-size: 14px;
        font-weight: 600;
        cursor: pointer;
        transition: all 0.2s;
    }

    #vdb-export {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
    }

    #vdb-export:not(:disabled):hover {
        transform: translateY(-1px);
        box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
    }

    #vdb-export:disabled {
        background: #ccc;
        cursor: not-allowed;
    }

    #vdb-preview {
        background: white;
        color: #667eea;
        border: 2px solid #667eea;
    }

    #vdb-preview:hover {
        background: #f5f7ff;
    }

    #vdb-status {
        margin-top: 16px;
        padding: 12px;
        border-radius: 6px;
        font-size: 13px;
    }

    #vdb-status.success {
        background: #d4edda;
        color: #155724;
        border: 1px solid #c3e6cb;
    }

    #vdb-status.error {
        background: #f8d7da;
        color: #721c24;
        border: 1px solid #f5c6cb;
    }

    #vdb-status.loading {
        background: #d1ecf1;
        color: #0c5460;
        border: 1px solid #bee5eb;
    }
`;

// Escape a value for safe interpolation into HTML text or attribute content.
function escapeHtml(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}

// Build the standalone preview document. Everything taken from the scraped
// page (URL, title, chunk text, metadata) is escaped before interpolation.
function buildPreviewHtml(content) {
    const chunkBlocks = content.chunks.map((chunk, i) => {
        // Long chunks are shortened to their first 300 characters.
        const displayText = chunk.text.length > 300 ? chunk.text.substring(0, 300) + '...' : chunk.text;
        return `
            <div class="chunk">
                <strong>Chunk ${i + 1}:</strong>
                <p>${escapeHtml(displayText)}</p>
                <small>Metadata: ${escapeHtml(JSON.stringify(chunk.metadata))}</small>
            </div>
        `;
    }).join('');

    return `<!DOCTYPE html>
<html>
<head>
    <title>Content Preview - Vector DB Exporter</title>
    <style>
        body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; margin: 0; padding: 20px; background: #f5f7ff; }
        .container { max-width: 900px; margin: 0 auto; background: white; padding: 30px; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.1); }
        h2 { color: #667eea; margin-bottom: 20px; }
        h3 { color: #764ba2; margin-top: 30px; }
        .info { margin: 10px 0; }
        .info strong { color: #333; }
        hr { margin: 20px 0; border: none; border-top: 2px solid #e0e0e0; }
        .chunk { margin: 15px 0; padding: 15px; background: #f5f7ff; border-left: 4px solid #667eea; border-radius: 4px; }
        .chunk strong { color: #667eea; }
        .chunk p { margin: 10px 0; color: #333; line-height: 1.6; }
        .chunk small { color: #666; }
    </style>
</head>
<body>
    <div class="container">
        <h2>Content Preview</h2>
        <div class="info"><strong>URL:</strong> ${escapeHtml(content.url)}</div>
        <div class="info"><strong>Title:</strong> ${escapeHtml(content.title)}</div>
        <div class="info"><strong>Chunks:</strong> ${content.chunks.length}</div>
        <hr>
        <h3>Content Chunks:</h3>
        ${chunkBlocks}
    </div>
</body>
</html>`;
}

// Open the preview document in a new tab. Uses GM.openInTab with a data: URL
// when that API is available, and falls back to a Blob URL opened with
// window.open() when it is not (for example when the grant is missing).
async function openPreviewTab(html) {
    if (typeof GM !== 'undefined' && GM && typeof GM.openInTab === 'function') {
        const dataUrl = 'data:text/html;charset=utf-8,' + encodeURIComponent(html);
        await GM.openInTab(dataUrl, false);
        return;
    }

    const blobUrl = URL.createObjectURL(new Blob([html], { type: 'text/html' }));
    const opened = window.open(blobUrl, '_blank');
    if (!opened) {
        URL.revokeObjectURL(blobUrl);
        throw new Error('The preview tab was blocked by the browser');
    }
    // Give the new tab time to load before releasing the Blob.
    setTimeout(() => URL.revokeObjectURL(blobUrl), 60000);
}

/**
 * Inject the floating export panel into the page and wire up its controls.
 *
 * Does nothing if an element with id "vector-db-exporter" already exists.
 * The panel lets the user pick a database, fill in its settings (restored
 * from and saved to GM storage under "vdb_config_<database>"), preview the
 * extracted chunks in a new tab, and run the export.
 *
 * NOTE(review): settings, including API keys, are persisted through
 * GM.setValue as entered - confirm that this storage is acceptable for secrets.
 */
function createUI() {
    // Check if UI already exists
    if (document.getElementById('vector-db-exporter')) {
        return;
    }

    const container = document.createElement('div');
    container.id = 'vector-db-exporter';
    container.innerHTML = VDB_PANEL_HTML;

    const style = document.createElement('style');
    style.textContent = VDB_PANEL_CSS;
    (document.head || document.documentElement).appendChild(style);
    document.body.appendChild(container);

    const toggle = document.getElementById('vdb-toggle');
    const panel = document.getElementById('vdb-panel');
    const closeBtn = document.getElementById('vdb-close');
    const dbSelect = document.getElementById('vdb-database');
    const exportBtn = document.getElementById('vdb-export');
    const previewBtn = document.getElementById('vdb-preview');

    // Handle of the pending auto-hide timer of the status box, if any.
    let statusTimer = null;

    toggle.addEventListener('click', () => {
        panel.style.display = panel.style.display === 'none' ? 'block' : 'none';
    });

    closeBtn.addEventListener('click', () => {
        panel.style.display = 'none';
    });

    dbSelect.addEventListener('change', async (e) => {
        const database = e.target.value;
        const configDiv = document.getElementById('vdb-config');
        const allConfigs = document.querySelectorAll('.db-config');

        allConfigs.forEach(config => config.style.display = 'none');

        if (database) {
            configDiv.style.display = 'block';
            document.getElementById(`config-${database}`).style.display = 'block';
            exportBtn.disabled = false;

            // Restore previously saved settings. Saved keys are the input id
            // suffixes, so "<database>-<key>" resolves back to the input.
            const savedConfig = await GM.getValue(`vdb_config_${database}`, {});
            Object.keys(savedConfig || {}).forEach(key => {
                const input = document.getElementById(`${database}-${key}`);
                if (input) input.value = savedConfig[key];
            });
        } else {
            configDiv.style.display = 'none';
            exportBtn.disabled = true;
        }
    });

    previewBtn.addEventListener('click', async () => {
        try {
            showStatus('Extracting content...', 'loading');
            const content = await extractPageContent();

            await openPreviewTab(buildPreviewHtml(content));

            showStatus(`✓ Preview opened! Found ${content.chunks.length} content chunks`, 'success');
        } catch (error) {
            console.error('Preview error:', error);
            showStatus(`Preview failed: ${error.message}`, 'error');
        }
    });

    exportBtn.addEventListener('click', async () => {
        const database = dbSelect.value;
        if (!database) return;

        // Block repeated clicks while an export is in flight.
        exportBtn.disabled = true;
        try {
            showStatus('Preparing export...', 'loading');

            // Collect config: each key is the input id without its
            // "<database>-" prefix (e.g. "pinecone-apikey" -> "apikey").
            const config = {};
            const configInputs = document.querySelectorAll(`#config-${database} input`);
            configInputs.forEach(input => {
                const key = input.id.replace(`${database}-`, '');
                config[key] = input.value;
            });

            // Save config
            await GM.setValue(`vdb_config_${database}`, config);

            showStatus('Extracting and exporting content...', 'loading');
            const result = await exportToVectorDB(database, config);

            showStatus(`✓ Successfully exported ${result.chunksExported} chunks to ${VECTOR_DATABASES[database].name}!`, 'success');
        } catch (error) {
            showStatus(`✗ Export failed: ${error.message}`, 'error');
        } finally {
            // Re-enable only if a database is still selected.
            exportBtn.disabled = !dbSelect.value;
        }
    });

    // Show `message` in the status box with the given CSS class ('loading',
    // 'success' or 'error'). Success and error messages hide after 5 seconds.
    function showStatus(message, type) {
        const status = document.getElementById('vdb-status');
        status.textContent = message;
        status.className = type;
        status.style.display = 'block';

        // Cancel any earlier auto-hide so it cannot hide this newer message.
        clearTimeout(statusTimer);
        statusTimer = null;

        if (type === 'success' || type === 'error') {
            statusTimer = setTimeout(() => {
                status.style.display = 'none';
            }, 5000);
        }
    }
}
730
// Entry point: build the UI right away when the document has already been
// parsed, otherwise wait for DOMContentLoaded.
function init() {
    const stillParsing = document.readyState === 'loading';
    if (!stillParsing) {
        createUI();
        return;
    }
    document.addEventListener('DOMContentLoaded', createUI);
}
739
740 init();
741})();