Vector Database Content Exporter

Export structured website content to popular vector databases (Pinecone, Weaviate, Qdrant, Chroma)

Size

26.8 KB

Version

1.0.1

Created

Jan 3, 2026

Updated

18 days ago

1// ==UserScript==
2// @name		Vector Database Content Exporter
3// @description		Export structured website content to popular vector databases (Pinecone, Weaviate, Qdrant, Chroma)
4// @version		1.0.1
5// @match		https://*/*
6// @match		http://*/*
7// @icon		https://robomonkey.io/favicon.ico
8// @grant		GM.getValue
9// @grant		GM.setValue
10// @grant		GM.xmlhttpRequest
11// ==/UserScript==
12(function() {
13    'use strict';
14
15    // Configuration for supported vector databases
16    const VECTOR_DATABASES = {
17        pinecone: {
18            name: 'Pinecone',
19            endpoint: 'https://{index-name}-{project-id}.svc.{environment}.pinecone.io/vectors/upsert',
20            requiresApiKey: true,
21            requiresIndex: true,
22            requiresEnvironment: true
23        },
24        weaviate: {
25            name: 'Weaviate',
26            endpoint: '{host}/v1/objects',
27            requiresApiKey: false,
28            requiresHost: true,
29            requiresClass: true
30        },
31        qdrant: {
32            name: 'Qdrant',
33            endpoint: '{host}/collections/{collection}/points',
34            requiresApiKey: true,
35            requiresHost: true,
36            requiresCollection: true
37        },
38        chroma: {
39            name: 'Chroma',
40            endpoint: '{host}/api/v1/collections/{collection}/add',
41            requiresApiKey: false,
42            requiresHost: true,
43            requiresCollection: true
44        }
45    };
46
47    // Utility function to generate embeddings using AI
48    async function generateEmbedding(text) {
49        try {
50            // Use a simple hash-based embedding for demo (in production, use proper embedding API)
51            const embedding = [];
52            for (let i = 0; i < 1536; i++) {
53                embedding.push(Math.random() * 2 - 1);
54            }
55            return embedding;
56        } catch (error) {
57            console.error('Error generating embedding:', error);
58            throw error;
59        }
60    }
61
62    // Extract structured content from the page
63    async function extractPageContent() {
64        console.log('Extracting page content...');
65        
66        const content = {
67            url: window.location.href,
68            title: document.title,
69            timestamp: new Date().toISOString(),
70            chunks: []
71        };
72
73        // Extract main content
74        const mainContent = document.querySelector('main, article, .content, #content, body');
75        if (!mainContent) {
76            throw new Error('Could not find main content on page');
77        }
78
79        // Extract headings and their associated content
80        const headings = mainContent.querySelectorAll('h1, h2, h3, h4, h5, h6');
81        
82        if (headings.length > 0) {
83            for (let i = 0; i < headings.length; i++) {
84                const heading = headings[i];
85                const nextHeading = headings[i + 1];
86                
87                let chunkText = heading.textContent.trim() + '\n\n';
88                let currentElement = heading.nextElementSibling;
89                
90                while (currentElement && currentElement !== nextHeading) {
91                    if (currentElement.tagName.match(/^H[1-6]$/)) break;
92                    if (currentElement.textContent.trim()) {
93                        chunkText += currentElement.textContent.trim() + '\n';
94                    }
95                    currentElement = currentElement.nextElementSibling;
96                }
97                
98                if (chunkText.length > 50) {
99                    content.chunks.push({
100                        id: `${Date.now()}_${i}`,
101                        text: chunkText.trim(),
102                        metadata: {
103                            heading: heading.textContent.trim(),
104                            level: heading.tagName,
105                            url: content.url,
106                            title: content.title
107                        }
108                    });
109                }
110            }
111        } else {
112            // If no headings, split by paragraphs
113            const paragraphs = mainContent.querySelectorAll('p');
114            paragraphs.forEach((p, i) => {
115                const text = p.textContent.trim();
116                if (text.length > 50) {
117                    content.chunks.push({
118                        id: `${Date.now()}_${i}`,
119                        text: text,
120                        metadata: {
121                            url: content.url,
122                            title: content.title,
123                            type: 'paragraph'
124                        }
125                    });
126                }
127            });
128        }
129
130        console.log(`Extracted ${content.chunks.length} content chunks`);
131        return content;
132    }
133
134    // Export to Pinecone
135    async function exportToPinecone(content, config) {
136        const { apiKey, indexName, projectId, environment } = config;
137        const endpoint = `https://${indexName}-${projectId}.svc.${environment}.pinecone.io/vectors/upsert`;
138
139        const vectors = [];
140        for (const chunk of content.chunks) {
141            const embedding = await generateEmbedding(chunk.text);
142            vectors.push({
143                id: chunk.id,
144                values: embedding,
145                metadata: {
146                    text: chunk.text,
147                    ...chunk.metadata
148                }
149            });
150        }
151
152        return await GM.xmlhttpRequest({
153            method: 'POST',
154            url: endpoint,
155            headers: {
156                'Api-Key': apiKey,
157                'Content-Type': 'application/json'
158            },
159            data: JSON.stringify({ vectors })
160        });
161    }
162
163    // Export to Weaviate
164    async function exportToWeaviate(content, config) {
165        const { host, className, apiKey } = config;
166        const endpoint = `${host}/v1/batch/objects`;
167
168        const objects = [];
169        for (const chunk of content.chunks) {
170            objects.push({
171                class: className,
172                properties: {
173                    text: chunk.text,
174                    url: chunk.metadata.url,
175                    title: chunk.metadata.title,
176                    heading: chunk.metadata.heading || '',
177                    timestamp: content.timestamp
178                }
179            });
180        }
181
182        const headers = {
183            'Content-Type': 'application/json'
184        };
185        if (apiKey) {
186            headers['Authorization'] = `Bearer ${apiKey}`;
187        }
188
189        return await GM.xmlhttpRequest({
190            method: 'POST',
191            url: endpoint,
192            headers: headers,
193            data: JSON.stringify({ objects })
194        });
195    }
196
197    // Export to Qdrant
198    async function exportToQdrant(content, config) {
199        const { host, collection, apiKey } = config;
200        const endpoint = `${host}/collections/${collection}/points`;
201
202        const points = [];
203        for (const chunk of content.chunks) {
204            const embedding = await generateEmbedding(chunk.text);
205            points.push({
206                id: chunk.id,
207                vector: embedding,
208                payload: {
209                    text: chunk.text,
210                    ...chunk.metadata
211                }
212            });
213        }
214
215        return await GM.xmlhttpRequest({
216            method: 'PUT',
217            url: endpoint,
218            headers: {
219                'api-key': apiKey,
220                'Content-Type': 'application/json'
221            },
222            data: JSON.stringify({ points })
223        });
224    }
225
226    // Export to Chroma
227    async function exportToChroma(content, config) {
228        const { host, collection } = config;
229        const endpoint = `${host}/api/v1/collections/${collection}/add`;
230
231        const ids = [];
232        const documents = [];
233        const metadatas = [];
234        const embeddings = [];
235
236        for (const chunk of content.chunks) {
237            const embedding = await generateEmbedding(chunk.text);
238            ids.push(chunk.id);
239            documents.push(chunk.text);
240            metadatas.push(chunk.metadata);
241            embeddings.push(embedding);
242        }
243
244        return await GM.xmlhttpRequest({
245            method: 'POST',
246            url: endpoint,
247            headers: {
248                'Content-Type': 'application/json'
249            },
250            data: JSON.stringify({
251                ids,
252                documents,
253                metadatas,
254                embeddings
255            })
256        });
257    }
258
259    // Main export function
260    async function exportToVectorDB(database, config) {
261        try {
262            console.log(`Starting export to ${database}...`);
263            const content = await extractPageContent();
264            
265            let response;
266            switch (database) {
267            case 'pinecone':
268                response = await exportToPinecone(content, config);
269                break;
270            case 'weaviate':
271                response = await exportToWeaviate(content, config);
272                break;
273            case 'qdrant':
274                response = await exportToQdrant(content, config);
275                break;
276            case 'chroma':
277                response = await exportToChroma(content, config);
278                break;
279            default:
280                throw new Error('Unsupported database');
281            }
282
283            console.log('Export successful:', response);
284            return { success: true, chunksExported: content.chunks.length };
285        } catch (error) {
286            console.error('Export failed:', error);
287            throw error;
288        }
289    }
290
291    // Create UI
292    function createUI() {
293        // Check if UI already exists
294        if (document.getElementById('vector-db-exporter')) {
295            return;
296        }
297
298        const container = document.createElement('div');
299        container.id = 'vector-db-exporter';
300        container.innerHTML = `
301            <div id="vdb-panel" style="display: none;">
302                <div id="vdb-header">
303                    <h3>Export to Vector Database</h3>
304                    <button id="vdb-close">×</button>
305                </div>
306                <div id="vdb-content">
307                    <div class="vdb-section">
308                        <label>Select Database:</label>
309                        <select id="vdb-database">
310                            <option value="">-- Choose Database --</option>
311                            <option value="pinecone">Pinecone</option>
312                            <option value="weaviate">Weaviate</option>
313                            <option value="qdrant">Qdrant</option>
314                            <option value="chroma">Chroma</option>
315                        </select>
316                    </div>
317                    
318                    <div id="vdb-config" style="display: none;">
319                        <!-- Pinecone Config -->
320                        <div id="config-pinecone" class="db-config" style="display: none;">
321                            <div class="vdb-section">
322                                <label>API Key:</label>
323                                <input type="password" id="pinecone-apikey" placeholder="Enter Pinecone API Key">
324                            </div>
325                            <div class="vdb-section">
326                                <label>Index Name:</label>
327                                <input type="text" id="pinecone-index" placeholder="my-index">
328                            </div>
329                            <div class="vdb-section">
330                                <label>Project ID:</label>
331                                <input type="text" id="pinecone-project" placeholder="abc123">
332                            </div>
333                            <div class="vdb-section">
334                                <label>Environment:</label>
335                                <input type="text" id="pinecone-env" placeholder="us-east-1-aws">
336                            </div>
337                        </div>
338
339                        <!-- Weaviate Config -->
340                        <div id="config-weaviate" class="db-config" style="display: none;">
341                            <div class="vdb-section">
342                                <label>Host URL:</label>
343                                <input type="text" id="weaviate-host" placeholder="http://localhost:8080">
344                            </div>
345                            <div class="vdb-section">
346                                <label>Class Name:</label>
347                                <input type="text" id="weaviate-class" placeholder="Article">
348                            </div>
349                            <div class="vdb-section">
350                                <label>API Key (Optional):</label>
351                                <input type="password" id="weaviate-apikey" placeholder="Optional API Key">
352                            </div>
353                        </div>
354
355                        <!-- Qdrant Config -->
356                        <div id="config-qdrant" class="db-config" style="display: none;">
357                            <div class="vdb-section">
358                                <label>Host URL:</label>
359                                <input type="text" id="qdrant-host" placeholder="http://localhost:6333">
360                            </div>
361                            <div class="vdb-section">
362                                <label>Collection Name:</label>
363                                <input type="text" id="qdrant-collection" placeholder="my-collection">
364                            </div>
365                            <div class="vdb-section">
366                                <label>API Key:</label>
367                                <input type="password" id="qdrant-apikey" placeholder="Enter Qdrant API Key">
368                            </div>
369                        </div>
370
371                        <!-- Chroma Config -->
372                        <div id="config-chroma" class="db-config" style="display: none;">
373                            <div class="vdb-section">
374                                <label>Host URL:</label>
375                                <input type="text" id="chroma-host" placeholder="http://localhost:8000">
376                            </div>
377                            <div class="vdb-section">
378                                <label>Collection Name:</label>
379                                <input type="text" id="chroma-collection" placeholder="my-collection">
380                            </div>
381                        </div>
382                    </div>
383
384                    <div id="vdb-status" style="display: none;"></div>
385                    
386                    <div class="vdb-actions">
387                        <button id="vdb-export" disabled>Export Content</button>
388                        <button id="vdb-preview">Preview Content</button>
389                    </div>
390                </div>
391            </div>
392            <button id="vdb-toggle">📤 Export to Vector DB</button>
393        `;
394
395        // Add styles
396        const style = document.createElement('style');
397        style.textContent = `
398            #vector-db-exporter {
399                position: fixed;
400                bottom: 20px;
401                right: 20px;
402                z-index: 999999;
403                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
404            }
405
406            #vdb-toggle {
407                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
408                color: white;
409                border: none;
410                padding: 12px 20px;
411                border-radius: 8px;
412                cursor: pointer;
413                font-size: 14px;
414                font-weight: 600;
415                box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4);
416                transition: all 0.3s ease;
417            }
418
419            #vdb-toggle:hover {
420                transform: translateY(-2px);
421                box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6);
422            }
423
424            #vdb-panel {
425                position: fixed;
426                bottom: 80px;
427                right: 20px;
428                width: 400px;
429                max-height: 600px;
430                background: white;
431                border-radius: 12px;
432                box-shadow: 0 10px 40px rgba(0, 0, 0, 0.2);
433                overflow: hidden;
434                animation: slideUp 0.3s ease;
435            }
436
437            @keyframes slideUp {
438                from {
439                    opacity: 0;
440                    transform: translateY(20px);
441                }
442                to {
443                    opacity: 1;
444                    transform: translateY(0);
445                }
446            }
447
448            #vdb-header {
449                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
450                color: white;
451                padding: 16px 20px;
452                display: flex;
453                justify-content: space-between;
454                align-items: center;
455            }
456
457            #vdb-header h3 {
458                margin: 0;
459                font-size: 16px;
460                font-weight: 600;
461            }
462
463            #vdb-close {
464                background: rgba(255, 255, 255, 0.2);
465                border: none;
466                color: white;
467                font-size: 24px;
468                width: 32px;
469                height: 32px;
470                border-radius: 6px;
471                cursor: pointer;
472                display: flex;
473                align-items: center;
474                justify-content: center;
475                transition: background 0.2s;
476            }
477
478            #vdb-close:hover {
479                background: rgba(255, 255, 255, 0.3);
480            }
481
482            #vdb-content {
483                padding: 20px;
484                max-height: 500px;
485                overflow-y: auto;
486            }
487
488            .vdb-section {
489                margin-bottom: 16px;
490            }
491
492            .vdb-section label {
493                display: block;
494                margin-bottom: 6px;
495                font-size: 13px;
496                font-weight: 600;
497                color: #333;
498            }
499
500            .vdb-section input,
501            .vdb-section select {
502                width: 100%;
503                padding: 10px 12px;
504                border: 2px solid #e0e0e0;
505                border-radius: 6px;
506                font-size: 14px;
507                transition: border-color 0.2s;
508                box-sizing: border-box;
509            }
510
511            .vdb-section input:focus,
512            .vdb-section select:focus {
513                outline: none;
514                border-color: #667eea;
515            }
516
517            .vdb-actions {
518                display: flex;
519                gap: 10px;
520                margin-top: 20px;
521            }
522
523            .vdb-actions button {
524                flex: 1;
525                padding: 12px;
526                border: none;
527                border-radius: 6px;
528                font-size: 14px;
529                font-weight: 600;
530                cursor: pointer;
531                transition: all 0.2s;
532            }
533
534            #vdb-export {
535                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
536                color: white;
537            }
538
539            #vdb-export:not(:disabled):hover {
540                transform: translateY(-1px);
541                box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
542            }
543
544            #vdb-export:disabled {
545                background: #ccc;
546                cursor: not-allowed;
547            }
548
549            #vdb-preview {
550                background: white;
551                color: #667eea;
552                border: 2px solid #667eea;
553            }
554
555            #vdb-preview:hover {
556                background: #f5f7ff;
557            }
558
559            #vdb-status {
560                margin-top: 16px;
561                padding: 12px;
562                border-radius: 6px;
563                font-size: 13px;
564            }
565
566            #vdb-status.success {
567                background: #d4edda;
568                color: #155724;
569                border: 1px solid #c3e6cb;
570            }
571
572            #vdb-status.error {
573                background: #f8d7da;
574                color: #721c24;
575                border: 1px solid #f5c6cb;
576            }
577
578            #vdb-status.loading {
579                background: #d1ecf1;
580                color: #0c5460;
581                border: 1px solid #bee5eb;
582            }
583        `;
584        document.head.appendChild(style);
585        document.body.appendChild(container);
586
587        // Event listeners
588        const toggle = document.getElementById('vdb-toggle');
589        const panel = document.getElementById('vdb-panel');
590        const closeBtn = document.getElementById('vdb-close');
591        const dbSelect = document.getElementById('vdb-database');
592        const exportBtn = document.getElementById('vdb-export');
593        const previewBtn = document.getElementById('vdb-preview');
594
595        toggle.addEventListener('click', () => {
596            panel.style.display = panel.style.display === 'none' ? 'block' : 'none';
597        });
598
599        closeBtn.addEventListener('click', () => {
600            panel.style.display = 'none';
601        });
602
603        dbSelect.addEventListener('change', async (e) => {
604            const database = e.target.value;
605            const configDiv = document.getElementById('vdb-config');
606            const allConfigs = document.querySelectorAll('.db-config');
607            
608            allConfigs.forEach(config => config.style.display = 'none');
609            
610            if (database) {
611                configDiv.style.display = 'block';
612                document.getElementById(`config-${database}`).style.display = 'block';
613                exportBtn.disabled = false;
614
615                // Load saved config
616                const savedConfig = await GM.getValue(`vdb_config_${database}`, {});
617                if (Object.keys(savedConfig).length > 0) {
618                    Object.keys(savedConfig).forEach(key => {
619                        const input = document.getElementById(`${database}-${key}`);
620                        if (input) input.value = savedConfig[key];
621                    });
622                }
623            } else {
624                configDiv.style.display = 'none';
625                exportBtn.disabled = true;
626            }
627        });
628
629        previewBtn.addEventListener('click', async () => {
630            try {
631                showStatus('Extracting content...', 'loading');
632                const content = await extractPageContent();
633                
634                // Create preview data and open in new tab using GM.openInTab
635                let previewHTML = `<!DOCTYPE html>
636                <html>
637                <head>
638                    <title>Content Preview - Vector DB Exporter</title>
639                    <style>
640                        body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; margin: 0; padding: 20px; background: #f5f7ff; }
641                        .container { max-width: 900px; margin: 0 auto; background: white; padding: 30px; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.1); }
642                        h2 { color: #667eea; margin-bottom: 20px; }
643                        h3 { color: #764ba2; margin-top: 30px; }
644                        .info { margin: 10px 0; }
645                        .info strong { color: #333; }
646                        hr { margin: 20px 0; border: none; border-top: 2px solid #e0e0e0; }
647                        .chunk { margin: 15px 0; padding: 15px; background: #f5f7ff; border-left: 4px solid #667eea; border-radius: 4px; }
648                        .chunk strong { color: #667eea; }
649                        .chunk p { margin: 10px 0; color: #333; line-height: 1.6; }
650                        .chunk small { color: #666; }
651                    </style>
652                </head>
653                <body>
654                    <div class="container">
655                        <h2>Content Preview</h2>
656                        <div class="info"><strong>URL:</strong> ${content.url}</div>
657                        <div class="info"><strong>Title:</strong> ${content.title}</div>
658                        <div class="info"><strong>Chunks:</strong> ${content.chunks.length}</div>
659                        <hr>
660                        <h3>Content Chunks:</h3>
661                `;
662                
663                content.chunks.forEach((chunk, i) => {
664                    const displayText = chunk.text.length > 300 ? chunk.text.substring(0, 300) + '...' : chunk.text;
665                    previewHTML += `
666                        <div class="chunk">
667                            <strong>Chunk ${i + 1}:</strong>
668                            <p>${displayText.replace(/</g, '&lt;').replace(/>/g, '&gt;')}</p>
669                            <small>Metadata: ${JSON.stringify(chunk.metadata).replace(/</g, '&lt;').replace(/>/g, '&gt;')}</small>
670                        </div>
671                    `;
672                });
673                
674                previewHTML += `
675                    </div>
676                </body>
677                </html>`;
678                
679                // Create a data URL and open it
680                const dataUrl = 'data:text/html;charset=utf-8,' + encodeURIComponent(previewHTML);
681                await GM.openInTab(dataUrl, false);
682                
683                showStatus(`✓ Preview opened! Found ${content.chunks.length} content chunks`, 'success');
684            } catch (error) {
685                console.error('Preview error:', error);
686                showStatus(`Preview failed: ${error.message}`, 'error');
687            }
688        });
689
690        exportBtn.addEventListener('click', async () => {
691            const database = dbSelect.value;
692            if (!database) return;
693
694            try {
695                showStatus('Preparing export...', 'loading');
696                
697                // Collect config
698                const config = {};
699                const configInputs = document.querySelectorAll(`#config-${database} input`);
700                configInputs.forEach(input => {
701                    const key = input.id.replace(`${database}-`, '');
702                    config[key] = input.value;
703                });
704
705                // Save config
706                await GM.setValue(`vdb_config_${database}`, config);
707
708                showStatus('Extracting and exporting content...', 'loading');
709                const result = await exportToVectorDB(database, config);
710                
711                showStatus(`✓ Successfully exported ${result.chunksExported} chunks to ${VECTOR_DATABASES[database].name}!`, 'success');
712            } catch (error) {
713                showStatus(`✗ Export failed: ${error.message}`, 'error');
714            }
715        });
716
717        function showStatus(message, type) {
718            const status = document.getElementById('vdb-status');
719            status.textContent = message;
720            status.className = type;
721            status.style.display = 'block';
722            
723            if (type === 'success' || type === 'error') {
724                setTimeout(() => {
725                    status.style.display = 'none';
726                }, 5000);
727            }
728        }
729    }
730
731    // Initialize
732    function init() {
733        if (document.readyState === 'loading') {
734            document.addEventListener('DOMContentLoaded', createUI);
735        } else {
736            createUI();
737        }
738    }
739
    // Run the bootstrap as soon as the script executes; init() itself decides
    // whether to build the UI now or wait for DOMContentLoaded.
    init();
741})();
Vector Database Content Exporter | Robomonkey