小红书批量数据采集器

自动滚动加载内容并批量采集数据,支持导出JSON格式

Size

12.0 KB

Version

1.0.1

Created

Nov 21, 2025

Updated

22 days ago

// ==UserScript==
// @name		小红书批量数据采集器
// @description		自动滚动加载内容并批量采集数据,支持导出JSON格式
// @version		1.0.1
// @match		https://*.xiaohongshu.com/*
// @icon		https://fe-video-qc.xhscdn.com/fe-platform/ed8fe781ce9e16c1bfac2cd962f0721edabe2e49.ico
// ==/UserScript==
8(function() {
9    'use strict';
10
    // Shared state for the collector.
    // Note records gathered so far (objects returned by collectNoteData).
    let collectedData = [];
    // True while a collection run is active; flipped by startCollecting/stopCollecting.
    let isCollecting = false;
    // Handle returned by setInterval for the scroll-and-scan loop; null when no loop is running.
    let autoScrollInterval = null;
15
16    // 防抖函数
17    function debounce(func, wait) {
18        let timeout;
19        return function executedFunction(...args) {
20            const later = () => {
21                clearTimeout(timeout);
22                func(...args);
23            };
24            clearTimeout(timeout);
25            timeout = setTimeout(later, wait);
26        };
27    }
28
29    // 采集单个笔记数据
30    function collectNoteData(noteElement) {
31        try {
32            const noteId = noteElement.getAttribute('data-note-id') || 
33                          noteElement.querySelector('a')?.href?.match(/\/explore\/([a-zA-Z0-9]+)/)?.[1] || 
34                          'unknown_' + Date.now();
35            
36            // 检查是否已采集
37            if (collectedData.some(item => item.id === noteId)) {
38                return null;
39            }
40
41            const titleElement = noteElement.querySelector('.title, .note-title, [class*="title"]');
42            const authorElement = noteElement.querySelector('.author, .name, [class*="author"], [class*="name"]');
43            const linkElement = noteElement.querySelector('a[href*="/explore/"]');
44            const imageElement = noteElement.querySelector('img');
45            const likeElement = noteElement.querySelector('[class*="like"], [class*="count"]');
46
47            const data = {
48                id: noteId,
49                title: titleElement?.textContent?.trim() || '无标题',
50                author: authorElement?.textContent?.trim() || '未知作者',
51                link: linkElement?.href || '',
52                image: imageElement?.src || '',
53                likes: likeElement?.textContent?.trim() || '0',
54                collectedAt: new Date().toISOString()
55            };
56
57            console.log('采集到笔记数据:', data);
58            return data;
59        } catch (error) {
60            console.error('采集笔记数据失败:', error);
61            return null;
62        }
63    }
64
65    // 扫描页面并采集数据
66    function scanAndCollect() {
67        const noteSelectors = [
68            'section.note-item',
69            '.note-item',
70            '[class*="note-item"]',
71            '[class*="feed-item"]',
72            'a[href*="/explore/"]'
73        ];
74
75        let notes = [];
76        for (const selector of noteSelectors) {
77            const elements = document.querySelectorAll(selector);
78            if (elements.length > 0) {
79                notes = Array.from(elements);
80                console.log(`找到 ${notes.length} 个笔记元素,使用选择器: ${selector}`);
81                break;
82            }
83        }
84
85        let newCount = 0;
86        notes.forEach(note => {
87            const data = collectNoteData(note);
88            if (data) {
89                collectedData.push(data);
90                newCount++;
91            }
92        });
93
94        if (newCount > 0) {
95            updateUI();
96            console.log(`新采集 ${newCount} 条数据,总计 ${collectedData.length}`);
97        }
98    }
99
100    // 自动滚动
101    function autoScroll() {
102        const scrollHeight = document.documentElement.scrollHeight;
103        const currentScroll = window.pageYOffset + window.innerHeight;
104        
105        if (currentScroll < scrollHeight - 100) {
106            window.scrollBy({
107                top: 800,
108                behavior: 'smooth'
109            });
110            console.log('自动滚动中...');
111        } else {
112            console.log('已滚动到底部,等待加载更多内容...');
113            // 等待新内容加载
114            setTimeout(() => {
115                const newScrollHeight = document.documentElement.scrollHeight;
116                if (newScrollHeight === scrollHeight) {
117                    console.log('没有更多内容了');
118                }
119            }, 2000);
120        }
121    }
122
123    // 开始采集
124    function startCollecting() {
125        if (isCollecting) return;
126        
127        isCollecting = true;
128        console.log('开始批量采集数据...');
129        
130        // 立即扫描一次
131        scanAndCollect();
132        
133        // 启动自动滚动
134        autoScrollInterval = setInterval(() => {
135            autoScroll();
136            scanAndCollect();
137        }, 3000);
138        
139        updateUI();
140    }
141
142    // 停止采集
143    function stopCollecting() {
144        isCollecting = false;
145        if (autoScrollInterval) {
146            clearInterval(autoScrollInterval);
147            autoScrollInterval = null;
148        }
149        console.log('停止采集');
150        updateUI();
151    }
152
153    // 导出JSON
154    function exportJSON() {
155        if (collectedData.length === 0) {
156            alert('没有数据可导出!请先采集数据。');
157            return;
158        }
159
160        const dataStr = JSON.stringify(collectedData, null, 2);
161        const blob = new Blob([dataStr], { type: 'application/json' });
162        const url = URL.createObjectURL(blob);
163        
164        const a = document.createElement('a');
165        a.href = url;
166        a.download = `xiaohongshu_data_${new Date().getTime()}.json`;
167        document.body.appendChild(a);
168        a.click();
169        document.body.removeChild(a);
170        URL.revokeObjectURL(url);
171        
172        console.log(`成功导出 ${collectedData.length} 条数据`);
173        alert(`成功导出 ${collectedData.length} 条数据!`);
174    }
175
176    // 清空数据
177    function clearData() {
178        if (confirm(`确定要清空已采集的 ${collectedData.length} 条数据吗?`)) {
179            collectedData = [];
180            updateUI();
181            console.log('数据已清空');
182        }
183    }
184
185    // 更新UI显示
186    function updateUI() {
187        const countElement = document.getElementById('xhs-collector-count');
188        const statusElement = document.getElementById('xhs-collector-status');
189        const toggleBtn = document.getElementById('xhs-collector-toggle');
190        
191        if (countElement) {
192            countElement.textContent = collectedData.length;
193        }
194        
195        if (statusElement) {
196            statusElement.textContent = isCollecting ? '采集中...' : '已停止';
197            statusElement.style.color = isCollecting ? '#00ff00' : '#999';
198        }
199        
200        if (toggleBtn) {
201            toggleBtn.textContent = isCollecting ? '⏸ 停止采集' : '▶ 开始采集';
202            toggleBtn.style.background = isCollecting ? '#ff4757' : '#2ed573';
203        }
204    }
205
206    // 创建控制面板
207    function createControlPanel() {
208        const panel = document.createElement('div');
209        panel.id = 'xhs-collector-panel';
210        panel.innerHTML = `
211            <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 15px; border-radius: 10px 10px 0 0; cursor: move; user-select: none;">
212                <div style="font-size: 16px; font-weight: bold; margin-bottom: 5px;">📊 数据采集器</div>
213                <div style="font-size: 12px; opacity: 0.9;">自动滚动 + 批量采集</div>
214            </div>
215            <div style="padding: 15px; background: white;">
216                <div style="display: flex; align-items: center; justify-content: space-between; margin-bottom: 15px; padding: 10px; background: #f8f9fa; border-radius: 8px;">
217                    <span style="font-size: 14px; color: #333;">已采集:</span>
218                    <span id="xhs-collector-count" style="font-size: 24px; font-weight: bold; color: #667eea;">0</span>
219                </div>
220                <div style="display: flex; align-items: center; justify-content: space-between; margin-bottom: 15px; padding: 8px; background: #f8f9fa; border-radius: 8px;">
221                    <span style="font-size: 13px; color: #666;">状态:</span>
222                    <span id="xhs-collector-status" style="font-size: 13px; color: #999;">已停止</span>
223                </div>
224                <button id="xhs-collector-toggle" style="width: 100%; padding: 12px; margin-bottom: 10px; background: #2ed573; color: white; border: none; border-radius: 8px; font-size: 14px; font-weight: bold; cursor: pointer; transition: all 0.3s;">
225                    ▶ 开始采集
226                </button>
227                <button id="xhs-collector-export" style="width: 100%; padding: 12px; margin-bottom: 10px; background: #5352ed; color: white; border: none; border-radius: 8px; font-size: 14px; font-weight: bold; cursor: pointer; transition: all 0.3s;">
228                    💾 导出JSON
229                </button>
230                <button id="xhs-collector-clear" style="width: 100%; padding: 10px; background: #ff6b6b; color: white; border: none; border-radius: 8px; font-size: 13px; cursor: pointer; transition: all 0.3s;">
231                    🗑️ 清空数据
232                </button>
233            </div>
234        `;
235
236        // 样式
237        panel.style.cssText = `
238            position: fixed;
239            top: 100px;
240            right: 20px;
241            width: 280px;
242            background: white;
243            border-radius: 10px;
244            box-shadow: 0 10px 40px rgba(0,0,0,0.2);
245            z-index: 999999;
246            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
247        `;
248
249        document.body.appendChild(panel);
250
251        // 添加按钮悬停效果
252        const style = document.createElement('style');
253        style.textContent = `
254            #xhs-collector-panel button:hover {
255                transform: translateY(-2px);
256                box-shadow: 0 4px 12px rgba(0,0,0,0.15);
257            }
258            #xhs-collector-panel button:active {
259                transform: translateY(0);
260            }
261        `;
262        document.head.appendChild(style);
263
264        // 绑定事件
265        document.getElementById('xhs-collector-toggle').addEventListener('click', () => {
266            if (isCollecting) {
267                stopCollecting();
268            } else {
269                startCollecting();
270            }
271        });
272
273        document.getElementById('xhs-collector-export').addEventListener('click', exportJSON);
274        document.getElementById('xhs-collector-clear').addEventListener('click', clearData);
275
276        // 拖拽功能
277        let isDragging = false;
278        let currentX;
279        let currentY;
280        let initialX;
281        let initialY;
282
283        const header = panel.querySelector('div');
284        header.addEventListener('mousedown', (e) => {
285            isDragging = true;
286            initialX = e.clientX - panel.offsetLeft;
287            initialY = e.clientY - panel.offsetTop;
288        });
289
290        document.addEventListener('mousemove', (e) => {
291            if (isDragging) {
292                e.preventDefault();
293                currentX = e.clientX - initialX;
294                currentY = e.clientY - initialY;
295                panel.style.left = currentX + 'px';
296                panel.style.top = currentY + 'px';
297                panel.style.right = 'auto';
298            }
299        });
300
301        document.addEventListener('mouseup', () => {
302            isDragging = false;
303        });
304
305        console.log('数据采集器控制面板已创建');
306    }
307
308    // 监听DOM变化
309    const observer = new MutationObserver(debounce(() => {
310        if (isCollecting) {
311            scanAndCollect();
312        }
313    }, 1000));
314
315    // 初始化
316    function init() {
317        console.log('小红书批量数据采集器已启动');
318        
319        // 等待页面加载完成
320        if (document.readyState === 'loading') {
321            document.addEventListener('DOMContentLoaded', () => {
322                setTimeout(createControlPanel, 1000);
323            });
324        } else {
325            setTimeout(createControlPanel, 1000);
326        }
327
328        // 开始监听DOM变化
329        observer.observe(document.body, {
330            childList: true,
331            subtree: true
332        });
333    }
334
335    init();
336})();
小红书批量数据采集器 | Robomonkey