一键搞定!网页内容与附件保存大升级

💥一键搞定!网页内容与附件保存大升级

💥一键搞定!网页内容与附件保存大升级

宝子们👋,今天要给大家分享一个超实用脚本的升级过程。这个脚本可以在特定网页上一键保存页面内容和附件,原本就很实用,经过这次升级,更是为 ragflow 知识库入库提供了极大便利👏!下面就一起来看看具体的升级内容吧。

🌟适用网页

这个脚本适用于http://www.nhc.gov.cn/sps/pqt/new_list.shtml,无论是目录页还是文章页,都能轻松应对🤩。

📝改进前的代码回顾

改进前的脚本虽然能实现保存网页内容为 MD 文件和下载附件的功能,但在文件区分方面不够清晰,不利于 ragflow 知识库入库。

💪改进内容大揭秘

  • 文件名添加发布日期:为了更好地区分不同时间发布的文件,在保存的页面名称和附件名称前都添加了发布日期(格式为 YYYYMMDD)。这样在查看文件时,能快速了解文件的发布时间,方便进行分类和管理🧐。
  • 优化日期获取逻辑:新增了 getPublishDate 函数,用于从页面中提取发布日期。如果页面中没有找到日期信息,会退回使用当前日期,保证文件名始终带有日期前缀;需要注意,此时前缀是保存当天的日期而不是实际发布日期,而且代码中的日期元素选择器只是示例,使用前请根据页面实际结构调整📅。

🎉改进后的完整代码


// ==UserScript==
// @name         Save Web Page and Attachments
// @namespace    http://tampermonkey.net/
// @version      1.4
// @description  Provide functions to save web - page content as MD and download attachments on nhc.gov.cn. Support both catalog and article pages.
// @author       You
// @match        *://*.nhc.gov.cn/*
// @grant        GM_xmlhttpRequest
// @require      https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/2.0.5/FileSaver.min.js
// ==/UserScript==

(function () {
    'use strict';

    // Detect the catalog (list) page from host and path instead of the full
    // href, so https URLs and URLs carrying a query string or fragment are
    // recognised as well (the @match rule admits any scheme).
    const isCatalogPage =
        window.location.hostname === 'www.nhc.gov.cn' &&
        /^\/sps\/pqt\/new_list(?:_\d+)?\.shtml$/.test(window.location.pathname);

    if (isCatalogPage) {
        // Catalog page: a floating button opens a panel that lists every
        // article link so the user can pick which ones to save in a batch.
        const panelId = 'nhc-save-link-panel';

        const button = document.createElement('button');
        button.textContent = '显示正文链接';
        button.style.position = 'fixed';
        button.style.top = '50%';
        button.style.left = '2.5%';
        button.style.transform = 'translate(-50%, -50%)';
        button.style.zIndex = '9999';
        document.body.appendChild(button);

        button.addEventListener('click', () => {
            const zxxxList = document.querySelector('ul.zxxx_list');
            if (!zxxxList) {
                alert('未找到正文链接列表');
                return;
            }

            // Drop the panel from a previous click. Stacked panels would
            // duplicate element ids and break the checkbox/link pairing.
            const previousPanel = document.getElementById(panelId);
            if (previousPanel) {
                previousPanel.remove();
            }

            const linkList = Array.from(zxxxList.querySelectorAll('a'), (link) => ({
                name: link.textContent.trim(),
                url: link.href,
            }));

            const selectDiv = document.createElement('div');
            selectDiv.id = panelId;
            selectDiv.style.position = 'fixed';
            selectDiv.style.top = '10%';
            selectDiv.style.left = '10%';
            selectDiv.style.zIndex = '9999';
            selectDiv.style.backgroundColor = 'white';
            selectDiv.style.padding = '10px';
            selectDiv.style.border = '1px solid black';

            // "Select all" toggle. Queries are scoped to this panel so that
            // checkboxes elsewhere in the document are never touched.
            const selectAllCheckbox = document.createElement('input');
            selectAllCheckbox.type = 'checkbox';
            selectAllCheckbox.id = 'selectAll';
            selectAllCheckbox.addEventListener('change', () => {
                selectDiv.querySelectorAll('.link-checkbox').forEach((checkbox) => {
                    checkbox.checked = selectAllCheckbox.checked;
                });
            });
            const selectAllLabel = document.createElement('label');
            selectAllLabel.textContent = '全选';
            selectAllLabel.htmlFor = 'selectAll';
            selectDiv.appendChild(selectAllCheckbox);
            selectDiv.appendChild(selectAllLabel);

            // One pre-checked checkbox per article link, in list order.
            linkList.forEach((linkObj, index) => {
                const checkbox = document.createElement('input');
                checkbox.type = 'checkbox';
                checkbox.classList.add('link-checkbox');
                checkbox.id = `link-${index}`;
                checkbox.checked = true;

                const label = document.createElement('label');
                label.textContent = linkObj.name;
                label.htmlFor = `link-${index}`;

                selectDiv.appendChild(checkbox);
                selectDiv.appendChild(label);
                selectDiv.appendChild(document.createElement('br'));
            });

            const saveButton = document.createElement('button');
            saveButton.textContent = '保存选中';
            saveButton.addEventListener('click', async () => {
                // Checkbox i belongs to linkList[i] because both were built
                // from the same array in the same order inside this panel.
                const checkboxes = Array.from(selectDiv.querySelectorAll('.link-checkbox'));
                const selectedLinks = linkList.filter((_, index) => checkboxes[index].checked);

                // Pages are processed one after another. A failing page is
                // logged and skipped so the rest of the batch still runs.
                for (const linkObj of selectedLinks) {
                    try {
                        await processSinglePage(linkObj);
                    } catch (error) {
                        console.error(`处理页面 ${linkObj.url} 失败:`, error);
                    }
                }
            });
            selectDiv.appendChild(saveButton);
            document.body.appendChild(selectDiv);
        });
    } else {
        // Article page: a floating container with three action buttons.
        const buttonContainer = document.createElement('div');
        buttonContainer.style.position = 'fixed';
        buttonContainer.style.top = '50%';
        buttonContainer.style.right = '10px';
        buttonContainer.style.transform = 'translateY(-50%)';
        buttonContainer.style.zIndex = '9999';
        buttonContainer.style.backgroundColor = 'white';
        buttonContainer.style.border = '1px solid #ccc';
        buttonContainer.style.padding = '5px';
        document.body.appendChild(buttonContainer);

        const saveAttachmentsButton = document.createElement('button');
        saveAttachmentsButton.textContent = '保存附件';
        buttonContainer.appendChild(saveAttachmentsButton);

        const savePageButton = document.createElement('button');
        savePageButton.textContent = '保存页面';
        buttonContainer.appendChild(savePageButton);

        const saveAllButton = document.createElement('button');
        saveAllButton.textContent = '全部保存';
        buttonContainer.appendChild(saveAllButton);

        saveAttachmentsButton.addEventListener('click', saveAttachments);
        savePageButton.addEventListener('click', savePage);
        saveAllButton.addEventListener('click', saveAll);
    }

    /**
     * Collect the HTML of the article body.
     *
     * Starting at the first `.index_title` element, the `outerHTML` of that
     * element and of each following sibling is concatenated. The walk stops
     * when the first `.footer` element is reached or the siblings run out.
     *
     * @param {Document} [doc=document] Document to read from.
     * @returns {string} Concatenated HTML, or '' if either marker is missing.
     */
    function getMainContent(doc = document) {
        const firstNode = doc.querySelector('.index_title');
        const stopNode = doc.querySelector('.footer');
        if (firstNode && stopNode) {
            let html = '';
            for (let node = firstNode; node && node !== stopNode; node = node.nextElementSibling) {
                html += node.outerHTML;
            }
            return html;
        }
        return '';
    }

    /**
     * Find attachment links on a page by loading it in a temporary window.
     *
     * The page is opened with `window.open`, polled until it has finished
     * loading, scanned for anchors whose href ends with a known document
     * extension, and then closed again.
     *
     * @param {string} url Address of the page to scan.
     * @returns {Promise<Array<{link: HTMLAnchorElement, fullUrl: string}>>}
     *     One entry per attachment anchor with its absolute URL.
     * @throws {Error} If the window cannot be opened (e.g. popup blocker),
     *     is closed early, cannot be read, or does not load in time.
     */
    async function getAttachments(url) {
        const attachmentExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];
        const pollIntervalMs = 100;
        const loadTimeoutMs = 30000;

        const newWindow = window.open(url, '_blank');
        // window.open returns null when the popup is blocked.
        if (!newWindow) {
            throw new Error(`无法打开新窗口(可能被浏览器拦截): ${url}`);
        }

        try {
            await new Promise((resolve, reject) => {
                const startedAt = Date.now();
                const checkInterval = setInterval(() => {
                    const stop = (settle, value) => {
                        clearInterval(checkInterval);
                        settle(value);
                    };

                    if (newWindow.closed) {
                        stop(reject, new Error(`窗口在加载完成前被关闭: ${url}`));
                        return;
                    }

                    let loaded = false;
                    try {
                        // A fresh window starts on about:blank, which already
                        // reports readyState "complete". Wait for the real
                        // page before trusting readyState.
                        loaded = newWindow.location.href !== 'about:blank' &&
                            newWindow.document.readyState === 'complete';
                    } catch (error) {
                        // Reading the window failed (e.g. it navigated to
                        // another origin); polling again will not help.
                        stop(reject, error);
                        return;
                    }

                    if (loaded) {
                        stop(resolve);
                    } else if (Date.now() - startedAt > loadTimeoutMs) {
                        stop(reject, new Error(`等待页面加载超时: ${url}`));
                    }
                }, pollIntervalMs);
            });

            const attachments = [];
            newWindow.document.querySelectorAll('a').forEach((link) => {
                const href = link.href;
                if (!attachmentExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
                    return;
                }
                let fullUrl = href;
                if (!fullUrl.startsWith('http')) {
                    fullUrl = new URL(fullUrl, url).href;
                }
                attachments.push({ link, fullUrl });
            });
            return attachments;
        } finally {
            // Always close the helper window, including on failure.
            newWindow.close();
        }
    }

    /**
     * Convert an HTML fragment to text/Markdown and download it as a file.
     *
     * Text nodes are concatenated in document order; every <table> is turned
     * into a Markdown table via htmlTableToMarkdown. The download is named
     * `${date}${title}.md`.
     *
     * @param {string} content HTML markup of the article body.
     * @param {string} title File-name-safe page title.
     * @param {string} date Date prefix for the file name.
     * @returns {void}
     */
    function saveAsMd(content, title, date) {
        // Delay before releasing the blob URL; revoking right after click()
        // can race with the browser starting the download.
        const revokeDelayMs = 1000;

        // Parse inside a <template>: its content lives in an inert document,
        // so markup fetched from the network cannot load resources or fire
        // inline event handlers while it is only being read for its text.
        const template = document.createElement('template');
        template.innerHTML = content;

        function processElement(element) {
            if (element.tagName === 'TABLE') {
                return htmlTableToMarkdown(element);
            }
            let result = '';
            for (const child of element.childNodes) {
                result += child.nodeType === Node.TEXT_NODE
                    ? child.textContent
                    : processElement(child);
            }
            return result;
        }

        const markdownContent = processElement(template.content);
        const blob = new Blob([markdownContent], { type: 'text/markdown;charset=utf-8' });
        const url = URL.createObjectURL(blob);
        const link = document.createElement('a');
        link.href = url;
        link.download = `${date}${title}.md`;
        link.click();
        setTimeout(() => URL.revokeObjectURL(url), revokeDelayMs);
    }

    /**
     * Render an HTML table as a Markdown table.
     *
     * Every row becomes one `| a | b |` line. A `| ---- |` separator sized to
     * the first row is emitted after it, so the first row acts as the header.
     * Line breaks inside a cell are replaced by spaces, and literal `|`
     * characters are escaped so they cannot split a cell in two.
     *
     * @param {HTMLTableElement} tableElement Table to convert; only `rows`,
     *     `cells` and `textContent` are read.
     * @returns {string} Markdown text, each line terminated by '\n';
     *     '' for a table without rows.
     */
    function htmlTableToMarkdown(tableElement) {
        const lines = [];
        Array.from(tableElement.rows).forEach((row, rowIndex) => {
            const cellTexts = Array.from(row.cells, (cell) => cell.textContent
                .replace(/(\r\n|\n|\r)/gm, ' ')
                .replace(/\|/g, '\\|'));
            lines.push(`|${cellTexts.map((text) => ` ${text} |`).join('')}`);
            if (rowIndex === 0) {
                lines.push(`|${' ---- |'.repeat(cellTexts.length)}`);
            }
        });
        return lines.map((line) => `${line}\n`).join('');
    }

    /**
     * Download a remote file through GM_xmlhttpRequest and save it locally.
     *
     * The response body is wrapped in a Blob and saved by clicking a
     * temporary anchor with a `download` attribute.
     *
     * @param {string} url Address of the file to fetch.
     * @param {string} fileName Name to save the file under.
     * @returns {Promise<void>} Resolves once the save has been triggered.
     *     Rejects with an Error on a non-200 status, network error, timeout,
     *     abort, or any failure while saving.
     */
    function downloadFile(url, fileName) {
        // Delay before releasing the blob URL; revoking right after click()
        // can race with the browser starting the download.
        const revokeDelayMs = 1000;

        return new Promise((resolve, reject) => {
            // Reject with a real Error and keep the raw detail on `cause`.
            const fail = (reason, detail) => {
                const error = new Error(`下载失败(${reason}): ${url}`);
                error.cause = detail;
                reject(error);
            };

            GM_xmlhttpRequest({
                method: 'GET',
                url,
                responseType: 'arraybuffer',
                onload(response) {
                    if (response.status !== 200) {
                        reject(new Error(`下载失败,状态码: ${response.status}`));
                        return;
                    }
                    try {
                        const blob = new Blob([response.response], { type: 'application/octet-stream' });
                        const objectUrl = URL.createObjectURL(blob);
                        const link = document.createElement('a');
                        link.href = objectUrl;
                        link.download = fileName;
                        link.click();
                        setTimeout(() => URL.revokeObjectURL(objectUrl), revokeDelayMs);
                        resolve();
                    } catch (error) {
                        // Without this, an exception here would leave the
                        // promise pending forever.
                        reject(error);
                    }
                },
                onerror(detail) {
                    fail('网络错误', detail);
                },
                ontimeout(detail) {
                    fail('请求超时', detail);
                },
                onabort(detail) {
                    fail('请求中止', detail);
                },
            });
        });
    }

    /**
     * Download every attachment linked from the current page.
     *
     * Each file is named `${date}${linkText}${extension}`, where the link
     * text is trimmed and characters that are illegal in file names are
     * replaced by '_'. Files are downloaded one at a time; a failure is
     * logged and the remaining files are still attempted.
     *
     * @returns {Promise<void>} Resolves when all downloads were attempted.
     *     Never rejects; failures are reported on the console.
     */
    async function saveAttachments() {
        const date = getPublishDate();

        let attachments;
        try {
            attachments = await getAttachments(window.location.href);
        } catch (error) {
            // This runs as a click handler, so a rejection would otherwise
            // surface only as an unhandled promise rejection.
            console.error('获取附件链接失败:', error);
            return;
        }

        if (attachments.length === 0) {
            console.log('未检测到附件链接。');
            return;
        }

        for (const { link, fullUrl } of attachments) {
            try {
                // Trim first: link text often carries surrounding whitespace
                // or line breaks that do not belong in a file name.
                const linkName = link.textContent.trim().replace(/[\\/*?:"<>|]/g, '_');
                const fileName = `${date}${linkName}${getExtension(link.href)}`;
                console.log(`开始下载附件: ${fullUrl}`);
                await downloadFile(fullUrl, fileName);
                console.log(`附件 ${fileName} 已保存`);
            } catch (error) {
                console.error(`下载附件 ${fullUrl} 失败:`, error);
            }
        }
    }

    /**
     * Extract the file extension from a URL.
     *
     * Only the last path segment is inspected, after the query string and
     * fragment have been removed, so `a.pdf?v=1` yields `.pdf` and a dot in
     * a directory name is not mistaken for an extension.
     *
     * @param {string} url URL or path of the file.
     * @returns {string} Extension including the leading dot, or '' when the
     *     last path segment contains no dot.
     */
    function getExtension(url) {
        const pathOnly = String(url).split(/[?#]/)[0];
        const lastSegment = pathOnly.slice(pathOnly.lastIndexOf('/') + 1);
        const dotIndex = lastSegment.lastIndexOf('.');
        return dotIndex === -1 ? '' : lastSegment.slice(dotIndex);
    }

    /**
     * Get the publication date of a page as a `YYYYMMDD` string.
     *
     * The date is read from the first element with class `publish-date`.
     * NOTE(review): that selector is an example placeholder and must be
     * adjusted to the real page structure. Accepted formats are a four-digit
     * year followed by month and day, separated by `-`, `/`, `.` or the
     * characters 年/月 (for example `2024-01-05` or `2024年1月5日`).
     * Single-digit months and days are zero-padded.
     *
     * @param {Document} [doc=document] Document to read the date from, so
     *     pages fetched in the background can supply their own date.
     * @returns {string} The date found in `doc`, or today's local date when
     *     the element is missing or holds no recognisable date.
     */
    function getPublishDate(doc = document) {
        const pad = (value) => String(value).padStart(2, '0');

        const dateElement = doc.querySelector('.publish-date');
        if (dateElement) {
            const match = dateElement.textContent.match(/(\d{4})[-\/.年](\d{1,2})[-\/.月](\d{1,2})/);
            if (match) {
                return `${match[1]}${pad(match[2])}${pad(match[3])}`;
            }
        }

        // Fallback: the current local date. This is the save date, not the
        // real publication date.
        const now = new Date();
        return `${now.getFullYear()}${pad(now.getMonth() + 1)}${pad(now.getDate())}`;
    }

    /**
     * Save the current page's main content as a Markdown file.
     *
     * The file name is built from the publish date and the document title,
     * with characters that are illegal in file names replaced by '_'.
     *
     * @returns {void}
     */
    function savePage() {
        const safeTitle = document.title.replace(/[\\/*?:"<>|]/g, '_');
        const publishDate = getPublishDate();
        saveAsMd(getMainContent(), safeTitle, publishDate);
    }

    /**
     * Save the page as Markdown, then download its attachments.
     *
     * @returns {Promise<void>} Settles when saveAttachments has settled.
     */
    async function saveAll() {
        savePage();
        const attachmentsDone = saveAttachments();
        await attachmentsDone;
    }

    /**
     * Fetch one article page, save its body as Markdown and download its
     * attachments.
     *
     * The page is requested with GM_xmlhttpRequest and parsed with DOMParser.
     * The parsed document is handed to getPublishDate and getMainContent.
     * Attachments are downloaded one at a time and the returned promise
     * settles only after every download has been attempted. A failed
     * attachment is logged and does not stop the others.
     *
     * @param {{name: string, url: string}} linkObj Link text (used as the
     *     title) and address of the article page.
     * @returns {Promise<void>} Rejects when the page request fails, returns
     *     a non-2xx status, or the attachment links cannot be collected.
     */
    async function processSinglePage(linkObj) {
        const response = await new Promise((resolve, reject) => {
            const fail = (error) => {
                console.error(`请求 ${linkObj.url} 失败:`, error);
                reject(error);
            };
            GM_xmlhttpRequest({
                method: 'GET',
                url: linkObj.url,
                onload: resolve,
                onerror: fail,
                // Without these a stalled request would leave the promise
                // pending and block the rest of the batch.
                ontimeout: fail,
                onabort: fail,
            });
        });

        // onload also fires for HTTP error responses; do not save those.
        if (response.status < 200 || response.status >= 300) {
            throw new Error(`请求 ${linkObj.url} 失败,状态码: ${response.status}`);
        }

        const doc = new DOMParser().parseFromString(response.responseText, 'text/html');

        // Absolute address of the article page.
        const absolutePageUrl = new URL(linkObj.url, window.location.origin).href;

        const pageTitle = linkObj.name.replace(/[\\/*?:"<>|]/g, '_');
        const date = getPublishDate(doc);
        saveAsMd(getMainContent(doc), pageTitle, date);

        let attachments;
        try {
            attachments = await getAttachments(absolutePageUrl);
        } catch (error) {
            console.error(`获取附件链接失败:`, error);
            throw error;
        }

        if (attachments.length === 0) {
            console.log('未检测到附件链接。');
            return;
        }

        // for...of rather than forEach(async ...): forEach ignores the
        // returned promises, so the function would finish before the
        // downloads do.
        for (const { link, fullUrl } of attachments) {
            try {
                const linkName = link.textContent.trim().replace(/[\\/*?:"<>|]/g, '_');
                const fileName = `${date}${linkName}${getExtension(link.href)}`;
                console.log(`开始下载附件: ${fullUrl}`);
                await downloadFile(fullUrl, fileName);
                console.log(`附件 ${fileName} 已保存`);
            } catch (error) {
                console.error(`下载附件 ${fullUrl} 失败:`, error);
            }
        }
    }
})();
    

💖总结

通过这次改进,脚本在文件命名上更加规范,能更好地满足 ragflow 知识库入库的需求。

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注