💥一键搞定!网页内容与附件保存大升级
宝子们👋,今天要给大家分享一个超实用脚本的升级过程。这个脚本可以在特定网页上一键保存页面内容和附件,原本就很实用,经过这次升级,更是为 ragflow 知识库入库提供了极大便利👏!下面就一起来看看具体的升级内容吧。
🌟适用网页
这个脚本适用于 http://www.nhc.gov.cn/sps/pqt/new_list.shtml ,无论是目录页还是文章页,都能轻松应对🤩。
📝改进前的代码回顾
改进前的脚本虽然能实现保存网页内容为 MD 文件和下载附件的功能,但在文件区分方面不够清晰,不利于 ragflow 知识库入库。
💪改进内容大揭秘
- 文件名添加发布日期:为了更好地区分不同时间发布的文件,在保存的页面名称和附件名称前都添加了发布日期(格式为 YYYYMMDD)。这样在查看文件时,能快速了解文件的发布时间,方便进行分类和管理🧐。
- 优化日期获取逻辑:新增了 `getPublishDate` 函数,用于从页面中提取发布日期。如果页面中没有明确的日期信息,会默认使用当前日期,确保文件名的完整性和准确性📅。
🎉改进后的完整代码
// ==UserScript==
// @name Save Web Page and Attachments
// @namespace http://tampermonkey.net/
// @version 1.4
// @description Save web page content as Markdown and download attachments on nhc.gov.cn. Supports both catalog and article pages.
// @author You
// @match *://*.nhc.gov.cn/*
// @grant GM_xmlhttpRequest
// @require https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/2.0.5/FileSaver.min.js
// ==/UserScript==
(function () {
'use strict';
// Decide which UI to inject. The catalog (list) page gets a link picker;
// every other matched page is treated as an article page.
// Both http and https are accepted (the @match rule covers both) and a
// trailing query string or fragment is tolerated.
const isCatalogPage = /^https?:\/\/www\.nhc\.gov\.cn\/sps\/pqt\/new_list(?:_\d+)?\.shtml(?:[?#].*)?$/.test(window.location.href);
if (isCatalogPage) {
  // ---- Catalog page: floating button that opens a link picker ----
  const button = document.createElement('button');
  button.textContent = '显示正文链接';
  button.style.position = 'fixed';
  button.style.top = '50%';
  button.style.left = '2.5%';
  button.style.transform = 'translate(-50%, -50%)';
  button.style.zIndex = '9999';
  document.body.appendChild(button);

  // The picker panel currently on screen, if any. Tracking it lets a second
  // click replace the panel instead of stacking duplicates (which would also
  // duplicate the element ids used by the labels).
  let activePanel = null;

  button.addEventListener('click', function () {
    const zxxxList = document.querySelector('ul.zxxx_list');
    if (!zxxxList) {
      alert('未找到正文链接列表');
      return;
    }
    if (activePanel) {
      activePanel.remove();
      activePanel = null;
    }

    // One { name, url } record per anchor in the article list.
    const linkList = Array.from(zxxxList.querySelectorAll('a'), (link) => ({
      name: link.textContent.trim(),
      url: link.href,
    }));

    const selectDiv = document.createElement('div');
    selectDiv.style.position = 'fixed';
    selectDiv.style.top = '10%';
    selectDiv.style.left = '10%';
    selectDiv.style.zIndex = '9999';
    selectDiv.style.backgroundColor = 'white';
    selectDiv.style.padding = '10px';
    selectDiv.style.border = '1px solid black';

    // Each checkbox is kept next to the link it controls, so selection never
    // depends on document-wide query order.
    const entries = [];

    const selectAllCheckbox = document.createElement('input');
    selectAllCheckbox.type = 'checkbox';
    selectAllCheckbox.id = 'selectAll';
    selectAllCheckbox.addEventListener('change', function () {
      for (const { checkbox } of entries) {
        checkbox.checked = selectAllCheckbox.checked;
      }
    });
    const selectAllLabel = document.createElement('label');
    selectAllLabel.textContent = '全选';
    selectAllLabel.htmlFor = 'selectAll';
    selectDiv.appendChild(selectAllCheckbox);
    selectDiv.appendChild(selectAllLabel);

    linkList.forEach((linkObj, index) => {
      const checkbox = document.createElement('input');
      checkbox.type = 'checkbox';
      checkbox.classList.add('link-checkbox');
      checkbox.id = `link-${index}`;
      checkbox.checked = true;
      const label = document.createElement('label');
      label.textContent = linkObj.name;
      label.htmlFor = `link-${index}`;
      selectDiv.appendChild(checkbox);
      selectDiv.appendChild(label);
      selectDiv.appendChild(document.createElement('br'));
      entries.push({ checkbox, linkObj });
    });

    const saveButton = document.createElement('button');
    saveButton.textContent = '保存选中';
    saveButton.addEventListener('click', async function () {
      const selectedLinks = entries
        .filter(({ checkbox }) => checkbox.checked)
        .map(({ linkObj }) => linkObj);
      // Pages are processed one at a time. A failure on one page is logged
      // and the batch continues with the next page.
      for (const linkObj of selectedLinks) {
        try {
          await processSinglePage(linkObj);
        } catch (error) {
          console.error(`处理页面 ${linkObj.url} 失败:`, error);
        }
      }
    });
    selectDiv.appendChild(saveButton);

    document.body.appendChild(selectDiv);
    activePanel = selectDiv;
  });
} else {
  // ---- Article page: floating toolbar with three save actions ----
  const buttonContainer = document.createElement('div');
  buttonContainer.style.position = 'fixed';
  buttonContainer.style.top = '50%';
  buttonContainer.style.right = '10px';
  buttonContainer.style.transform = 'translateY(-50%)';
  buttonContainer.style.zIndex = '9999';
  buttonContainer.style.backgroundColor = 'white';
  buttonContainer.style.border = '1px solid #ccc';
  buttonContainer.style.padding = '5px';
  document.body.appendChild(buttonContainer);

  const saveAttachmentsButton = document.createElement('button');
  saveAttachmentsButton.textContent = '保存附件';
  buttonContainer.appendChild(saveAttachmentsButton);

  const savePageButton = document.createElement('button');
  savePageButton.textContent = '保存页面';
  buttonContainer.appendChild(savePageButton);

  const saveAllButton = document.createElement('button');
  saveAllButton.textContent = '全部保存';
  buttonContainer.appendChild(saveAllButton);

  saveAttachmentsButton.addEventListener('click', saveAttachments);
  savePageButton.addEventListener('click', savePage);
  saveAllButton.addEventListener('click', saveAll);
}
/**
 * Collect the article body as an HTML string.
 *
 * Starts at the first `.index_title` element and concatenates the outerHTML
 * of it and each following element sibling, stopping before the first
 * `.footer` element (or when the sibling chain ends).
 *
 * @param {Document} [doc=document] - Document to read from.
 * @returns {string} Concatenated HTML, or '' when either marker is missing.
 */
function getMainContent(doc = document) {
  const first = doc.querySelector('.index_title');
  const stop = doc.querySelector('.footer');
  if (!(first && stop)) {
    return '';
  }
  const pieces = [];
  for (let node = first; node && node !== stop; node = node.nextElementSibling) {
    pieces.push(node.outerHTML);
  }
  return pieces.join('');
}
/**
 * Find attachment links on a page by loading it in a temporary window.
 *
 * Opens `url` in a new window, polls until its document reports
 * `readyState === 'complete'`, collects every anchor whose href ends with a
 * known office/PDF extension, then closes the window.
 *
 * @param {string} url - Page to scan.
 * @returns {Promise<Array<{link: HTMLAnchorElement, fullUrl: string}>>}
 *   The matching anchors with their absolute URLs.
 * @throws {Error} If the window cannot be opened (e.g. popup blocked), the
 *   opened document cannot be accessed, or loading exceeds the timeout.
 */
async function getAttachments(url) {
  const LOAD_POLL_MS = 100;
  const LOAD_TIMEOUT_MS = 30000;
  const attachmentExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];

  const newWindow = window.open(url, '_blank');
  // window.open returns null when the browser blocks the popup; fail loudly
  // instead of polling a null reference forever.
  if (!newWindow) {
    throw new Error(`无法打开新窗口(可能被浏览器拦截弹窗): ${url}`);
  }

  try {
    await new Promise((resolve, reject) => {
      const deadline = Date.now() + LOAD_TIMEOUT_MS;
      const checkInterval = setInterval(() => {
        try {
          // A freshly opened window first shows an about:blank document that
          // can already be "complete"; wait for the real page instead.
          const onInitialBlank = newWindow.location.href === 'about:blank';
          if (!onInitialBlank && newWindow.document.readyState === 'complete') {
            clearInterval(checkInterval);
            resolve();
            return;
          }
        } catch (error) {
          // Typically a cross-origin access error after a redirect.
          clearInterval(checkInterval);
          reject(error);
          return;
        }
        if (Date.now() >= deadline) {
          clearInterval(checkInterval);
          reject(new Error(`等待页面加载超时: ${url}`));
        }
      }, LOAD_POLL_MS);
    });

    const attachments = [];
    for (const link of newWindow.document.querySelectorAll('a')) {
      const href = link.href;
      if (attachmentExtensions.some((ext) => href.toLowerCase().endsWith(ext))) {
        // href is normally absolute already; resolve against the page URL
        // as a fallback.
        const fullUrl = href.startsWith('http') ? href : new URL(href, url).href;
        attachments.push({ link, fullUrl });
      }
    }
    return attachments;
  } finally {
    // Always close the helper window, including on failure.
    newWindow.close();
  }
}
/**
 * Convert an HTML fragment to text/Markdown and trigger a browser download.
 *
 * The fragment is parsed into a detached <div>. Text nodes are concatenated
 * in document order; any <table> is rendered through htmlTableToMarkdown.
 * The result is downloaded as `${date}${title}.md`.
 *
 * @param {string} content - HTML fragment to convert.
 * @param {string} title - File name stem (expected to be file-system safe).
 * @param {string} date - Prefix placed directly before the title.
 * @returns {void}
 */
function saveAsMd(content, title, date) {
  const container = document.createElement('div');
  container.innerHTML = content;

  // Depth-first flattening of a node into text; tables are delegated.
  const toText = (node) => {
    if (node.tagName === 'TABLE') {
      return htmlTableToMarkdown(node);
    }
    let text = '';
    for (const child of Array.from(node.childNodes)) {
      text += child.nodeType === Node.TEXT_NODE ? child.textContent : toText(child);
    }
    return text;
  };

  const markdownBlob = new Blob([toText(container)], { type: 'text/markdown;charset=utf-8' });
  const objectUrl = URL.createObjectURL(markdownBlob);

  // Clicking a temporary anchor with a `download` attribute starts the save.
  const anchor = document.createElement('a');
  anchor.href = objectUrl;
  anchor.download = `${date}${title}.md`;
  anchor.click();
  URL.revokeObjectURL(objectUrl);
}
/**
 * Render an HTML table as a Markdown pipe table.
 *
 * Every row becomes `| cell | cell |`; line breaks inside a cell are replaced
 * by a single space. A `| ---- |` separator row, sized to the first row's
 * cell count, is emitted right after the first row.
 *
 * @param {HTMLTableElement} tableElement - Table to convert.
 * @returns {string} Markdown text, one line per row, each ending in '\n'.
 */
function htmlTableToMarkdown(tableElement) {
  const lines = [];
  Array.from(tableElement.rows).forEach((row, rowIndex) => {
    const cellTexts = Array.from(row.cells, (cell) =>
      cell.textContent.replace(/(\r\n|\n|\r)/gm, ' '));
    lines.push(`|${cellTexts.map((cellText) => ` ${cellText} |`).join('')}`);
    if (rowIndex === 0) {
      // The first row is treated as the header.
      lines.push(`|${' ---- |'.repeat(cellTexts.length)}`);
    }
  });
  return lines.map((line) => `${line}\n`).join('');
}
/**
 * Download a remote file through GM_xmlhttpRequest and save it locally.
 *
 * The response is requested as an ArrayBuffer, wrapped in a Blob and handed
 * to the browser through a temporary anchor carrying a `download` attribute.
 *
 * @param {string} url - Absolute URL of the file.
 * @param {string} fileName - Name to save the file under.
 * @returns {Promise<void>} Resolves once the save has been triggered; rejects
 *   with an Error when the status is not 200, or with the value passed to
 *   `onerror` when the request itself fails.
 */
function downloadFile(url, fileName) {
  // Hand the received bytes to the browser as a file download.
  const triggerBrowserSave = (buffer) => {
    const payload = new Blob([buffer], { type: 'application/octet-stream' });
    const objectUrl = URL.createObjectURL(payload);
    const anchor = document.createElement('a');
    anchor.href = objectUrl;
    anchor.download = fileName;
    anchor.click();
    URL.revokeObjectURL(objectUrl);
  };

  return new Promise((resolve, reject) => {
    GM_xmlhttpRequest({
      method: 'GET',
      url,
      responseType: 'arraybuffer',
      onload(response) {
        if (response.status !== 200) {
          reject(new Error(`下载失败,状态码: ${response.status}`));
          return;
        }
        triggerBrowserSave(response.response);
        resolve();
      },
      onerror(error) {
        reject(error);
      },
    });
  });
}
/**
 * Download every attachment linked from the current page.
 *
 * Each file is saved as `<date><link text><extension>`. The link text is
 * trimmed and stripped of characters that are invalid in file names, and the
 * extension is appended only when the link text does not already end with it.
 * Downloads run one after another; a failure is logged and the remaining
 * attachments are still attempted.
 *
 * @returns {Promise<void>} Resolves when all attachments have been attempted.
 */
async function saveAttachments() {
  const date = getPublishDate();

  let attachments;
  try {
    attachments = await getAttachments(window.location.href);
  } catch (error) {
    // This runs from a click handler; report the failure instead of leaving
    // an unhandled rejection.
    console.error(`获取附件链接失败:`, error);
    return;
  }

  if (attachments.length === 0) {
    console.log('未检测到附件链接。');
    return;
  }

  for (const { link, fullUrl } of attachments) {
    try {
      const linkName = link.textContent.trim().replace(/[\\/*?:"<>|]/g, '_');
      const extension = getExtension(link.href);
      // Link texts often already contain the file name with its extension;
      // avoid producing names such as "report.pdf.pdf".
      const alreadyHasExtension = linkName.toLowerCase().endsWith(extension.toLowerCase());
      const fileName = `${date}${linkName}${alreadyHasExtension ? '' : extension}`;
      console.log(`开始下载附件: ${fullUrl}`);
      await downloadFile(fullUrl, fileName);
      console.log(`附件 ${fileName} 已保存`);
    } catch (error) {
      console.error(`下载附件 ${fullUrl} 失败:`, error);
    }
  }
}
/**
 * Extract the file extension (including the leading dot) from a URL.
 *
 * Only the last path segment is inspected, so a query string, a fragment or
 * dots in the host name or in directory names do not affect the result.
 *
 * @param {string} url - Absolute or relative URL.
 * @returns {string} The extension such as '.pdf', or '' when the last path
 *   segment contains no dot.
 */
function getExtension(url) {
  let path;
  try {
    path = new URL(url).pathname;
  } catch (error) {
    // Not an absolute URL: strip query/fragment by hand.
    path = String(url).split(/[?#]/, 1)[0];
  }
  const lastSegment = path.slice(path.lastIndexOf('/') + 1);
  const dotIndex = lastSegment.lastIndexOf('.');
  return dotIndex === -1 ? '' : lastSegment.slice(dotIndex);
}
/**
 * Determine the publication date of a page as a `YYYYMMDD` string.
 *
 * Looks for an element with class `publish-date` and extracts the first
 * `YYYY-MM-DD` date from its text. Both the selector and the date format are
 * placeholders that may need adapting to the real page markup. When no date
 * is found, today's local date is returned instead.
 *
 * @param {Document} [doc=document] - Document to inspect. Accepting it as a
 *   parameter lets callers that fetched and parsed another page read that
 *   page's date rather than the current page's.
 * @returns {string} Date formatted as `YYYYMMDD`.
 */
function getPublishDate(doc = document) {
  const dateElement = doc.querySelector('.publish-date');
  if (dateElement) {
    const match = dateElement.textContent.match(/(\d{4}-\d{2}-\d{2})/);
    if (match) {
      return match[1].replace(/-/g, '');
    }
  }
  // Fallback: current local date.
  const now = new Date();
  const year = now.getFullYear();
  const month = String(now.getMonth() + 1).padStart(2, '0');
  const day = String(now.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
}
/**
 * Save the current page's main content as a Markdown file.
 *
 * The file name stem is the document title with characters that are invalid
 * in file names replaced by '_'; the publish date is passed along as prefix.
 *
 * @returns {void}
 */
function savePage() {
  const safeTitle = document.title.replace(/[\\/*?:"<>|]/g, '_');
  const stamp = getPublishDate();
  saveAsMd(getMainContent(), safeTitle, stamp);
}
/**
 * Save everything for the current page: calls savePage() and then awaits
 * saveAttachments().
 *
 * @returns {Promise<void>} Settles when saveAttachments() settles.
 */
async function saveAll() {
// savePage() is called without await, so it is started first.
savePage();
await saveAttachments();
}
/**
 * Fetch one article page, save its content as Markdown and download its
 * attachments.
 *
 * The page is fetched with GM_xmlhttpRequest and parsed with DOMParser. The
 * Markdown file is named from `linkObj.name`; the date prefix is obtained by
 * calling getPublishDate with the parsed document. Attachments are downloaded
 * one after another, and the returned promise settles only after every
 * download has been attempted.
 *
 * @param {{name: string, url: string}} linkObj - Link text and URL of the
 *   article, as collected from the catalog page.
 * @returns {Promise<void>} Resolves when the page and all attachments have
 *   been processed.
 * @throws Rejects when the page request fails or returns a non-2xx status,
 *   or when the attachment list cannot be obtained. Individual attachment
 *   download failures are logged and do not reject.
 */
async function processSinglePage(linkObj) {
  // Adapt the callback-style GM_xmlhttpRequest to a promise. Everything else
  // happens outside the callback so thrown errors reject this function
  // instead of leaving its promise pending.
  const response = await new Promise((resolve, reject) => {
    GM_xmlhttpRequest({
      method: 'GET',
      url: linkObj.url,
      onload: resolve,
      onerror(error) {
        console.error(`请求 ${linkObj.url} 失败:`, error);
        reject(error);
      },
    });
  });
  if (response.status < 200 || response.status >= 300) {
    throw new Error(`请求 ${linkObj.url} 失败,状态码: ${response.status}`);
  }

  const doc = new DOMParser().parseFromString(response.responseText, 'text/html');
  // Absolute URL of the article page.
  const absolutePageUrl = new URL(linkObj.url, window.location.origin).href;
  const pageTitle = linkObj.name.replace(/[\\/*?:"<>|]/g, '_');
  const date = getPublishDate(doc);
  saveAsMd(getMainContent(doc), pageTitle, date);

  let attachments;
  try {
    attachments = await getAttachments(absolutePageUrl);
  } catch (error) {
    console.error(`获取附件链接失败:`, error);
    throw error;
  }

  if (attachments.length === 0) {
    console.log('未检测到附件链接。');
    return;
  }

  // Sequential on purpose: awaiting inside for...of means the caller's
  // `await processSinglePage(...)` really waits for the downloads.
  for (const { link, fullUrl } of attachments) {
    try {
      const linkName = link.textContent.trim().replace(/[\\/*?:"<>|]/g, '_');
      const extension = getExtension(link.href);
      // Avoid names such as "report.pdf.pdf" when the link text already
      // carries the extension.
      const alreadyHasExtension = linkName.toLowerCase().endsWith(extension.toLowerCase());
      const fileName = `${date}${linkName}${alreadyHasExtension ? '' : extension}`;
      console.log(`开始下载附件: ${fullUrl}`);
      await downloadFile(fullUrl, fileName);
      console.log(`附件 ${fileName} 已保存`);
    } catch (error) {
      console.error(`下载附件 ${fullUrl} 失败:`, error);
    }
  }
}
})();
💖总结
通过这次改进,脚本在文件命名上更加规范,能更好地满足 ragflow 知识库入库的需求。