I want to create a Python program that will find and download files from a list of pages. I have found that the Chrome extension Chrono Download Manager is quite effective at identifying the various file types loaded on a page for downloading. I would like to try to recreate this functionality in my program so that I can customise where it gets the files from and what it does with them.
Among the files of Chrono Download Manager is one named sniffer.js (the Sniffer is the part of the extension that identifies files):
// Message handler for the "g_links" command: walks the page's links, images,
// audio/video/source elements, scripts and <link> elements, collects each
// distinct URL accepted by the scheme filter, and sends the batch back to the
// extension as a single "add_links" message.
chrome.runtime.onMessage.addListener(function(message) {
    // Any command other than "g_links" is ignored by this listener.
    if (message.cmd !== "g_links") {
        return;
    }

    // Accepts ftp://, http://, https:// and data: URLs (case-insensitive).
    const urlPattern = /^(((ftp|https?):\/\/)|(data:)).+$/i;
    const collected = [];
    // Maps every URL already collected to 1 so each URL is reported once.
    const seen = {};

    // Reads `urlAttr` from each element and records URLs not seen before.
    // `textAttr` is optional; when it is omitted the entry's text is "".
    function harvest(elements, urlAttr, textAttr) {
        if (!elements || !urlAttr) {
            return;
        }
        Array.from(elements).forEach(function(element) {
            const url = element[urlAttr];
            if (!urlPattern.test(url) || seen[url] != null) {
                return;
            }
            collected.push({
                url: url,
                desc: element.download || "",
                text: textAttr ? element[textAttr] || "" : "",
                title: element.title || "",
                referer: ""
            });
            seen[url] = 1;
        });
    }

    harvest(document.links, "href", "text");
    harvest(document.images, "src", "alt");
    harvest(document.getElementsByTagName("audio"), "src");
    harvest(document.getElementsByTagName("video"), "src");
    harvest(document.getElementsByTagName("source"), "src");
    harvest(document.getElementsByTagName("script"), "src");
    harvest(document.getElementsByTagName("link"), "href");

    // Stay silent when the page yielded nothing.
    if (collected.length > 0) {
        chrome.runtime.sendMessage({
            cmd: "add_links",
            data: collected
        });
    }
});
// Capture-phase mousedown hook: when a mouse button with index 0..2 is
// pressed, report the enclosing <a> element (if any) and the nearest ancestor
// carrying a `src` (if any) to the extension as "url_desc" messages.
document.addEventListener("mousedown", function(event) {
    // Walks up from `node` while nodes have a tagName; returns the first <a>.
    function closestAnchor(node) {
        let current = node;
        while (current && current.tagName) {
            if (current.tagName.toLowerCase() === "a") {
                return current;
            }
            current = current.parentNode;
        }
        return null;
    }

    // Walks up from `node` and returns the first node with a truthy `src`.
    function closestWithSrc(node) {
        let current = node;
        while (current) {
            if (current.src) {
                return current;
            }
            current = current.parentNode;
        }
        return null;
    }

    // Written as a negated range check so a non-numeric button is skipped too.
    if (!(event.button >= 0 && event.button <= 2)) {
        return;
    }

    const anchor = closestAnchor(event.target);
    // Hrefs whose first character is "j" are skipped (presumably javascript: URLs).
    if (anchor && anchor.href && anchor.href[0] !== "j") {
        chrome.runtime.sendMessage({
            cmd: "url_desc",
            url: anchor.href,
            download: (anchor.download || "").trim(),
            text: (anchor.text || "").trim(),
            title: (anchor.title || "").trim()
        });
    }

    const sourced = closestWithSrc(event.target);
    if (sourced && sourced.src) {
        chrome.runtime.sendMessage({
            cmd: "url_desc",
            url: sourced.src,
            download: (sourced.download || "").trim(),
            text: (sourced.alt || "").trim(),
            title: (sourced.title || "").trim()
        });
    }
}, true);
I think that the sniffer.js file is the part that enables the extension to find files. If I am correct, would it be possible to recreate this (like the a(document.getElementsByTagName("video"), "src"); part) in some form using Python? If so, how would I do this and what should I read into? Should I use things like lxml, BeautifulSoup and chromedriver?
Any help and pointers you can give would be greatly appreciated.
Liam