试图以合理的方式抓取文章类别，但卡在了获取小说名称及其 URL 这一步。
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from requests import get
# Scrape readlightnovel.org: collect the category page URLs linked from the
# home page, then fetch each category page and print the "headline" elements
# found on it.
site = "https://readlightnovel.org/"

# Browser-like User-Agent sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"}
# requests waits indefinitely by default; bound each request (seconds) so a
# stalled server cannot hang the script.
REQUEST_TIMEOUT = 30

r = get(site, headers=HEADERS, timeout=REQUEST_TIMEOUT)
html_content = r.content
soup = BeautifulSoup(html_content, "lxml")

# Every element whose href ends in "/category/<word>" is treated as a category
# link. The URL is resolved from the href itself (urljoin handles both relative
# and absolute hrefs) because the visible link text may contain spaces or
# capitalisation that does not match the real URL slug.
all_categories = [
    urljoin(site, link["href"])
    for link in soup.find_all(href=re.compile(r'/category/\w+$'))
]

for category in all_categories:
    category_html = get(category, headers=HEADERS, timeout=REQUEST_TIMEOUT).content
    soup2 = BeautifulSoup(category_html, "lxml")
    # Search the category page that was just fetched (soup2); searching the
    # home-page soup would give the same result on every iteration.
    # NOTE(review): "headline" is matched as a tag name here. If the site
    # marks titles with a CSS class instead, this needs class_=... — confirm
    # against the page markup.
    Novel_name = soup2.find_all("headline")
    print(Novel_name)
此代码将为您提供一个包含所有小说标题的列表：