在 Python 网页抓取中卡在获取小说标题及其链接上

尝试按类别抓取小说,但卡在了获取小说名称及其 URL 这一步。

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from requests import get

# Scrape the site's home page for category links, then visit every category
# page and print the novel titles (with their links, when present) found there.
site = "https://readlightnovel.org/"
# Browser-like User-Agent sent with every request; defined once so the home
# page request and the per-category requests stay in sync.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"}

r = get(site, headers=headers)
html_content = r.content
soup = BeautifulSoup(html_content, "lxml")

# Build each category URL from the element's own href rather than its visible
# text: the text is a display label and need not match the URL slug.
# urljoin copes with both relative ("/category/x") and absolute hrefs, and
# dict.fromkeys drops repeated links while preserving page order.
all_categories = list(
    dict.fromkeys(
        urljoin(site, link["href"])
        for link in soup.find_all(href=re.compile(r"/category/\w+$"))
    )
)

for category in all_categories:
    page = get(category, headers=headers).content
    soup2 = BeautifulSoup(page, "lxml")

    # Query the freshly parsed category page (soup2), not the home page.
    # NOTE(review): assumes novel titles are marked up with
    # itemprop="headline" -- confirm against the live page markup.
    Novel_name = []
    for heading in soup2.find_all(attrs={"itemprop": "headline"}):
        # The link may sit inside the heading or wrap it; try both.
        anchor = heading.find("a", href=True) or heading.find_parent("a", href=True)
        url = urljoin(category, anchor["href"]) if anchor is not None else None
        Novel_name.append((heading.get_text(strip=True), url))
    print(Novel_name)

评论
完美的爱
完美的爱

此代码将为您提供一个包含所有小说标题的列表:

# Download the home page and collect the text of every
# <h1 itemprop="headline"> element into a list.
site = "https://readlightnovel.org/"
req = get(
    site,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    },
)
soup = BeautifulSoup(req.content, "lxml")

# Despite its name, `category` holds the matched headline tags.
category = soup.find_all("h1", attrs={"itemprop": "headline"})

titles = [heading.get_text() for heading in category]
print(titles)
点赞
评论