在下面的代码中,“ self.htmlList”列表多次打印正文/单词,因此我无法取出最后一个列表值。
from urllib.request import urlopen, Request
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.htmlList = []
self.wordDict = {}
def handle_data(self, data):
words = data.strip().split()
for i in words:
if i.isalpha() == True: # Not Working
self.htmlList.append(i)
return self.htmlList
self.handleContent(self.htmlList)
def handleContent(self,data):
for word in data:
if word in self.wordDict:
self.wordDict[word] += 1
else:
self.wordDict[word] = 1
print(self.wordDict)
#return self.wordDict
if __name__ == "__main__":
url = 'http://www.shortreckonings.com'
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
response = urlopen(Request(url,headers=headers))
htmlContent = response.read()
htmlContent = htmlContent.decode("utf-8")
response.close()
parser = MyHTMLParser()
parser.feed(htmlContent)