import requests
from bs4 import BeautifulSoup
# Cleans text (removes any punctuation)
def CleanText(text):
    """Return *text* as a string with a fixed set of punctuation removed.

    The input is coerced with ``str()``; then every ``.``, ``?``, ``!``,
    ``(`` and ``)`` is deleted, along with the literal two-character
    sequence backslash + ``n``.

    Args:
        text: Any object; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string.
    """
    text = str(text)
    # NOTE: r'\n' is a raw string, so it matches a backslash followed by
    # the letter n (as found in repr-style text), not a newline character.
    # Real newlines are left in place; str.split() treats them as
    # separators anyway.
    forbidden = [r'\n', r'.', r'?', r'!', r'(', r')']
    for token in forbidden:
        # Strings are immutable: replace() returns a new string, so the
        # result has to be reassigned or the call has no effect.
        text = text.replace(token, '')
    return text
# returns count of a word from a page
def ReturnCount(url, word):
    """Count tokens equal to *word* in one text node of the page at *url*.

    The page is fetched without following redirects and parsed with lxml.
    Only the first text node containing *word* is inspected, because
    ``find()`` stops at the first match. That node is lower-cased, passed
    through ``CleanText`` and split on whitespace; the return value is the
    number of resulting tokens equal to ``word.lower()``.
    """
    def contains_word(node_text):
        # Case-sensitive substring test against the raw node text.
        return node_text and word in node_text

    response = requests.get(url, allow_redirects=False)
    page = BeautifulSoup(response.content, 'lxml')
    # find() yields a single node, or None (stringified to 'None' below)
    # when nothing matches.
    first_match = page.find(text=contains_word)
    tokens = CleanText(str(first_match).lower()).split()
    return tokens.count(word.lower())
I'm trying to get the frequency (occurrence) of a particular word on a webpage. However, I'm always getting 0 as the output.
The count comes back as 0, despite the page having multiple occurrences of that word. Output: 0
There are different things going on:
- You search the `text` with `find()`, which only gets the first occurrence; `find_all()` gets all occurrences.
- It is not clear what the `lambda` is doing there.
- `soup.body.find_all(text=True)` gets all the text from inside the `body` tag; process the list and join it with `''.join([t for t in soup.body.find_all(text=True)])` to a string.
Example:
import requests
from bs4 import BeautifulSoup
# Cleans text (removes any punctuation)
def CleanText(text):
    """Return *text* as a string with a fixed set of punctuation removed.

    The input is coerced with ``str()``; then every ``.``, ``?``, ``!``,
    ``(`` and ``)`` is deleted, along with the literal two-character
    sequence backslash + ``n``.

    Args:
        text: Any object; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string.
    """
    text = str(text)
    # NOTE: r'\n' is a raw string, so it matches a backslash followed by
    # the letter n (as found in repr-style text), not a newline character.
    # Real newlines are left in place; str.split() treats them as
    # separators anyway.
    forbidden = [r'\n', r'.', r'?', r'!', r'(', r')']
    for token in forbidden:
        # Strings are immutable: replace() returns a new string, so the
        # result has to be reassigned or the call has no effect.
        text = text.replace(token, '')
    return text
# returns count of a word from a page
def ReturnCount(url, word):
    """Count tokens equal to *word* in the body text of the page at *url*.

    The page is fetched without following redirects and parsed with lxml.
    All text nodes under ``<body>`` are concatenated, lower-cased, passed
    through ``CleanText`` and split on whitespace; the return value is the
    number of resulting tokens equal to ``word.lower()``.
    """
    response = requests.get(url, allow_redirects=False)
    page = BeautifulSoup(response.content, 'lxml')
    # Every text node under <body>, concatenated with no separator.
    body_text = ''.join(page.body.find_all(text=True))
    tokens = CleanText(body_text.lower()).split()
    return tokens.count(word.lower())
# Example call: fetches the live page over HTTP, so the count depends on the
# page's current content. The return value is neither printed nor stored here.
ReturnCount('https://www.microsoft.com/de-de/microsoft-365/word','word')
Output: 13
I'm not really sure why you need
words = str(soup.find(text=lambda text: text and word in text))
Keep it simple:
def ReturnCount(url, word):
    """Count tokens equal to *word* in the text of the page at *url*.

    The page is fetched without following redirects and parsed with
    Python's built-in ``html.parser``. The document text is lower-cased,
    passed through ``CleanText`` and split on whitespace.

    Args:
        url: Address of the page to fetch.
        word: Word to count; the comparison is case-insensitive.

    Returns:
        Number of whitespace-separated tokens equal to ``word.lower()``.
    """
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'html.parser')
    # Join text nodes with a space. The default empty separator glues the
    # last word of one element to the first word of the next
    # (<p>foo</p><p>bar</p> -> "foobar"), which hides both from the count.
    page_text = soup.get_text(separator=' ')
    words = CleanText(page_text.lower()).split()
    return words.count(word.lower())
# Example usage: count the token 'in' on example.com and print the result.
wordCount = ReturnCount('http://example.com/', 'in')
print(wordCount) # 3 (value reported by the author; depends on the live page)
The technical posts on this site are licensed under CC BY-SA 4.0. If you need to reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.