Getting word count from webpage

Question

import requests
from bs4 import BeautifulSoup

# Cleans text (removes any punctuation)
def CleanText(text):
    """Return *text* as a string with basic punctuation stripped.

    The characters ``. ? ! ( )`` are deleted. Newlines are turned into
    spaces rather than deleted, so that words on neighbouring lines are
    not fused into a single token.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # str.replace() returns a new string and never modifies in place, so a
    # bare ``text.replace(...)`` statement has no effect. A single
    # translate() pass handles every character at once. Note '\n' is a real
    # newline here; the raw string r'\n' would be a backslash followed by 'n'.
    table = str.maketrans('\n', ' ', '.?!()')
    return str(text).translate(table)

# returns count of a word from a page
def ReturnCount(url, word):
    """Fetch *url* and count tokens equal to *word* (lowercased) in the matched text.

    Only the text returned by a single ``soup.find`` call is examined, not
    the whole page.
    """
    # Fetch the page; redirects are not followed (allow_redirects=False).
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    # find() returns only the first text node for which the predicate is true
    # (the ``word in text`` check is case-sensitive). If nothing matches it
    # returns None, which str() turns into the literal string 'None'.
    words = str(soup.find(text=lambda text: text and word in text))
    # Lowercase, pass through CleanText, then split on whitespace.
    words = CleanText(words.lower())
    words = words.split()
    # Count the tokens that are exactly equal to the lowercased word.
    return words.count(word.lower())

I'm trying to get the frequency (occurrence) of a particular word on a webpage. However, I'm always getting 0 as the output.

The function returns a count of 0 even though the page contains multiple occurrences of that word. Output: 0

Answer 1

There are different things going on:

You try to get all the text with find(), but that only returns the first occurrence.
Instead, use find_all(), which returns all occurrences.
I'm not sure what your lambda is meant to do there.
soup.body.find_all(text=True) gets all the text nodes inside the body tag; join that list into a single string with ''.join([t for t in soup.body.find_all(text=True)]).

Example:

import requests
from bs4 import BeautifulSoup

# Cleans text (removes any punctuation)
def CleanText(text):
    """Return *text* as a string with basic punctuation stripped.

    The characters ``. ? ! ( )`` are deleted. Newlines are turned into
    spaces rather than deleted, so that words on neighbouring lines are
    not fused into a single token.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # str.replace() returns a new string and never modifies in place, so a
    # bare ``text.replace(...)`` statement has no effect. A single
    # translate() pass handles every character at once. Note '\n' is a real
    # newline here; the raw string r'\n' would be a backslash followed by 'n'.
    table = str.maketrans('\n', ' ', '.?!()')
    return str(text).translate(table)

# returns count of a word from a page
def ReturnCount(url, word):
    """Return how many times *word* occurs in the body text of the page at *url*.

    Matching is case-insensitive and whole-token: the body text is
    lowercased, passed through ``CleanText`` and split on whitespace, and
    the tokens equal to the lowercased *word* are counted.

    Args:
        url: Address of the page to fetch. Redirects are not followed.
        word: The word to count.

    Returns:
        The number of matching tokens, or 0 if the parsed document has no
        ``<body>``.
    """
    # Without a timeout, requests can wait indefinitely on a silent server.
    r = requests.get(url, allow_redirects=False, timeout=10)
    soup = BeautifulSoup(r.content, 'lxml')
    # An empty response (e.g. an unfollowed redirect) can parse to a document
    # with no <body>; there is nothing to count in that case.
    if soup.body is None:
        return 0
    # Join with a space so text from adjacent tags is not fused into one token.
    words = ' '.join(soup.body.find_all(text=True))
    words = CleanText(words.lower())
    return words.split().count(word.lower())

# Example call: counts 'word' on the given page (performs a live network request).
ReturnCount('https://www.microsoft.com/de-de/microsoft-365/word','word')

Output: 13

Answer 2

I'm not really sure why you need

words = str(soup.find(text=lambda text: text and word in text))

You can make it simpler by using the text of the whole document:

def ReturnCount(url, word):
    """Return how many times *word* occurs in the text of the page at *url*.

    Matching is case-insensitive and whole-token: the document text is
    lowercased, passed through ``CleanText`` and split on whitespace, and
    the tokens equal to the lowercased *word* are counted.

    Args:
        url: Address of the page to fetch. Redirects are not followed.
        word: The word to count.

    Returns:
        The number of matching tokens.
    """
    # Without a timeout, requests can wait indefinitely on a silent server.
    r = requests.get(url, allow_redirects=False, timeout=10)
    soup = BeautifulSoup(r.content, 'html.parser')
    # soup.text concatenates neighbouring text nodes with nothing in between,
    # which can fuse words from adjacent tags; a space separator keeps them
    # apart.
    page_text = soup.get_text(separator=' ')
    tokens = CleanText(page_text.lower()).split()
    return tokens.count(word.lower())

# Example call: counts 'in' on example.com (performs a live network request).
wordCount = ReturnCount('http://example.com/', 'in')
print(wordCount) # 3

Getting word count from webpage

Question

2 answers

solution1
0 2020-12-24 16:09:13

solution2
0 2020-12-25 04:37:54

Getting word count from webpage

Question

2 answers

solution1 0 2020-12-24 16:09:13

solution2 0 2020-12-25 04:37:54

solution1
0 2020-12-24 16:09:13

solution2
0 2020-12-25 04:37:54