I tried everything I could think of. How do I fix this?
Here is what I am trying to do:
I am a complete beginner...
from requests import get
from bs4 import BeautifulSoup
import re
# The asker's original attempt, reproduced exactly as posted. The paste lost its
# indentation, so loop bodies are not visibly marked; presumably each `for`
# owns the statements that follow it -- confirm against the original post.
site = "https://readlightnovel.org/"
r = get(site, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
soup = BeautifulSoup(r.text, "lxml")
# NOTE(review): this value is never read; `category` is rebound by the
# `for category in categories` loop further down.
category = soup.findAll(class_="search-by-genre")
# Collect one URL per category link found on the home page.
categories = []
for link in soup.findAll(href=re.compile(r'/category/\w+$')):
print("Category:", link.text)
# Builds the URL from the anchor's visible text, not from its href attribute.
category_link = site + "category/" + link.text
categories.append(category_link)
#break
# Fetch a category page and collect its novel header elements.
for category in categories:
# NOTE(review): requests `category_link` (left over from the loop above)
# rather than the loop variable `category`.
r = get(category_link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
soup = BeautifulSoup(r.text, "lxml")
Novels_header = soup.findAll(class_="top-novel-header")
# Print each novel's title, then try to obtain its link.
for Novel_names in Novels_header:
print(Novel_names.text)
# NOTE(review): `Novel_names` is an element taken from `Novels_header`, not a
# URL string, yet it is passed to get() as the address.
r = get(Novel_names, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
soup = BeautifulSoup(r.text, "lxml")
# NOTE(review): findAll presumably returns a collection of matches, so the
# `.text` access on the next statement is made on the collection, not on a
# single tag -- verify against the Beautiful Soup documentation.
Novel_link = soup.findAll("a", {"href"})
print(Novel_link.text)
You are nearly there. You're pulling the text, not the URL, with `category_link = site + "category/" + link.text`.
If you print `link`, you'll see `<a href="https://www.readlightnovel.org/category/action">Action</a>`.
You'd probably be better off getting the `href` as opposed to the text. If the link text doesn't match the path used in the URL, the address you build points to an unknown link.
Secondly, you need the links from the `Novels_header`, which is already there. There is no need to make another request (which wouldn't work anyway, as you are not actually feeding in a URL at that point).
Give this a try, and see where I made the edits:
from requests import get
from bs4 import BeautifulSoup
import re
# Scrape readlightnovel.org: list every category linked from the home page,
# then print the title and link of each novel header on each category page.
site = "https://readlightnovel.org/"
# Browser-like User-Agent sent with every request; defined once and reused.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"}
# A timeout keeps the script from hanging forever on an unresponsive server.
r = get(site, headers=headers, timeout=30)
soup = BeautifulSoup(r.text, "lxml")

#Getting all categories
categories = []
for link in soup.find_all(href=re.compile(r'/category/\w+$')):
    print("Category:", link.text)
    # Use the anchor's href, not its visible text, as the category URL.
    category_link = link['href'] #<--- made edit here
    categories.append(category_link)

#Getting all Novel Headers
for category in categories:
    # Request the loop variable `category`; `category_link` still holds only
    # the last URL from the loop above, which would fetch the same page
    # on every iteration.
    r = get(category, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "lxml")
    Novels_header = soup.find_all(class_="top-novel-header")
    #Getting Novels' Title and Link
    for Novel_names in Novels_header: #<---- edit in this block
        print(Novel_names.text.strip())
        # The link is already inside the header element, so no extra request
        # is needed. Skip headers that carry no usable anchor instead of
        # raising on a missing tag or attribute.
        anchor = Novel_names.find('a')
        if anchor is None or not anchor.get('href'):
            continue
        Novel_link = anchor['href']
        print(Novel_link + '\n')
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. Any question please contact: yoyou2525@163.com.