from gazpacho import get, Soup
# Collect the href of every anchor on the home page whose href contains
# the text "roster", then print the resulting list.
url = 'https://acusports.com/'
html = get(url)
soup = Soup(html)
# mode="all" makes find() return a list no matter how many anchors match.
# The default automatic mode yields None for no match and a bare Soup for
# exactly one, and either of those would break the comprehension below.
links = soup.find('a', {'href': "roster"}, partial=True, mode="all")
s = [link.attrs['href'] for link in links]
print(s)
output:
['/roster.aspx?path=baseball', '/roster.aspx?path=mbball', '/roster.aspx?path=cross',
'/roster.aspx?path=football', '/roster.aspx?path=mgolf', '/roster.aspx?path=mten',
'/roster.aspx?path=track', '/roster.aspx?path=wbball', '/roster.aspx?path=cross',
'/roster.aspx?path=wsoc', '/roster.aspx?path=softball', '/roster.aspx?path=wten',
'/roster.aspx?path=beach', '/roster.aspx?path=wvball', '/roster.aspx?path=track']
But I want the links in the form `sports/baseball/roster`, not `/roster.aspx?path=baseball`.
I hope I have understood what you need. Here's how you can get the routes and parameters separately:
from gazpacho import get, Soup
def scrap(url: str, html_tag: str, component: dict) -> list:
    """
    Return the elements scraped from ``url`` that match the given component.

    Args:
        url: Address of the page to download.
        html_tag: Name of the tag to look for (for example ``"a"``).
        component: Attribute filter handed to ``Soup.find``; matching is
            partial, so ``{'href': "roster"}`` matches any href that
            contains ``roster``.

    Returns:
        A list with every matching element; empty when nothing matches.
    """
    html = get(url)
    soup = Soup(html)
    # mode="all" keeps the return type stable: the default automatic mode
    # gives None for zero matches and a single Soup for exactly one, which
    # callers that iterate over the result cannot handle.
    links = soup.find(html_tag, component, partial=True, mode="all")
    return links
def extract_routes(links):
    """
    Extract the route (URL path without file extension) of each link.

    Only the path component of the href is considered, so dots in a host
    name or in the query string no longer truncate the result. Within the
    last path segment everything from the first dot onwards is dropped,
    e.g. ``/roster.aspx?path=baseball`` becomes ``/roster``.

    Args:
        links: Iterable of elements exposing an ``attrs`` mapping that
            contains an ``'href'`` entry.

    Returns:
        A list of route strings, one per link, in input order.

    Raises:
        KeyError: If a link has no ``'href'`` attribute.
    """
    from urllib.parse import urlsplit

    processed_links = []
    for link in links:
        # Discard scheme, host, query and fragment before touching dots.
        path = urlsplit(link.attrs['href']).path
        # Strip the extension from the final segment only, so dotted
        # directory names earlier in the path are preserved.
        parent, slash, filename = path.rpartition('/')
        processed_links.append(parent + slash + filename.split('.')[0])
    return processed_links
def extract_paths(links):
    """
    Extract the value of the first query parameter of each link.

    For ``/roster.aspx?path=baseball`` the result is ``baseball``. Further
    parameters (``&...``) and fragments (``#...``) are ignored, and a link
    without a query value yields an empty string instead of raising, so the
    result stays aligned one-to-one with the input links.

    Args:
        links: Iterable of elements exposing an ``attrs`` mapping that
            contains an ``'href'`` entry.

    Returns:
        A list of parameter values, one per link, in input order.

    Raises:
        KeyError: If a link has no ``'href'`` attribute.
    """
    from urllib.parse import urlsplit

    processed_links = []
    for link in links:
        # urlsplit separates the query from the path and the fragment.
        query = urlsplit(link.attrs['href']).query
        first_param = query.split('&', 1)[0]
        # partition never raises: a missing '=' simply gives ''.
        processed_links.append(first_param.partition('=')[2])
    return processed_links
# Driver: scrape the roster anchors from the home page, then report the
# route part and the query-parameter part of every link separately.
url = 'https://acusports.com/'
links = scrap(url, "a", {'href': "roster"})

routes = extract_routes(links)
print("All routes: {}".format(routes))

paths = extract_paths(links)
print("All paths: {}".format(paths))
Example of execution
All routes: ['/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster', '/roster']
All paths: ['baseball', 'mbball', 'cross', 'football', 'mgolf', 'mten', 'track', 'wbball', 'cross', 'wsoc', 'softball', 'wten', 'beach', 'wvball', 'track']
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.