Extracting data from a table in HTML with bs4 and Python

I have used Beautiful Soup a little before, but this time I need help. I can't extract certain parts of the HTML; I always get back "None".

I would like to scrape details from the following page: https://www.coingecko.com/de?page=1

All the visible content is not a problem, but if you hover with your mouse over certain numbers, a tooltip shows more detailed information (which I want :-)).

This is what I managed to put together:

from bs4 import BeautifulSoup as soup
import requests


url = "https://www.coingecko.com/de?page=1"
response = requests.get(url)                        # request the html
webpage = soup(response.text, "html.parser")        # parse the html


# extract the 'main block' with information
developer = webpage.findAll("div", {"class": "percent"})

The variable 'developer' is now a list, which I plan to go through with a for loop. Right now the content of one element looks like this:

<div class="percent" data-toggle="tooltip" data-placement="right" data-html="true" title="" data-original-title="<div style=&quot;text-align: left;  font-size: 12px;&quot;>
    <table>
      <tbody>
        <tr>
          <td>Abonnenten <i class=&quot;fa fa-reddit&quot;></i></td>
          <td style=&quot;text-align: right&quot;>629925</td>                #I want this number

          ...

Right now I'm not able to extract the number 629925. Usually I would just use .text, but that doesn't work here because the number isn't part of the element's text.
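A quick way to check where that markup actually lives is to print the element's attributes. This is only a minimal diagnostic sketch: in the browser-rendered snippet above the tooltip HTML sits in data-original-title, while the raw HTML returned by requests may keep it in the title attribute instead.

first = developer[0]
# the tooltip markup is stored as a plain string inside an attribute,
# not as child elements of the div
print(first.attrs.keys())
print(first.get("title") or first.get("data-original-title"))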

I then tried the following (and many variations of it), which also only returns []:

print(developer[0].findAll("td"))

Could anyone help me and explain how to extract it?

I also had a quick look at lxml, but I had never used it before and couldn't get it working.

Any help is much appreciated.

The desired 629925 number is part of the HTML that lives inside an element's attribute value. So, you need to re-parse the HTML from the tooltip:

from bs4 import BeautifulSoup
import requests


url = "https://www.coingecko.com/de?page=1"
response = requests.get(url)                       #request the html
soup = BeautifulSoup(response.content, "html.parser")                  #parse the html

percent_html_data = soup.select_one("td.community .percent")['title']
percent_soup = BeautifulSoup(percent_html_data, "html.parser")
data = {
    row.td.get_text(strip=True): row("td")[1].get_text()
    for row in percent_soup.find_all("tr")
}
print(data)

Prints:

{u'Facebook Likes': u'36370', u'Abonnenten': u'629925', u'Twitter Follower': u'627476'}
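If you need the subscriber count as a number rather than a string, a small follow-up on the dictionary above (using the German "Abonnenten" key from the output) could be:

# the tooltip values are plain digit strings, e.g. '629925'
subscribers = int(data["Abonnenten"])
print(subscribers)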

You can further extend the solution to cover all the rows in the table:

for row in soup.select("#gecko-table tr")[1:]:
    coin_name = row.select_one(".coin-content-name").get_text()
    percent_html_data = row.select_one("td.community .percent")['title']

    percent_soup = BeautifulSoup(percent_html_data, "html.parser")
    data = {
        tooltip_row.td.get_text(strip=True): tooltip_row("td")[1].get_text()
        for tooltip_row in percent_soup.find_all("tr")
    }
    print(coin_name, data["Abonnenten"])

Prints:

(u'Bitcoin', u'629925')
(u'Ripple', u'135263')
(u'Ethereum', u'247487')
...
(u'BlackCoin', u'7659')
(u'Shift', u'1028')
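If you would rather collect the values than print them, a minimal variation of the loop above (same selectors, same "Abonnenten" key) stores the counts in a dictionary keyed by coin name:

# collect subscriber counts per coin instead of printing them
subscribers_by_coin = {}
for row in soup.select("#gecko-table tr")[1:]:
    coin_name = row.select_one(".coin-content-name").get_text(strip=True)
    tooltip_html = row.select_one("td.community .percent")['title']
    tooltip_soup = BeautifulSoup(tooltip_html, "html.parser")
    data = {
        tooltip_row.td.get_text(strip=True): tooltip_row("td")[1].get_text()
        for tooltip_row in tooltip_soup.find_all("tr")
    }
    # assumes the count is a plain digit string, as in the output above
    subscribers_by_coin[coin_name] = int(data["Abonnenten"])

print(subscribers_by_coin)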

You can use 'lxml' as the parser and call .text on each element to get the percentages:

import requests
from bs4 import BeautifulSoup as soup
s = soup(requests.get('https://www.coingecko.com/de?page=1').text, 'lxml')
final_data = [i.text.strip('\n') for i in s.find_all('div', {'class': 'percent'})]

Output:

[u'98%', u'85%', u'51%', u'91%', u'81%', u'84%', u'36%', u'85%', u'93%', u'74%', u'42%', u'84%', u'85%', u'65%', u'38%', u'76%', u'87%', u'59%', u'32%', u'74%', u'74%', u'69%', u'38%', u'74%', u'84%', u'65%', u'38%', u'73%', u'89%', u'63%', u'38%', u'73%', u'84%', u'60%', u'36%', u'73%', u'77%', u'65%', u'42%', u'73%', u'87%', u'54%', u'34%', u'71%', u'68%', u'54%', u'43%', u'71%', u'84%', u'53%', u'39%', u'71%', u'82%', u'47%', u'33%', u'70%', u'81%', u'64%', u'37%', u'69%', u'81%', u'49%', u'36%', u'69%', u'88%', u'47%', u'37%', u'68%', u'86%', u'54%', u'33%', u'68%', u'79%', u'65%', u'25%', u'68%', u'86%', u'45%', u'40%', u'68%', u'88%', u'52%', u'32%', u'67%', u'84%', u'65%', u'31%', u'67%', u'84%', u'49%', u'40%', u'66%', u'64%', u'53%', u'36%', u'65%', u'75%', u' 52%', u'32%', u'64%', u'77%', u'49%', u'39%', u'64%', u'82%', u'44%', u'34%', u'64%', u'65%', u'60%', u'33%', u'63%', u'78%', u'46%', u'33%', u'62%', u'82%', u'47%', u'34%', u'62%', u'80%', u'44%', u'32%', u'62%', u'71%', u'46%', u'32%', u'62%', u'86%', u'50%', u'51%', u'61%', u'60%', u'50%', u'36%', u'60%', u'74%', u'45%', u'29%', u'60%', u'86%', u'17%', u'50%', u'60%', u'56%', u'46%', u'29%', u'60%', u'80%', u'45%', u'32%', u'60%', u'79%', u'42%', u'31%', u'60%', u'84%', u'17%', u'32%', u'59%', u'84%', u'42%', u'30%', u'59%', u'63%', u'56%', u'33%', u'59%', u'54%', u'44%', u'39%', u'59%', u'77%', u'43%', u'33%', u'58%', u'79%', u'36%', u'27%', u'58%', u'53%', u'50%', u'39%', u'58%', u'73%', u'43%', u'29%', u'58%', u'69%', u'47%', u'32%', u'57%', u'66%', u'42%', u'28%', u'57%', u'73%', u'41%', u'29%', u'57%', u'67%', u'43%', u'29%', u'57%', u'75%', u'32%', u'35%', u'57%', u'83%', u'25%', u'29%', u'56%', u'0%', u'68%', u'27%', u'56%', u'70%', u'42%', u'30%', u'56%', u'80%', u'38%', u'27%', u'56%', u'80%', u'41%', u'34%', u'56%', u'57%', u'45%', u'32%', u'56%', u'71%', u'33%', u'35%', u'56%', u'59%', u'44%', u'29%', u'55%', u'68%', u'42%', u'27%', u'55%', u'54%', u'46%', u'34%', u'55%', u'60%', u'41%', u'27%', u'55%', u'31%', u'52%', u'27%', u'55%', u'80%', u'40%', u'30%', u'54%', u'74%', u'39%', u'28%', u'54%', u'55%', u'29%', u'36%', u'54%', u'62%', u'35%', u'29%', u'54%', u'41%', u'51%', u'37%', u'54%', u'61%', u'40%', u'37%', u'53%', u'0%', u'66%', u'80%', u'53%', u'61%', u'35%', u'28%', u'53%', u'73%', u'34%', u'36%', u'53%', u'65%', u'41%', u'32%', u'53%', u'78%', u'35%', u'28%', u'52%', u'67%', u'36%', u'32%', u'52%', u'82%', u'32%', u'42%', u'52%', u'61%', u'33%', u'27%', u'52%', u'76%', u'35%', u'27%', u'52%', u'37%', u'43%', u'35%', u'51%', u'76%', u'33%', u'26%', u'51%', u'57%', u'38%', u'28%', u'51%', u'54%', u'42%', u'27%', u'51%', u'35%', u'53%', u'37%', u'50%', u'75%', u'39%', u'33%', u'50%', u'49%', u'42%', u'30%', u'50%', u'43%', u'40%', u'29%', u'50%', u'53%', u'42%', u'29%', u'50%', u'38%', u'41%', u'38%', u'50%', u'66%', u'34%', u'28%', u'50%', u'73%', u'15%', u'29%', u'50%', u'49%', u'40%', u'29%', u'49%', u'56%', u'36%', u'40%', u'49%', u'48%', u'37%', u'30%', u'49%', u'71%', u'30%', u'28%', u'49%', u'73%', u'33%', u'34%', u'49%', u'61%', u'37%', u'26%', u'48%', u'38%', u'35%', u'27%', u'48%', u'42%', u'43%', u'32%', u'48%', u'51%', u'35%', u'38%', u'48%']
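The percentages come back as one flat list. Judging by the table headers shown in the example result further down (Entwickler, Community, Gemeinwohl, Gesamt), each coin contributes four of them, so a small sketch to group the list per coin (assuming that four-column layout holds for every row) could be:

# group the flat list into one four-value chunk per coin;
# assumes every row has exactly four percent columns
# (Entwickler, Community, Gemeinwohl, Gesamt)
grouped = [final_data[i:i + 4] for i in range(0, len(final_data), 4)]
print(grouped[0])   # the four percentages of the first coin on the page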

If you want all the data for each row, you can try this:

import requests
import re
from bs4 import BeautifulSoup as soup

s = soup(requests.get('https://www.coingecko.com/de?page=1').text, 'html.parser')
row_data = [re.sub(r'\n+', '*', i.text) for i in s.find_all('tr')]
header = [c for c in re.split(r'\*', row_data[0]) if c and c != ' ']
cells = ([c for c in re.split(r'\*', i) if c and c != ' '] for i in row_data)
new_rows = [
    [num, title_data, [price, perc], mkt_cap, liquity, dev, comm, pub_interest, total]
    for num, *title_data, price, perc, mkt_cap, liquity, dev, comm, pub_interest, total in cells
]
final_data = [dict(zip(header[:-1], row)) for row in new_rows]

Example result:

{'#': '99', 'Kryptowährung/Münze': ['BLK', 'BlackCoin', 'BLK', 'Proof of Stake', '1,27 $', '34,14% ', '96.925.240 $', '37.353.623 $', '42%', '43%', '32%', '48%', '100', 'SHIFT', 'Shift', 'SHIFT', 'Dagger'], 'Kurs': ['10,33 $', '21,74% '], 'Marktkapitalisierung': '119.654.415 $', 'Liquidität': '1.472.370 $', 'Entwickler': '51%', 'Community': '35%', 'Gemeinwohl': '38%', 'Gesamt': '48%'}, {'#': '100', 'Kryptowährung/Münze': ['SHIFT', 'Shift', 'SHIFT', 'Dagger'], 'Kurs': ['10,33 $', '21,74% '], 'Marktkapitalisierung': '119.654.415 $', 'Liquidität': '1.472.370 $', 'Entwickler': '51%', 'Community': '35%', 'Gemeinwohl': '38%', 'Gesamt': '48%'}]
