
Grouping results from Python BeautifulSoup extracted table data for more readability

The following snippet works, but for readability I need help formatting the results on screen.

import random
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://bscscan.com/tokentxns'
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'
]

# pick a random User-Agent and pass it as an actual request header
headers = {'User-Agent': random.choice(user_agent_list)}
req = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(req.content, 'html.parser')
rows = soup.find_all('table')[0].find_all('tr')

for row in rows[1:]:                      # skip the header row
    tds = row.find_all('td')
    txnhash = tds[1].text
    value = tds[7].text
    token = tds[8].text
    link = urljoin(url, tds[8].find('a')['href'])
    # link[26:] strips the "https://bscscan.com/token/" prefix
    print(link[26:] + "\t" + token + "\t\t" + value)

Current output:

0x154a9f9cbd3449ad22fdae23044319d6ef2a1fab   CryptoBlades... (SKILL)    0
0x46d502fac9aea7c5bc7b13c8ec9d02378c33d36f   WolfSafePoor... (WSPP)     532,654,321,110
0xb510e39a6cc3ebe999ff957ae7b5813d3326af88   GoldenBresco (GoBo)        0.1
0xbb4cdb9cbd36b01bd1cbaebf2de08d9173bc095c   Wrapped BNB (WBNB)         0.193446389516094066
0xb510e39a6cc3ebe999ff957ae7b5813d3326af88   GoldenBresco (GoBo)        0.003

Wanted improvement: grouping into 3 columns, like this:

0x154a9f9cbd3449ad22fdae23044319d6ef2a1fab   CryptoBlades... (SKILL)    2.746949883778173559
                                             CryptoBlades... (SKILL)    0.971749999999999991
                                             CryptoBlades... (SKILL)    0

0xbb4cdb9cbd36b01bd1cbaebf2de08d9173bc095c   Wrapped BNB (WBNB)         0.1
                                             Wrapped BNB (WBNB)         0.193446389516094066
                                             Wrapped BNB (WBNB)         0.3

Try:

import requests
from bs4 import BeautifulSoup
from itertools import groupby

url = "https://bscscan.com/tokentxns"

soup = BeautifulSoup(requests.get(url).content, "html.parser")

data = []
for tr in soup.select("tr:has(td)"):        # every data row of the table
    tds = [td.get_text(strip=True) for td in tr.select("td")]
    _, txn_hash, tm, age, from_, _, to_, value, token = tds
    data.append((txn_hash, token, value))

# sort so rows with the same txn hash are adjacent, then group by hash
data = sorted(data)
for _, g in groupby(data, lambda k: k[0]):
    g = list(map(list, g))
    for subl in g[1:]:          # blank the hash on every row after the first
        subl[0] = ""

    for subl in g:
        print("{:<67} {:<27} {:<20}".format(*subl))
    print()

Prints:

0x0883f7ada1e30d266366577dbc46cd86a8deb737d669758a443ef03859ea551a  FEGtoken (FEG)              1,946,201,644.40754275
                                                                    Wrapped BNB (WBNB)          0.025356409113673479

0x41a7e28aa1f88522ba477718f9ea93d927bd8c456cd77c75691d961ac01da626  KOMOCOIN (KMC)              1,500               
                                                                    KOMOCOIN (KMC)              750                 

0x54bf03ddb42a151920fc2352a8419ed24720422b79c4956c74ab1d51aead142e  BABY CAKE (BABYCA...)       140.806276687606518422
                                                                    BABY CAKE (BABYCA...)       165.654443161890021673
                                                                    BABY CAKE (BABYCA...)       2,164,578.319665288243959287
                                                                    BABY CAKE (BABYCA...)       238.930554998160499529
                                                                    BABY CAKE (BABYCA...)       42.164215587910676387
                                                                    BABY CAKE (BABYCA...)       462,482.805614060076081865
                                                                    BABY CAKE (BABYCA...)       797.902234563103604395
                                                                    BABY CAKE (BABYCA...)       938.708511250710122817
                                                                    BABY CAKE PR...(BBCAKE...)  190,322,532.495690243057683413
                                                                    BABY CAKE PR...(BBCAKE...)  2,526,729.458161278746350005
                                                                    BABY CAKE PR...(BBCAKE...)  251,979.604709746169304594
                                                                    BABY CAKE PR...(BBCAKE...)  252,609.914806456810514054
                                                                    BABY CAKE PR...(BBCAKE...)  36,251,910.951560046296701602
                                                                    BABYCAKE_Div...(BABYCA...)  238.930554998160499529
                                                                    Pancake LPs (Cake-L...)     0.222139817418176568
                                                                    Pancake LPs (Cake-L...)     13.786493105169560097
                                                                    Pancake LPs (Cake-L...)     486.96534350290155168
                                                                    Pancake LPs (Cake-L...)     5.76850094907955108 
                                                                    PancakeSwap ...(Cake)       0.001286990618481616
                                                                    PancakeSwap ...(Cake)       0.112893929385320841
                                                                    PancakeSwap ...(Cake)       1.497338191475435628
                                                                    PancakeSwap ...(Cake)       61.821404790611192339
                                                                    PancakeSwap ...(Cake)       61.821404790611192339
                                                                    Wrapped BNB (WBNB)          0.000146050638113703
                                                                    Wrapped BNB (WBNB)          0.000146050638113703
                                                                    Wrapped BNB (WBNB)          0.00146079350317574 
                                                                    Wrapped BNB (WBNB)          0.109629866733835175
                                                                    Wrapped BNB (WBNB)          0.610745057130530703
                                                                    Wrapped BNB (WBNB)          2.850122532653068215

0x6cc6153aa387de6a56c905f7d424ec38f047fefdcc2b7d766c53db7807b6f562  CryptoBlades...(SKILL)      0.005999999999999999
                                                                    CryptoBlades...(SKILL)      0.06                

0x776a1edc9446cc3e160cb08a69e2824dab0e6df7b6c79f252a1c9a0de4733bd4  Arena Token (ARENA)         0.000802589119468346
                                                                    Arena Token (ARENA)         0.037402597402597402
                                                                    Arena Token (ARENA)         0.374025974025974025

0x7ca15e96d56d686d79a93271e192021fefed01187dce424bec835f1a6a47b937  CryptoBlades...(SKILL)      0.971749999999999991

0x7f6bada297def57a2d1823000d464923187bea376c5747ba6ebe0b63b1ae1850  CryptoBlades...(SKILL)      0                   

0x8ddaceff011648b2f13128c8ce4ff5654171878200e12f2ce8f9cf3ec4ab97a3  CryptoBlades...(SKILL)      0.051999999999999999
                                                                    CryptoBlades...(SKILL)      0.52                

0x91d299dc263ac4e30027c5e54e5a5fd4fd2fb814db7c0fc00643764f8710e47b  CryptoBlades...(SKILL)      0                   

0xa097fad173e3d6551e2a837048f40348ffcafc710ca13410de1fb532f2833ba7  Niubi Token (NIU)           2,152.08364390963091904
                                                                    Wrapped BNB (WBNB)          0.05                

0xf2c10ec09049cd810c3aac459b85b9bbbcbb53f3b78341d24af1cab585d6e1ba  Foxy Equilib...(Foxy)       0.9                 
                                                                    Foxy Equilib...(Foxy)       0.9                 
                                                                    Foxy Equilib...(Foxy)       7.2                 

0xf5b44e82e4e4509d59b51491ce1bfa44888fae2c11a65bd5021d2aed9c75afd4  CryptoBlades...(SKILL)      0.055005280975673767
                                                                    Wrapped BNB (WBNB)          0.022533425242910644


EDIT: To print the token URL instead of the name:

import requests
from bs4 import BeautifulSoup
from itertools import groupby

url = "https://bscscan.com/tokentxns"

soup = BeautifulSoup(requests.get(url).content, "html.parser")

data = []
for tr in soup.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in tr.select("td")]
    _, txn_hash, tm, age, from_, _, to_, value, token = tds
    a = "https://bscscan.com" + tr.select("a")[-1]["href"]
    data.append((txn_hash, a, value))

data = sorted(data)
for _, g in groupby(data, lambda k: k[0]):
    g = list(map(list, g))
    for subl in g[1:]:
        subl[0] = ""

    for subl in g:
        print("{:<67} {:<27} {:<20}".format(*subl))
    print()
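Note: the two snippets above call requests.get(url) without any headers. If the response comes back empty or as an error page, bscscan.com may be rejecting requests that lack a browser-like User-Agent; that is an assumption about the site's behaviour, not something the snippets rely on. A minimal sketch of adding such a header before parsing (the header value is illustrative):

import requests
from bs4 import BeautifulSoup

url = "https://bscscan.com/tokentxns"

# hypothetical browser-like header; adjust if the site still blocks the request
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()                  # fail loudly on 403/503 instead of parsing an error page
soup = BeautifulSoup(resp.content, "html.parser")
print(len(soup.select("tr:has(td)")))    # number of data rows actually received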

I didn't get a response from you regarding the pastebin, so here is the approach I was suggesting of simply styling a DataFrame. I said in the comments that it seems you really just want to order by the first column and then not repeat items within that column. You can do that with sort_values(), and use duplicated() to replace the duplicates with ''. I've borrowed Andrej's (upvoted) tidier syntax for populating the list of lists.

You can style the DataFrame as you see fit. For example, I hid the borders between cells and set the background to white.

import pandas as pd
from bs4 import BeautifulSoup
import requests, random

url = 'https://bscscan.com/tokentxns'
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'
]

headers = {'User-Agent': random.choice(user_agent_list)}
req = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(req.content, 'html.parser')
df_rows = []

for tr in soup.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in tr.select("td")]
    _, txnhash, tm, age, from_, _, to_, value, token = tds
    df_rows.append([txnhash, token, value])

df = pd.DataFrame(df_rows, columns=['hash', 'token', 'value'])
df['value'] = pd.to_numeric(df['value'].apply(lambda x: x.replace(',', '')))   # "1,234.5" -> 1234.5
df.sort_values(['hash', 'token'], inplace=True)
# blank out the hash on duplicate rows so each hash appears only once
df['hash'] = [h if not dup else '' for dup, h in zip(df.duplicated(subset=['hash']), df['hash'])]
#df.reset_index(drop = True, inplace = True)
df.style.format(formatter={('value'): "{:,.3f}"}).hide_index() \
  .set_properties(**{'background-color': 'white', 'text-align': 'left'}, padding="10px", border='0px solid white') \
  .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
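
Two caveats on the styling step, both assumptions about your environment rather than part of the answer itself: Styler.hide_index() was deprecated in pandas 1.4 in favour of Styler.hide(axis='index'), and a Styler only renders as a table inside a notebook. If you run this from a plain terminal, a sketch like the following prints the same grouped frame as text:

# assumes `df` was built as above; plain-text fallback for non-notebook environments
df['value'] = df['value'].map('{:,.3f}'.format)     # match the Styler's number format
print(df.to_string(index=False, justify='left'))
# on pandas >= 1.4 the notebook version would use .hide(axis='index') instead of .hide_index()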
