简体   繁体   English

如何从 a 标签中提取一个 href

[英]How do I extract an href from the a tag

I need to extract the href as well as the text outside of each tag from the variable contents.我需要从变量内容中提取 href 以及每个标签之外的文本。 Any help would be much appreciated.任何帮助将非常感激。 Thanks in advance.提前致谢。

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as bs

def scrape():
    """Print the raw inner markup of every ``<td class="first">`` cell.

    Downloads the Muddy Waters research listing (sending a ``Mozilla/5.0``
    ``User-Agent`` header), parses it with ``html.parser`` and prints the
    encoded contents of each matching cell as ``bytes``, one per line.
    """
    req = Request('https://www.muddywatersresearch.com/research/', headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically instead of leaving the
    # socket to be reclaimed by the garbage collector.
    with urlopen(req) as response:
        webpage = bs(response.read(), 'html.parser')
    info = webpage.find_all("td", {"class": "first"})
    for cell in info:
        # encode_contents() is the current name of the deprecated BS3-era
        # renderContents(); with default arguments both return the same bytes.
        contents = cell.encode_contents()
        print(contents)


# Run the scraper defined above.
scrape()
import requests
from bs4 import BeautifulSoup


def main(url):
    """Print the link text of the first anchor in each ``td.first`` cell.

    Args:
        url: Address of the page to download and parse.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    r = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing the error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    # ``x.a`` is None when a cell contains no <a>; skip such cells rather
    # than crashing with AttributeError on ``None.text``.
    goal = [x.a.text for x in soup.select("td.first") if x.a is not None]
    print(goal)


# Scrape the Muddy Waters research listing page.
main("https://www.muddywatersresearch.com/research/")

Out:出去:

['MW is Short Joyy Inc. (YY US)', 'MultiPlan: Private Equity Necrophilia Meets The Great 2020 Money Grab (MPLN US)', 'MW is Short Nano-X Imaging Ltd. (NNOX US)', 'EHTH: Booking Revenues Today that it Will Collect in 2029 (EHTH US)', 'GSX: Small Classes, Big Lies (GSX US)', 'MW is Short GSX Techedu Inc. (GSX US)', 'Burford: Husbanding Liquidity, Wife-ing Financials (BUR LN)', 'MW is Short eHealth Inc. (EHTH US)', 'NMC: Undisclosed Share Pledges Intensify Questions About True Debt Levels (NMC.LN)', 'Burford: Horrible Second Half Results Validate our Thesis (BUR LN)', 'MW is 
Short NMC Health plc (NMC.LN)', 'Muddy Waters is Short PeptiDream, Inc. (4587.JP)', 'Burford: It Just Doesn’t Get Better (BUR LN)', 'Burford: Was MW Wrong About Napo? (BUR LN)', 'Behavioral Analysis of Burford’s Response Indicates Significant Deception (BUR LN)', 'Burford’s Response is Nothing More Than Distraction and Thin Excuses (BUR LN)', 'MW is Short Burford Capital Ltd. (BUR LN)', 'ANTA Part V: “Controlled Major Supplier and Another Fila Lie”', 'ANTA Part IV: “Liars Lie” (2020.HK)', 'ANTA Part III: FilaBuster (2020.HK)', 'ANTA Part II: “Mens Rea” (2020.HK)', 'ANTA Part I: Turds in the Punchbowl (2020.HK)', 'Muddy Waters publie une communication sur les derniers développements de Casino Guichard-Perrachon et ses sociétés mères', 'Muddy Waters Releases Statement on Developments with Casino Guichard-Perrachon and its Parent Companies', 'Credibility in the Balance: Six Key Questions for INGN Management (INGN US)', 'MW is Short Inogen, Inc. (INGN US)', 'MW is Short Manulife Financial Corp. (NYSE: MFC US)', 'TAL Education: A Real Business With Fake Financials Part IV (NYSE: TAL US)', 'TAL Education: A Real Business With Fake Financials Part III (NYSE: TAL US)', 'TAL Education: A Real Business With Fake Financials Part II (NYSE: TAL US)', 'MW is Short TAL Education Group (NYSE: TAL US)', 'MW is Short IQE (AIM: IQE LN)', 'OSI Systems: Lost Credibility', 'MW is Short China Internet Financial Services (NASDAQ: CIFS)', 'MW is Short OSI Systems (NASDAQ: OSIS)', 'MW is Short Prothena Corp PLC (PRTA:US)', 'MW Response to Man Wah (1999:HK)', 'Asanko: Puppies and Rainbows (AKG:CN)', 'Muddy Waters is Short Asanko Gold Inc. (AKG:CN)', 'MW Statement on STJ/ABT Acknowledgement of Cyber Vulnerabilities', 'China Huishan Part 2: Tax Bureau Evidence of Topline Fraud', 'MW is Short China Huishan Dairy Holdings Co Ltd (6863:HK)', 'MW is Short Nidec Corp. (6594:JP)', 'STJ: Still Non-Secure', 'MW is Short St. 
Jude Medical (STJ:US)', 'Ströer: Astounding AGM Transcript', 'Ströer: the Pile Gets Bigger (SAX:GR)', 'Muddy Waters is Short Ströer (SAX:GR)', 'New Information on Casino Casts Additional Doubt on France Recovery; Governance Problems in Brazil Appear Larger than Admitted (CO:FP)', 'Everything You Always Wanted to Know About Casino (But Were Afraid to Ask) (CO:FP)', 'Muddy Waters is Short Groupe Casino (CO:FP)', 'Muddy Waters is Short TeliaSonera (TLSN.ST)', 'Muddy Waters is Short Noble Group Ltd (NOBL:SP)', 'Bolloré (BOL FP): Complexity Creating Arbitrage with over 95% Upside', 'Muddy Waters is Short Superb Summit (1228.HK)', 'NQ: Oops', 'NQ Admits Widespread Data Tampering, but Claims it is not a Fraud', 'NQ: You Can’t Fool All of the People All of the Time', 'MW Offer to NQ’s Independent Committee', 'NQ’s US Veneer: Withholding Facts, Conned Men, and a Convicted Racketeer', 'If You Believe in Yidatong, You’ll Believe in Santa Claus', 'Chinese Media Views on NQ', 'NQ’s Top Ten Lies Since Friday', 'Initiating Coverage on NQ Mobile Inc. 
(NYSE: NQ) – Strong Sell', 'AMT’s / GTP: We’re More Skeptical than Ever', 'Olam: Not Changing the Old Ways', 'A Zebra Can’t Change its Stripes: AMT’s Latest Purchase from NIHD', 'Cracking Façade: AMT’s Q2 2013 Results', 'Initiating Coverage on AMT – Strong Sell (Slide Deck)', 'Initiating Coverage on AMT – Strong Sell (Full Report)', 'Muddy Waters’s Reaction to Olam Q2 2013 Report', 'MW Learned from John Hempton how to Catch FMCN in Another Lie', 'Stop-Gap Bailout of Olam Validates MW’s Thesis', 'Muddy Waters Offers to Pay for Olam Debt Rating', 'Initiating Coverage on Olam International – Strong Sell', 'Muddy Waters Reaction to Olam Frantic Response', 'Initiating Coverage on EDU – Strong Sell', 'Frauducation White Paper', 'Frauducation Supporting Documentation', 'Part 1 of MW’s “Frauducation” Series: Fraud School, RINO, FSIN', 'FMCN: MW was Right that FMCN Lied re LCD Network; “Verification” Counted 30,500 Cardboard Posters', 'FMCN: Why Would FMCN Buy a Ginseng Plantation? From One of its Employees?', 'FMCN: Is “Independent” Verification in China Better Than Toilet Paper?', 'MW Reiterating Strong Sell on FMCN', 'Muddy Waters Initiating Coverage on FMCN – Strong Sell', 'Open Letter to SPRD Chairman re: MW Concerns', 'TRE.TO, SNOFF.PK – The Ties that Blind – Part 1', 'Muddy Waters Reaction to TRE Q1 2011 Conference Call', 'TRE.TO, SNOFF.PK – Supporting Documentation (Appendices E-K)', 'TRE.TO, SNOFF.PK – Supporting Documentation (Appendix D)', 'TRE.TO, SNOFF.PK – Supporting Documentation (Appendices A-C)', 'Muddy Waters Initiating Coverage on TRE.TO, OTC:SNOFF – Strong Sell', 'Muddy Waters Initiating Coverage on DGW – Strong Sell', 'CCME: Irrefutable Evidence of Fraud', 'Muddy Waters Initiating Coverage on CCME – Strong Sell', 'Muddy Waters initiating coverage on RINO – Strong Sell', 'One Billion Chinese People Can’t Be Wrong…', 'Chinese Newspaper Investigates ONP, p.2', 'Chinese Newspaper Investigates ONP, p. 
1', 'Oh What a Tangled Web We Weave…', 'Two Points RE: the ONP debate', 'MW Response to Rick Pearson', 'MW Confirms It Analyzed the Correct ONP Entity', 'Muddy Waters Initiating Coverage on ONP – Strong Sell']
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as bs
import requests

# Extra request headers; scrape() merges them into its session's headers
# so the request is sent with this Firefox User-Agent string.
headers={
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}
def scrape(url):
    """Print ``[cell_text, href]`` pairs for the ``td.first`` cells of a page.

    The page is fetched through a ``requests.Session`` that carries the
    module-level ``headers`` and is parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to download and parse.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    with requests.Session() as session:
        session.headers.update(headers)
        response = session.get(url)
        # Fail loudly on an HTTP error instead of parsing the error page.
        response.raise_for_status()
        soup = bs(response.text, 'lxml')

    info = soup.find_all('td', {'class': 'first'})
    # ``b.a`` is None for a cell without a link, and an <a> may lack an
    # ``href``; skip those cells instead of raising TypeError/KeyError.
    res = [
        [b.text, b.a['href']]
        for b in info
        if b.a is not None and b.a.has_attr('href')
    ]

    print(res)


# Page whose td.first cells are scraped; passed straight to scrape().
url =  'https://www.muddywatersresearch.com/research/'
scrape(url)

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM