简体   繁体   中英

Retrieving Lyrics from Musixmatch

import requests
import json
import urllib
import lyricsgenius
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


from bs4 import BeautifulSoup


from pymongo import MongoClient

# Client for the MongoDB server at localhost:27017.
client = MongoClient('localhost', 27017)
# Handle to the `dbsparta` database; main() reads and updates its
# `new_top200` collection.
db = client.dbsparta

def get_artist_id(artistName):
    """Return the Musixmatch artist id of the first search hit for a name.

    Sends an ``artist.search`` request and returns the ``artist_id`` of the
    first entry in the returned ``artist_list``.

    Args:
        artistName: Artist name to search for. It is sent via ``params`` so
            that characters such as ``&`` or ``#`` are URL-encoded instead
            of corrupting the query string.

    Returns:
        The ``artist_id`` value of the first entry in ``artist_list``.

    Raises:
        ValueError: If the response does not have the expected
            ``message -> body -> artist_list`` shape or lists no artist.
            A body that is not valid JSON also surfaces as a ValueError
            (JSON decode errors derive from it).
        requests.RequestException: On a transport failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    params = {
        'page_size': 100,
        'format': 'json',
        'apikey': '123',
        'q_artist': artistName,
    }
    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(
        "https://api.musixmatch.com/ws/1.1/artist.search",
        params=params, headers=headers, timeout=10)
    response.encoding = 'UTF-8'
    try:
        artist_list = response.json()['message']['body']['artist_list']
        return artist_list[0]['artist']['artist_id']
    except (KeyError, IndexError, TypeError) as err:
        # The API does not always answer with the nested dicts assumed
        # here; report which lookup failed instead of a bare index error.
        raise ValueError(
            f"No artist id in artist.search response for {artistName!r}"
        ) from err



def get_album_ids(artist_id):
    """Fetch the album entries of a Musixmatch artist.

    Despite the name, this returns the raw ``album_list`` entries rather
    than bare ids; callers read ``entry['album']['album_id']``. Only a
    single page (``page_size=100``) is requested.

    Args:
        artist_id: Musixmatch artist id; converted with ``str()`` before
            being sent.

    Returns:
        The ``album_list`` value from the response body.

    Raises:
        ValueError: If the response does not have the expected
            ``message -> body -> album_list`` shape. A body that is not
            valid JSON also surfaces as a ValueError (JSON decode errors
            derive from it).
        requests.RequestException: On a transport failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    params = {
        'page_size': 100,
        'format': 'json',
        'apikey': '123',
        'artist_id': str(artist_id),
    }
    # Without a timeout a stalled connection would block the run forever.
    album_response = requests.get(
        "https://api.musixmatch.com/ws/1.1/artist.albums.get",
        params=params, headers=headers, timeout=10)
    album_response.encoding = 'UTF-8'
    try:
        return album_response.json()['message']['body']['album_list']
    except (KeyError, TypeError) as err:
        # e.g. ``body`` arriving as a list instead of a dict.
        raise ValueError(
            f"No album list in artist.albums.get response for artist {artist_id!r}"
        ) from err

def get_album_tracks_ids(album_id):
    """Fetch the track entries of a Musixmatch album.

    Despite the name, this returns the raw ``track_list`` entries rather
    than bare ids; callers read ``entry['track']['track_id']``. Only a
    single page (``page_size=100``) is requested.

    Args:
        album_id: Musixmatch album id; converted with ``str()`` before
            being sent.

    Returns:
        The ``track_list`` value from the response body.

    Raises:
        ValueError: If the response does not have the expected
            ``message -> body -> track_list`` shape. A body that is not
            valid JSON also surfaces as a ValueError (JSON decode errors
            derive from it).
        requests.RequestException: On a transport failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    params = {
        'page_size': 100,
        'format': 'json',
        'apikey': '123',
        'album_id': str(album_id),
    }
    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(
        "https://api.musixmatch.com/ws/1.1/album.tracks.get",
        params=params, headers=headers, timeout=10)
    response.encoding = 'UTF-8'
    try:
        return response.json()['message']['body']['track_list']
    except (KeyError, TypeError) as err:
        # e.g. ``body`` arriving as a list instead of a dict.
        raise ValueError(
            f"No track list in album.tracks.get response for album {album_id!r}"
        ) from err


# def get_track_id(artist_id):
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
#     response = requests.get("https://api.musixmatch.com/ws/1.1/track.search?page_size=100format=json&apikey=123&f_artist_id=" + str(artist_id), headers=headers)
#     response.encoding = 'UTF-8'
#     for tracks in response.json()['message']['body']['track_list']:
#         print(tracks['track']['track_name'])

def get_track_lyrics(track_id):
    """Fetch the lyrics text of a Musixmatch track.

    Args:
        track_id: Musixmatch track id; converted with ``str()`` before
            being sent.

    Returns:
        The ``lyrics_body`` value from the response.

    Raises:
        ValueError: If the response does not have the expected
            ``message -> body -> lyrics -> lyrics_body`` shape, for
            example when ``body`` or ``lyrics`` is a list instead of a
            dict. A body that is not valid JSON also surfaces as a
            ValueError (JSON decode errors derive from it).
        requests.RequestException: On a transport failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    params = {
        'apikey': '123',
        'track_id': str(track_id),
    }
    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(
        "https://api.musixmatch.com/ws/1.1/track.lyrics.get",
        params=params, headers=headers, timeout=10)
    response.encoding = 'UTF-8'
    try:
        return response.json()['message']['body']['lyrics']['lyrics_body']
    except (KeyError, TypeError) as err:
        # This is the "list indices must be integers or slices, not str"
        # case: one level of the payload was not the dict assumed here.
        raise ValueError(
            f"No lyrics in track.lyrics.get response for track {track_id!r}"
        ) from err



def main():
    """Collect lyrics for every artist in ``new_top200`` and store them.

    For each document in ``db.new_top200`` the artist's albums, each
    album's tracks and each track's lyrics are fetched in turn, and every
    lyric is pushed onto the ``lyrics`` array of the document with the
    same ``name``.

    A failed or malformed API response skips only the artist, album or
    track it belongs to and is reported on stdout, so a single bad reply
    no longer aborts the whole run.
    """
    # What a fetch can raise on a bad reply: an unexpected JSON shape
    # (KeyError / IndexError / TypeError), an undecodable or rejected body
    # (ValueError, which JSON decode errors derive from), or a transport
    # problem (requests.RequestException).
    fetch_errors = (KeyError, IndexError, TypeError, ValueError,
                    requests.RequestException)

    # Materialise the cursor up front: the loop is slow and updates the
    # same collection it iterates over.
    stars_list = list(db.new_top200.find({}, {'_id': 0}))
    for stars in stars_list:
        name = stars['name']
        print(name)
        try:
            album_ids = get_album_ids(get_artist_id(name))
        except fetch_errors as err:
            print(f"Skipping artist {name!r}: {err!r}")
            continue

        for album_id in album_ids:
            try:
                track_ids = get_album_tracks_ids(album_id['album']['album_id'])
            except fetch_errors as err:
                print(f"Skipping an album of {name!r}: {err!r}")
                continue

            for track in track_ids:
                try:
                    lyric = get_track_lyrics(track['track']['track_id'])
                except fetch_errors as err:
                    print(f"Skipping a track of {name!r}: {err!r}")
                    continue
                db.new_top200.update_one({'name': name}, {'$push': {'lyrics': lyric}})


# get_track_id(get_artist_id('Kanye West'))

# get_album_ids(get_artist_id("Kanye West"))
# get_album_tracks(15565713)


if __name__ == "__main__":
    # Run the full scrape only when this file is executed as a script,
    # not when it is imported.
    main()



I'm trying to get ALL of the lyrics of an artist and store them in a database. For example, if the artist is "Drake", I want all of his lyrics stored under the 'lyrics' key in my database.

However, I get seemingly unpredictable errors every time I run the same code. For example, it will insert 400 lyrics without any problem and then suddenly fail with an error saying 'list indices must be integers or slices, not str'. This error is quite confusing to me, because I'm assuming that all of the JSON data is in the same format, yet the error appears only after 400 song lyrics have been processed with no problem. [Image: screenshot of the error]

When I run the same code again, I get a JSON decode error about 200 song lyrics in. When I run it yet AGAIN, the error I described at the beginning comes back, this time after a different number of song lyrics have been processed.

Can someone explain the random nature of this error?

Thank you!

You are making assumptions about the data types that will be returned in the JSON. In your case I suspect that one of the JSON elements is a list, not an object.

Your issue can be reproduced with this simple example:

# Minimal reproduction: 'lyrics' holds a list where the lookup below
# expects a dict.
my_dict = {
    'message': {
        'body': {
            'lyrics': ['Always look on the bright side of life']
        }
    }
}

# Indexing that list with the string key 'lyrics_body' raises TypeError.
print(my_dict['message']['body']['lyrics']['lyrics_body'])

gives:

TypeError: list indices must be integers or slices, not str

How do you fix it? You'll need to check each element matches what you expect; for example:

# Same sample payload as in the reproduction: 'lyrics' is a list rather
# than a dict.
my_dict = {
    'message': {
        'body': {
            'lyrics': ['Always look on the bright side of life']
        }
    }
}

def checker(item, field):
    """Return ``item[field]`` after verifying that ``item`` is a dict.

    Args:
        item: The value expected to be a dict.
        field: The key to look up in ``item``.

    Returns:
        The value stored under ``field``, or ``None`` when the key is
        absent.

    Raises:
        ValueError: If ``item`` is not a dict.
    """
    # Guard clause: reject anything that cannot be looked up by key.
    if not isinstance(item, dict):
        raise ValueError(f"'{item}' in field '{field}' is not a valid dict")
    return item.get(field)


# Walk the structure one level at a time so that a value of the wrong
# type is reported explicitly.
message = checker(my_dict, 'message')
body = checker(message, 'body')
lyrics = checker(body, 'lyrics')
# `lyrics` is a list here, so this call raises ValueError.
print(checker(lyrics, 'lyrics'))

gives:

ValueError: '['Always look on the bright side of life']' in field 'lyrics' is not a valid dict

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM