简体   繁体   中英

Scraping a specific website

I'm working on a Python project that includes trying to scrape historical sports odds/results from the site oddsportal.com, for example at the exact URL http://www.oddsportal.com/soccer/england/premier-league/results/

The problem is that the actual odds are not embedded in the HTML but are obscured by a gigantic JavaScript file. I am aware that a possible approach is to use a headless web driver that can interpret JavaScript and to pull the data that way, but the website loads tons of extraneous stuff that makes this approach inefficient, so I would appreciate some help in reverse engineering the source of the values.

Some info:

The above HTML loads an extremely large globals-....js file as well as other JS and CSS files (that don't seem to be too relevant) from rb.oddsportal.com and fb.oddsportal.com. In addition, there is a small amount of communication with weblog.livesport.eu on port 2222, which seems suspicious.

The general idea seems to be that the html includes for each match in the table an "xeid" value, and for each participant in addition "xoid" and "xodd" values, that are somehow manipulated by the javascript into the plaintext values.

Any help is appreciated

I worked with this site some time ago, so here is my Perl code (as you can see, all the magic is located in the "http://www.oddsportal.com/feed/postmatch/1-1-" . $match->{id} . "-1321390800-1-2.dat" file):

#!/usr/bin/perl

use Modern::Perl;
use HTML::TreeBuilder::XPath;
use WWW::Mechanize;
use FindBin qw($Bin);
use Getopt::Long;
use DateTime;
use DateTime::Format::Strptime;
use Date::Range;
use Date::Simple;
use JSON::PP;

# Command-line configuration.
#   --date   DDMMYYYY or DDMMYYYY-DDMMYYYY (single day or inclusive range)
#   --league numeric key into the $leagues table below
#   --output boolean flag, stored as $config->{output_format}
my $config = {};

my $usage = "Usage: $0 --date DDMMYYYY[-DDMMYYYY] --league ID [--output]\n";

my $result = GetOptions(
    "date=s"   => \$config->{date},
    "league=s" => \$config->{league_id},
    "output"   => \$config->{output_format}
);

# Fail early with a readable message instead of running on with undefined
# values and tripping over them deep inside the scraping code.
die $usage unless $result;

die "Missing or malformed --date (expected DDMMYYYY or DDMMYYYY-DDMMYYYY)\n"
  . $usage
  unless defined $config->{date}
  && $config->{date} =~ m{\A\d{8}(?:-\d{8})?\z};

( $config->{start_date}, $config->{end_date} ) = split /-/, $config->{date};

# A single date means a one-day range.
unless ( $config->{end_date} ) {

    $config->{end_date} = $config->{start_date};
}

$config->{start_date} = format_date( $config->{start_date} );
$config->{end_date}   = format_date( $config->{end_date} );

# Supported competitions, keyed by the id accepted by --league.
my $leagues = {

    1 => {
        title => "English Premier League",
        url =>
          "http://www.oddsportal.com/soccer/england/premier-league/results/"
    },
    2 => {
        title => "Primera Division",
        url =>
          "http://www.oddsportal.com/soccer/spain/primera-division/results/"
    },
    3 => {
        title => "Bundesliga",
        url   => "http://www.oddsportal.com/soccer/germany/bundesliga/results/"
    },
    4 => {
        title => "Ligue 1",
        url   => "http://www.oddsportal.com/soccer/france/ligue-1/results/",
    },
    5 => {
        title => "Serie A",
        url   => "http://www.oddsportal.com/soccer/italy/serie-a/results/",
    },
    6 => {
        title => "Champs League",
        url =>
          "http://www.oddsportal.com/soccer/europe/champions-league/results/",
    },
    7 => {
        title => "Europa League",
        url => "http://www.oddsportal.com/soccer/europe/europa-league/results/",
    },
};

unless ( defined $config->{league_id}
    && exists $leagues->{ $config->{league_id} } )
{

    my $known = join ", ",
      map { "$_ ($leagues->{$_}->{title})" }
      sort { $a <=> $b } keys %{$leagues};

    die "Missing or unknown --league; valid ids: $known\n" . $usage;
}

my $league = $leagues->{ $config->{league_id} };

say $league->{title};

my $mech = WWW::Mechanize->new();
$mech->agent_alias("Windows IE 6");

# Start from the league's results page; find_matches() navigates from there.
$mech->get( $league->{url} );

my @matches = find_matches( $mech, $config->{start_date}, $config->{end_date} );

foreach my $match (@matches) {

    collect_info($match);

    # NOTE(review): save_info() is not defined in the visible part of this
    # file; it is presumably provided elsewhere.
    save_info($match);
}

# Gather the odds for a single match.
#
# Takes a match hash reference, prints a progress line built from its
# match_date and title, then passes the match and a freshly created
# WWW::Mechanize agent to parse_match(). Pauses for one second afterwards.
sub collect_info {

    my $match = $_[0];

    # Every match gets its own agent rather than reusing the file-level one.
    my $agent = WWW::Mechanize->new();
    $agent->agent_alias("Windows IE 6");

    say "\t\t", "[$match->{match_date}] $match->{title}";

    # The match page itself ($match->{url}) is not requested; only the feed
    # files fetched inside parse_match() are used.
    parse_match( $match, $agent );

    sleep 1;
}

#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-1-2.dat 1X2
#http://www.oddsportal.com/feed/postmatch/1-1-1382641-1321390800-1-2.dat

#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-5-2.dat AH
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat OU
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat

#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-6-2.dat DNB
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-12-2.dat EH
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-4-2.dat DC

# Run every odds parser for one match.
#
# Parameters:
#   $match - match hash reference; the parsers add their price keys to it
#   $mech  - WWW::Mechanize agent used for the feed requests
#
# Side effects: writes the 1X2 feed response to "1x2.dat" in the current
# directory and sleeps for one second before returning.
sub parse_match {

    my ( $match, $mech ) = @_;

    parse_1x2( $match, $mech );

    # Dump the response while the agent still holds the 1X2 feed. Saving
    # after parse_ou() would write the over/under response under this name.
    $mech->save_content("1x2.dat");

    parse_ou( $match, $mech );

    sleep 1;
}

# Fetch the over/under feed for a match and copy selected prices into it.
#
# Parameters:
#   $match - match hash reference; $match->{id} selects the feed file
#   $mech  - WWW::Mechanize agent used for the request
#
# Keys written to $match, for each goal line 1.5, 2.5 and 3.5:
#   pinnacle_over_<line>_price, pinnacle_under_<line>_price   (bookmaker 18)
#   betfair_over_<line>_price,  betfair_under_<line>_price    (bookmaker 44)
# and for the 1.5 line only:
#   betfair_lay_over_1.5_price, betfair_lay_under_1.5_price
#   average_over_1.5_price, average_under_1.5_price,
#   highest_over_1.5, highest_under_1.5
# Prices of a bookmaker that is absent from the feed are set to "".
#
# The 1.5 averages are stored under their own keys so that they no longer
# overwrite average_home_price / highest_home etc., which parse_1x2() fills
# from the 1X2 market.
#
# Side effects: writes the raw response to "ou.dat" and sleeps one second.
# Dies if the request fails or the payload is not valid JSON.
sub parse_ou {

    my ( $match, $mech ) = @_;
    $mech->get( "http://www.oddsportal.com/feed/postmatch/1-1-"
          . $match->{id}
          . "-1321390800-2-2.dat" );

    $mech->save_content("ou.dat");

    # The JSON document is wrapped in "-|-" markers; strip them.
    my $json = $mech->content();
    $json =~ s/^-\|-|-\|-$//sg;

    my $data = decode_json $json;

    # Defensive: treat anything other than a hash as "no markets".
    my $back = $data->{d}->{oddsdata}->{back};
    $back = {} unless ref($back) eq 'HASH';

    # Bookmaker ids as used by the feed, named after the keys they fill.
    my %bookmaker_id = ( pinnacle => 18, betfair => 44 );

    # Returns the odds table of one goal line without autovivifying
    # missing markets into the decoded feed.
    my $odds_for_line = sub {
        my ($line) = @_;
        my $market = $back->{"E-2-2-0-${line}-0"} || {};
        return $market->{odds} || {};
    };

    foreach my $line (qw(1.5 2.5 3.5)) {

        my $odds = $odds_for_line->($line);

        foreach my $name ( sort keys %bookmaker_id ) {

            my $prices = $odds->{ $bookmaker_id{$name} };

            # Outcome 0 is stored as "over", outcome 1 as "under".
            my ( $over, $under ) =
              defined $prices ? @{$prices}{qw(0 1)} : ( "", "" );

            $match->{"${name}_over_${line}_price"}  = $over;
            $match->{"${name}_under_${line}_price"} = $under;
        }
    }

    # NOTE(review): the lay prices are copies of the back prices, exactly as
    # before. Presumably a separate lay table was intended; confirm against
    # the feed before changing this.
    $match->{"betfair_lay_over_1.5_price"}  = $match->{"betfair_over_1.5_price"};
    $match->{"betfair_lay_under_1.5_price"} = $match->{"betfair_under_1.5_price"};

    # find_averages_1x2() returns (avg0, avg1, avg2, high0, high1, high2);
    # an over/under market only has outcomes 0 and 1.
    my @summary = find_averages_1x2(
        $odds_for_line->("1.5"),
        [
            qw(
              14
              3
              16
              76
              2
              147
              28
              41
              33
              60
              18
              75
              101
              15
              )
        ]
    );

    $match->{"average_over_1.5_price"}  = $summary[0];
    $match->{"average_under_1.5_price"} = $summary[1];
    $match->{"highest_over_1.5"}        = $summary[3];
    $match->{"highest_under_1.5"}       = $summary[4];

    sleep 1;
}

# Fetch the 1X2 feed for a match and copy selected prices into it.
#
# Parameters:
#   $match - match hash reference; $match->{id} selects the feed file
#   $mech  - WWW::Mechanize agent used for the request
#
# Keys written to $match:
#   pinnacle_home_price, pinnacle_draw_price, pinnacle_away_price
#       outcomes 0, 1, 2 of bookmaker 18, or "" when that bookmaker is
#       absent (the same convention parse_ou() uses)
#   average_home_price, average_draw_price, average_away_price,
#   highest_home, highest_draw, highest_away
#       as returned by find_averages_1x2() for the listed bookmaker ids
#
# Dies if the request fails or the payload is not valid JSON.
sub parse_1x2 {

    my ( $match, $mech ) = @_;
    $mech->get( "http://www.oddsportal.com/feed/postmatch/1-1-"
          . $match->{id}
          . "-1321390800-1-2.dat" );

    # The JSON document is wrapped in "-|-" markers; strip them.
    my $json = $mech->content();
    $json =~ s/^-\|-|-\|-$//sg;

    my $data = decode_json $json;

    # Walk down to the odds table without autovivifying missing levels.
    # Defensive: anything other than a hash counts as "no markets".
    my $back = $data->{d}->{oddsdata}->{back};
    $back = {} unless ref($back) eq 'HASH';

    my $market = $back->{"E-1-2-0-0-0"} || {};
    my $odds   = $market->{odds}        || {};

    # Slicing an undefined value as a hash is fatal under strict refs, so a
    # match without bookmaker 18 must be handled explicitly.
    my $pinnacle = $odds->{18};

    (
        $match->{pinnacle_home_price},
        $match->{pinnacle_draw_price},
        $match->{pinnacle_away_price}
      )
      = defined $pinnacle
      ? @{$pinnacle}{qw(0 1 2)}
      : ( "", "", "" );

    (
        $match->{average_home_price}, $match->{average_draw_price},
        $match->{average_away_price}, $match->{highest_home},
        $match->{highest_draw},       $match->{highest_away}
      )
      = find_averages_1x2(
        $odds,
        [
            qw(
              14
              3
              16
              76
              2
              147
              28
              41
              33
              60
              18
              75
              101
              15
              )
        ]
      );
}

# Average and highest prices per outcome over a set of bookmakers.
#
# Parameters:
#   $bookmakers - hash reference: bookmaker id => { 0 => price, 1 => price,
#                 2 => price }
#   $ids        - array reference of bookmaker ids to include
#
# Returns a six-element list:
#   ( avg_0, avg_1, avg_2, highest_0, highest_1, highest_2 )
# The averages are formatted with "%0.2f". Only bookmakers that are present
# in $bookmakers take part, so an absent bookmaker no longer drags the
# average towards zero. When none of the ids is present (or $ids is empty)
# the averages are "0.00" and the highest values are undef, instead of a
# division-by-zero error.
#
# $bookmakers is never modified; missing ids are not autovivified into it.
sub find_averages_ou {

    my ( $bookmakers, $ids ) = @_;

    my @sum     = ( 0, 0, 0 );
    my @highest = ( undef, undef, undef );
    my $counter = 0;

    foreach my $id ( @{ $ids || [] } ) {

        # A plain fetch of $bookmakers->{$id} does not create the key.
        next unless ref($bookmakers) eq 'HASH';
        my $prices = $bookmakers->{$id};
        next unless ref($prices) eq 'HASH';

        foreach my $outcome ( 0 .. 2 ) {

            my $price = $prices->{$outcome};
            next unless defined $price;

            $sum[$outcome] += $price;

            if ( !defined $highest[$outcome] || $highest[$outcome] < $price ) {

                $highest[$outcome] = $price;
            }
        }

        $counter++;
    }

    my @average =
      map { sprintf( "%0.2f", $counter ? $sum[$_] / $counter : 0 ) } 0 .. 2;

    return ( @average, @highest );
}


# Average and highest home/draw/away prices over a set of bookmakers.
#
# Parameters:
#   $bookmakers - hash reference: bookmaker id => { 0 => home price,
#                 1 => draw price, 2 => away price }
#   $ids        - array reference of bookmaker ids to include
#
# Returns a six-element list:
#   ( avg_home, avg_draw, avg_away, highest_home, highest_draw, highest_away )
# The averages are formatted with "%0.2f". Only bookmakers that are present
# in $bookmakers take part, so an absent bookmaker no longer drags the
# average towards zero. When none of the ids is present (or $ids is empty)
# the averages are "0.00" and the highest values are undef, instead of a
# division-by-zero error.
#
# $bookmakers is never modified; missing ids are not autovivified into it.
sub find_averages_1x2 {

    my ( $bookmakers, $ids ) = @_;

    # Index 0 = home, 1 = draw, 2 = away.
    my @sum     = ( 0, 0, 0 );
    my @highest = ( undef, undef, undef );
    my $counter = 0;

    foreach my $id ( @{ $ids || [] } ) {

        # A plain fetch of $bookmakers->{$id} does not create the key.
        next unless ref($bookmakers) eq 'HASH';
        my $prices = $bookmakers->{$id};
        next unless ref($prices) eq 'HASH';

        foreach my $outcome ( 0 .. 2 ) {

            my $price = $prices->{$outcome};
            next unless defined $price;

            $sum[$outcome] += $price;

            if ( !defined $highest[$outcome] || $highest[$outcome] < $price ) {

                $highest[$outcome] = $price;
            }
        }

        $counter++;
    }

    my @average =
      map { sprintf( "%0.2f", $counter ? $sum[$_] / $counter : 0 ) } 0 .. 2;

    return ( @average, @highest );
}

# Convert a date written as DDMMYYYY into YYYY-MM-DD.
#
# Parameter:
#   $date - eight digits: day, month, year
#
# Returns the date as "YYYY-MM-DD".
# Dies when $date is undefined or is not exactly eight digits; without this
# check such input would come back as the meaningless string "--".
sub format_date {

    my ($date) = @_;

    my ( $day, $month, $year ) =
      defined $date ? $date =~ m{\A(\d{2})(\d{2})(\d{4})\z} : ();

    unless ( defined $year ) {

        my $shown = defined $date ? "'$date'" : "undef";
        die "format_date: expected a DDMMYYYY date, got $shown\n";
    }

    return join( "-", $year, $month, $day );
}

# Collect the matches played between two dates (inclusive).
#
# Parameters:
#   $mech       - WWW::Mechanize agent already positioned on a league
#                 results page
#   $start_date - first day, "YYYY-MM-DD"
#   $end_date   - last day, "YYYY-MM-DD"
#
# Returns a list of match hash references as built by process_season().
#
# The season is chosen from $start_date alone: dates before 1 August belong
# to the season that started the previous year. Matches of a range that
# crosses into another season are therefore not collected.
sub find_matches {

    my ( $mech, $start_date, $end_date ) = @_;
    my @matches;

    # Only the year is needed. The date is YYYY-MM-DD, so the remaining
    # fields would be month and day, in that order.
    my ($year) = split /-/, $start_date;

    my $season = {};
    if ( $start_date lt $year . "-08-01" ) {

        $season->{title} = join( "/", $year - 1, $year );
    }
    else {

        $season->{title} = join( "/", $year, $year + 1 );
    }

    say "\t", $season->{title};

    # Season archives are reached through a link whose text is "YYYY/YYYY".
    $mech->follow_link( text => $season->{title} );

    process_season( $mech, $season );

    my $range = Date::Range->new( Date::Simple->new($start_date),
        Date::Simple->new($end_date) );

    foreach my $date ( $range->dates ) {

        next unless exists $season->{matches}->{$date};

        push @matches, @{ $season->{matches}->{$date} };
    }

    return @matches;
}

# Read every results page of a season and index its matches by date.
#
# Parameters:
#   $mech   - WWW::Mechanize agent positioned on the first results page
#   $season - hash reference; matches are pushed onto
#             $season->{matches}->{<YYYY-MM-DD>} as hash references from
#             get_match_info(), extended with match_date, home, away,
#             home_team_goals and away_team_goals
#
# Pages are followed through the pagination links until the active page is
# the last one. Each page's parse tree is deleted before the next page is
# requested, so memory use does not grow with the number of pages.
sub process_season {

    my ( $mech, $season ) = @_;

    say "\t\tCollecting season info...";

    while (1) {

        my $tree =
          HTML::TreeBuilder::XPath->new_from_content( $mech->content() );

        my ($current_page) =
          $tree->findvalues('//span[@class = "active-page"]');
        my ($last_page_url) =
          $tree->findvalues('//div[ @id = "pagination"]/a[ last() ]/@href');
        my ($next_page_url) =
          $tree->findvalues('//div[ @id = "pagination"]/a[ last() -1 ]/@href');

        # The page number is the trailing path segment of the last link.
        my ($last_page) =
          defined $last_page_url ? $last_page_url =~ m{/(\d+)/$} : ();

        # Date header rows set the day for the match rows that follow them.
        my $match_day;
        foreach my $row (
            $tree->findnodes('//table[ @id = "tournamentTable" ]/tbody/tr') )
        {

            # Rows without a class attribute are neither headers nor matches.
            my $class = $row->attr('class');
            next unless defined $class;

            if ( $class eq "center nob-border" ) {

                my ($match_day_string) = $row->findvalues('./th[1]/span');
                $match_day = convert_match_day($match_day_string);
            }
            elsif ( $class =~ m/deactivate/ ) {

                my $match = get_match_info($row);
                $match->{match_date} = $match_day;

                ( $match->{home}, $match->{away} ) = split /\s+-\s+/,
                  $match->{title};
                ( $match->{home_team_goals}, $match->{away_team_goals} ) =
                  split /:/, $match->{score};

                # Match rows seen before any date header are dropped.
                if ($match_day) {

                    push @{ $season->{matches}->{$match_day} }, $match;
                }
            }
        }

        $tree->delete();

        # No usable pagination means a single page; stop rather than
        # request a malformed URL.
        last
          unless defined $current_page
          && defined $last_page
          && defined $next_page_url;

        last if $current_page == $last_page;

        $mech->get( "http://www.oddsportal.com" . $next_page_url );
    }

    say "\t\tDone!";
}

# Extract the basic facts of one match from its results-table row.
#
# Takes the row element and returns a hash reference with the keys
#   start_time - text of the first cell
#   url        - absolute match URL built from the link in the second cell
#   id         - digits directly before the trailing "/" of that URL
#   title      - text of the link in the second cell
#   score      - text of the third cell
sub get_match_info {

    my $row = $_[0];

    my ($kickoff) = $row->findvalues('./td[1]');
    my ($href)    = $row->findvalues('./td[2]/a/@href');

    # The link in the table is site-relative.
    my $absolute_url = "http://www.oddsportal.com" . $href;
    my ($match_id) = $absolute_url =~ m{(\d+)/$};

    my ($teams)  = $row->findvalues('./td[2]/a');
    my ($result) = $row->findvalues('./td[3]');

    return {
        start_time => $kickoff,
        url        => $absolute_url,
        id         => $match_id,
        title      => $teams,
        score      => $result,
    };
}

# Turn a textual match day such as "15 November 2011" into "YYYY-MM-DD".
#
# Runs of whitespace in the input are collapsed to single spaces before
# parsing with the pattern "%d %B %Y" (en_US locale). The parser is created
# with on_error => 'croak'.
sub convert_match_day {

    my $text_date = $_[0];

    $text_date =~ s/\s+/ /g;

    my $parser = DateTime::Format::Strptime->new(
        on_error => 'croak',
        locale   => 'en_US',
        pattern  => '%d %B %Y',
    );

    my $parsed = $parser->parse_datetime($text_date);

    return $parsed ? $parsed->ymd() : $parsed;
}

Technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM