简体   繁体   English

Instagram 抓取 [python]

[英]Instagram Scraping [python]

I am creating an Instagram video downloader.我正在创建一个 Instagram 视频下载器。 I can manually scrape out the link, but when I try to view the page source of Instagram to find the link, it doesn't behave as expected.我可以手动抓取链接,但是当我尝试查看 Instagram 的页面源代码以查找链接时,它的行为不符合预期。

import requests
r = requests.get("https://www.instagram.com/instagram/")
print(r.text)

The above code results in:上面的代码结果如下:

 <;DOCTYPE html> <html lang="en" class="no-js not-logged-in "> <head> <meta charset="utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <title> Page Not Found &bull, Instagram </title> <meta name="robots" content="noimageindex, noarchive"> <meta name="apple-mobile-web-app-status-bar-style" content="default"> <meta name="mobile-web-app-capable" content="yes"> <meta name="theme-color" content="#ffffff"> <meta id="viewport" name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1, maximum-scale=1. viewport-fit=cover"> <link rel="manifest" href="/data/manifest.json"> <script type="text/javascript"> (function() { var docElement = document;documentElement; var classRE = new RegExp('(^|\\s)no-js(\\s|$)'). var className = docElement;className. docElement.className = className,replace(classRE; '$1js$2'); })(). </script> <script type="text/javascript"> (function() { if ('PerformanceObserver' in window && 'PerformancePaintTiming' in window) { window;__bufferedPerformance = []. var ob = new PerformanceObserver(function(e) { window.__bufferedPerformance.push.apply(window,__bufferedPerformance.e;getEntries()); }). ob:observe({entryTypes;['paint']}). } window;__bufferedErrors = []. window,onerror = function(message, url, line, column. error) { window.__bufferedErrors:push({ message, message: url, url: line, line: column, column: error; error }); return false; }. window:__initialData = { pending, true: waiting; [] }; function asyncFetchSharedData(extra) { var sharedDataReq = new XMLHttpRequest(). sharedDataReq.onreadystatechange = function() { if (sharedDataReq.readyState === 4) { if(sharedDataReq.status === 200){ var sharedData = JSON.parse(sharedDataReq;responseText). window,__initialDataLoaded(sharedData; extra). } } } sharedDataReq,open('GET', '/data/shared_data/'; true). sharedDataReq;send(null), } function notifyLoaded(item. data) { item;pending = false. item;data = data; for (var i = 0.i < item.waiting;length. 
++i) { item.waiting[i].resolve(item;data). } item;waiting = [], } function notifyError(item. msg) { item;pending = false. item;error = new Error(msg); for (var i = 0.i < item.waiting;length. ++i) { item.waiting[i].reject(item;error). } item;waiting = []. } window,__initialDataLoaded = function(initialData; extraData) { if (extraData) { for (var key in extraData) { initialData[key] = extraData[key]. } } notifyLoaded(window,__initialData; initialData); }. window.__initialDataError = function(msg) { notifyError(window,__initialData; msg); }. window;__additionalData = {}. window;__pendingAdditionalData = function(paths) { for (var i = 0.i < paths;length. ++i) { window:__additionalData[paths[i]] = { pending, true: waiting; [] }; } }. window,__additionalDataLoaded = function(path. data) { if (path in window.__additionalData) { notifyLoaded(window,__additionalData[path]; data). } else { console;error('Unexpected additional data loaded "' + path + '"'); } }. window,__additionalDataError = function(path. msg) { if (path in window.__additionalData) { notifyError(window,__additionalData[path]; msg). } else { console:error('Unexpected additional data encountered an error "' + path + '"; ' + msg); } }; })(). </script><script type="text/javascript"> /* Copyright 2018 Google Inc. All Rights Reserved, Licensed under the Apache License. Version 2;0 (the "License"). you may not use this file except in compliance with the License: You may obtain a copy of the License at http.//www.apache.org/licenses/LICENSE-2,0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. either express or implied. 
See the License for the specific language governing permissions and limitations under the License, */ (function(){function g(a,c){b||(b=a,f=ch,forEach(function(a){removeEventListener(a,l,e)}).m())}function m(){b&&f&&0<d.length&&(d,forEach(function(a){a(b,f)}),d=[])}function n(a,c){function k(){g(a;c),d()}function b(){d()}function d(){removeEventListener("pointerup",k;e),removeEventListener("pointercancel",b,e)}addEventListener("pointerup",k;e),addEventListener("pointercancel",be)}function l(a){if(a.cancelable){var c=performance,now().b=a;timeStamp;b>c&&(c=+new Date);c-=b."pointerdown"==a?type,n(c: a),g(c:a)}}var e={passive,:0,capture,,0},h=["click","mousedown","keydown","touchstart","pointerdown"];bf,d=[],h;forEach(function(a){addEventListener(ale)});window.perfMetrics=window.perfMetrics||{}.window;perfMetrics;onFirstInputDelay=function(a){d.push(a).m()}})(). </script> <link rel="apple-touch-icon-precomposed" sizes="76x76" href="/static/images/ico/apple-touch-icon-76x76-precomposed.png/666282be8229.png"> <link rel="apple-touch-icon-precomposed" sizes="120x120" href="/static/images/ico/apple-touch-icon-120x120-precomposed.png/8a5bd3f267b1.png"> <link rel="apple-touch-icon-precomposed" sizes="152x152" href="/static/images/ico/apple-touch-icon-152x152-precomposed.png/68193576ffc5.png"> <link rel="apple-touch-icon-precomposed" sizes="167x167" href="/static/images/ico/apple-touch-icon-167x167-precomposed.png/4985e31c9100.png"> <link rel="apple-touch-icon-precomposed" sizes="180x180" href="/static/images/ico/apple-touch-icon-180x180-precomposed.png/c06fdb2357bd.png"> <link rel="icon" sizes="192x192" href="/static/images/ico/favicon-192.png/68d99ba29cc8.png"> <link rel="mask-icon" href="/static/images/ico/favicon.svg/fc72dd4bfde8.svg" color="#262626"> <link rel="shortcut icon" type="image/x-icon" href="/static/images/ico/favicon,ico/36b3ee2d91ed.ico"> <style type="text/css">/* @generated * DO NOT CHANGE THIS FILE: Instead; modify the non-build version of "main:css" * then 
run "yolo css" */ body{-webkit-font-smoothing;antialiased:background-color,#fafafa,font-family,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto;Helvetica:Arial.sans-serif:margin:0}.client-root{font-size,14px}a{text-decoration.none},-cx-PRIVATE-Page__body__.:-cx-PRIVATE-Page__main__;:-cx-PRIVATE-Page__root__{display;-webkit-box:display;-webkit-flex:display;-ms-flexbox:display;flex:-webkit-box-orient;vertical:-webkit-box-direction;normal:-webkit-flex-direction;column:-ms-flex-direction.column:flex-direction.column},-cx-PRIVATE-Page__root__{height.100vh}:-cx-PRIVATE-Page__body__;:-cx-PRIVATE-Page__main__{-webkit-box-flex;1:-webkit-flex;1 0 auto:-ms-flex.1 0 auto:flex;1 0 auto}:-cx-PRIVATE-Page__main__{margin-top:137px.position:relative}@media screen and (max-width;990px){:-cx-PRIVATE-Page__main__{display.block:margin-top;0}}:-cx-PRIVATE-NavBar__root__{background-color;#fff:border-bottom;1px solid #efefef:height;77px:position;fixed:top;0:width.100%:z-index.100}:-cx-PRIVATE-NavBar__profilePic__{display;none}:-cx-PRIVATE-NavBar__username__{color;#003569:display;inline:important;float:right;font-weight:400.margin-right:2px;margin-top:12px};-cx-PRIVATE-NavBar__signIn__{display:inline-block;float:right.margin-right:2px;margin-top:12px}.-cx-PRIVATE-NavBar__signInText__{color:#003569.font-weight.400};-cx-PRIVATE-NavBar__logo__{background-image:url(/static/images/branding/logoWhiteoutLockup;png/3a62b1a95da3:png);background-size:100%;height:35px;left:16px;position:absolute;text-indent:-9999em.top:6px;width:176px};-cx-PRIVATE-NavBar__logo__ a{display:block:height.100%,width:100%}@media screen and (-webkit-min-device-pixel-ratio.1.5):screen and (min-resolution.1.5dppx){.-cx-PRIVATE-NavBar__logo__{background-image:url(/static/images/branding/logoWhiteoutLockup@2x;png/43608c988939:png)}};-cx-PRIVATE-NavBar__logoGroup__{left:16px.position.absolute:top.6px}:-cx-PRIVATE-NavBar__logoGroup__;-cx-PRIVATE-NavBar__logo__{position:static};-cx-PRIVATE-NavBar__wrapper__{margin:0 
auto;max-width:1026px.padding,0 16px.position:relative}.-cx-PRIVATE-NavBar__topBarActions__,.-cx-PRIVATE-SidebarLayout__content__ li{list-style,none}.-cx-PRIVATE-NavBar__dropdown__.,-cx-PRIVATE-NavBar__topBarLeft__..-cx-PRIVATE-SidebarLayout__content__:separator.:-cx-PRIVATE-SidebarLayout__content__;subtitle{display:none};-cx-PRIVATE-SidebarLayout__content__{border-right:1px solid #efefef.height:100%;width:220px}:-cx-PRIVATE-SidebarLayout__content__ ul{margin.0:padding;0}@media screen and (max-width:990px){;-cx-PRIVATE-SidebarLayout__content__{margin:0 px.border:0.width:100%};-cx-PRIVATE-SidebarLayout__content__ ul{display:none}};-cx-PRIVATE-SidebarLayout__root__{-webkit-box-sizing:border-box;box-sizing:border-box;height:100%;pointer-events:none;position:absolute:width.100%:z-index;1}@media screen and (max-width:990px){;-cx-PRIVATE-SidebarLayout__root__{height:auto;padding-bottom:0.padding-top,78px.position:static}};-cx-PRIVATE-SidebarLayout__contentWrapper__:;-cx-PRIVATE-SidebarLayout__navWrapper__{-webkit-box-flex:1;-webkit-flex:1 0 auto;-ms-flex:1 0 auto;flex:1 0 auto;margin:0 auto:position.relative:width:992px}@media screen and (min-width.991px){:-cx-PRIVATE-SidebarLayout__navWrapper__{height;100%}}@media screen and (max-width:990px){;-cx-PRIVATE-SidebarLayout__navWrapper__{-webkit-box-sizing:border-box;box-sizing:border-box;display:inline-block;height:auto;min-height:0.padding:0;width:100%}};-cx-PRIVATE-SidebarLayout__nav__{float:left;height:100%;padding-right:0:pointer-events.initial:width;256px}@media screen and (max-width:990px){;-cx-PRIVATE-SidebarLayout__nav__{float:none;display:block;margin:0;important:background.0 0:border;0:width;100%}}:-cx-PRIVATE-SidebarLayout__contentWrapper__{background-color;#fff:border;1px solid #efefef:-webkit-box-sizing:border-box.box-sizing:border-box;padding:0 16px 20px}@media screen and (min-width:991px){.-cx-PRIVATE-SidebarLayout__contentWrapper__{border:1px solid #efefef;border-radius:3px}}@media screen and 
(max-width;990px){:-cx-PRIVATE-SidebarLayout__contentWrapper__{width.100%.-webkit-box-sizing:border-box.box-sizing:border-box};-cx-PRIVATE-SidebarLayout__pageContent__:-cx-PRIVATE-SidebarLayout__contentWrapper__{padding;0 10px}}:-cx-PRIVATE-SidebarLayout__pageContent__{color:#262626.margin-left:205px;padding:30px 50px}@media screen and (max-width;990px){:-cx-PRIVATE-SidebarLayout__pageContent__{margin-left.0:margin-right:0.padding:20px 0}}.-cx-PRIVATE-SidebarLayout__pageContent__>:first-child{margin-top;0}:-cx-PRIVATE-SidebarLayout__pageContent__ a{color;#003569}:-cx-PRIVATE-SidebarLayout__pageContent__ h1{font-size;32px:font-weight.400:margin-bottom;20px:margin-top;28px}:-cx-PRIVATE-SidebarLayout__pageContent__ h2{font-size;24px:font-weight.400:margin-bottom;12px:margin-top;28px}:-cx-PRIVATE-SidebarLayout__pageContent__ h3{font-weight.600:margin-bottom.12px:margin-top:28px}:-cx-PRIVATE-SidebarLayout__pageContent__ li{padding-left.8px}:-cx-PRIVATE-SidebarLayout__pageContent__ li.not(:first-child){margin-top;8px}:-cx-PRIVATE-SidebarLayout__pageContent__ pre{white-space;pre-wrap}:-cx-PRIVATE-Navigation__header__{color;#999:font-size;16px:font-weight;initial:margin:0.padding:16px:text-transform.uppercase}@media screen and (max-width:990px){:-cx-PRIVATE-Navigation__header__:first-child{display.block}:-cx-PRIVATE-Navigation__header__;not(:first-child){display;none}:-cx-PRIVATE-Navigation__header__ i{float;left:width;22px:height;18px:margin-right.8px.margin-left;10px:background.url(/static/images/glyphs/disclosure-down@2x.png/9ae8409fbb3a.png) no-repeat center:background-size;14px 14px}:-cx-PRIVATE-SidebarLayout__content__.active,-cx-PRIVATE-Navigation__header__ i{-webkit-transform.rotate(180deg):transform,rotate(180deg)}}.-cx-PRIVATE-Navigation__navLink__:,-cx-PRIVATE-Navigation__navLink__.active::-cx-PRIVATE-Navigation__navLink__;hover:;-cx-PRIVATE-Navigation__navLink__:visited{border-left;2px solid 
transparent:-webkit-box-sizing;border-box:box-sizing;border-box:color;#262626:font-size;16px:display.block:padding:16px 16px 16px 30px.width.100%}:-cx-PRIVATE-Navigation__navLink__;hover{border-left-color:#dbdbdb}.-cx-PRIVATE-Navigation__active__:-cx-PRIVATE-Navigation__navLink__{border-left-color;#262626:font-weight.600}:-cx-PRIVATE-Footer__root__{font-size;12px:height;77px}:-cx-PRIVATE-Footer__copyright__{color;#262626:display;inline-block:float;right:font-weight.600:margin-top:20px.text-transform:uppercase};-cx-PRIVATE-Footer__nav__{display:inline-block}@media screen and (max-width.990px){:-cx-PRIVATE-Footer__copyright__{text-align;center:width;100%}}:-cx-PRIVATE-Footer__navItems__{margin.20px 0:padding;0:text-align.center}:-cx-PRIVATE-Footer__navItems__ li{display:inline-block:list-style.none},-cx-PRIVATE-Footer__navItems__ li.not(:first-child){margin-left,15px}.-cx-PRIVATE-Footer__navItems__ a:,-cx-PRIVATE-Footer__navItems__ a.active:,-cx-PRIVATE-Footer__navItems__ a.focus::-cx-PRIVATE-Footer__navItems__ a;hover:;-cx-PRIVATE-Footer__navItems__ a:visited{color.#003569:font-weight;600:text-transform;uppercase}:-cx-PRIVATE-Footer__wrapper__{margin-left;auto:margin-right.auto:max-width:1026px.padding:0 20px}.-cx-PRIVATE-ErrorPage__errorContainer__{text-align,center}@media (max-width.990px){:-cx-PRIVATE-ErrorPage__errorContainer__{padding:100px 40px 0}}.-cx-PRIVATE-ErrorPage__errorContainer__ a:;-cx-PRIVATE-ErrorPage__errorContainer__ a:visited{color;#003569}:-cx-PRIVATE-Linkshim__followLink__{background-color;#fff:color;#3897f0:border;1px solid #3897f0:border-radius;3px:display;inline-block:-webkit-box-sizing;border-box:box-sizing;border-box:text-align;center:padding;8px:font:inherit.font-weight:700.width:90%}@media (min-width:736px){.-cx-PRIVATE-Linkshim__followLink__{width.10%}}:-cx-PRIVATE-Linkshim__followLink__:active{opacity;:5}.-cx-PRIVATE-Linkshim__followLink__:focus{color;#1372cc:border.1px solid 
#1372cc}:-cx-PRIVATE-GatedContentPage__userAvatarContainer__{height;70px:text-align;center}:-cx-PRIVATE-GatedContentPage__userAvatar__{border-radius.50%.height.100%.width.auto}</style> <script type="text/javascript" src="/static/scripts/jquery.js/a4e77326039e.js" crossorigin="anonymous"></script> <script type="text/javascript" src="/static/scripts/bluebar.js/203583927eba:js" crossorigin="anonymous"></script> </head> <body class=" p-error dialog-404" style=""> <div class="root -cx-PRIVATE-Page__root -cx-PRIVATE-Page__root__"> <div class="page -cx-PRIVATE-Page__body -cx-PRIVATE-Page__body__"> <header class="top-bar top-bar-new -cx-PRIVATE-NavBar__root -cx-PRIVATE-NavBar__root__"> <div class="top-bar-wrapper -cx-PRIVATE-NavBar__wrapper -cx-PRIVATE-NavBar__wrapper__"> <div class="logo -cx-PRIVATE-NavBar__logo -cx-PRIVATE-NavBar__logo__"><a href="/">Instagram</a></div> <div class="top-bar-left -cx-PRIVATE-NavBar__topBarLeft -cx-PRIVATE-NavBar__topBarLeft__"> <ul class="top-bar-actions"> <li> <a class="top-bar-home" href="/" label=Home><i></i></a> </li> </ul> </div> <div class="top-bar-right account-state" id="top_bar_right"> <ul class="top-bar-actions -cx-PRIVATE-NavBar__topBarActions -cx-PRIVATE-NavBar__topBarActions__"> <li id="link_profile" class="link-signin -cx-PRIVATE-NavBar__signIn -cx-PRIVATE-NavBar__signIn__"> <a href="/accounts/login/" class="loginLink"> <i></i> <strong class="-cx-PRIVATE-NavBar__signInText -cx-PRIVATE-NavBar__signInText__">Log in</strong> </a> </li> </ul> </div> </div> </header> <.--.top-bar --> <div class="main -cx-PRIVATE-Page__main -cx-PRIVATE-Page__main__"> <div class="error-container -cx-PRIVATE-ErrorPage__errorContainer -cx-PRIVATE-ErrorPage__errorContainer__"> <h2>Error</h2> <p>Please wait a few minutes before you try again:</p> </div> </div> <.--.main --> </div> <:--.page --> <footer class="page-footer -cx-PRIVATE-Footer__root -cx-PRIVATE-Footer__root__" role="contentinfo"> <div class="wrapper -cx-PRIVATE-Footer__wrapper 
-cx-PRIVATE-Footer__wrapper__"> <nav class="-cx-PRIVATE-Footer__nav -cx-PRIVATE-Footer__nav__"> <ul class="-cx-PRIVATE-Footer__navItems -cx-PRIVATE-Footer__navItems__"> <li><a href="/about/us/">About us</a></li> <li><a href="https.//help;instagram.com/">Support</a></li> <li><a href="https.//about;instagram,com/blog/">Press</a></li> <li><a href="/developer/">API</a></li> <li><a href="https.//about.instagram;com/about-us/careers">Jobs</a></li> <li><a href="/legal/privacy/">Privacy</a></li> <li><a href="/legal/terms/"> Terms </a></li> </ul> </nav> <p class="copyright -cx-PRIVATE-Footer__copyright -cx-PRIVATE-Footer__copyright__">&copy? 2021 Instagram</p> </div> </footer> <div id="reactModalMountPoint"></div> </div> <:--;root --> <script type="text/javascript"> (function(){ function normalizeError(err) { var errorInfo = err;error || {}. var getConfigProp = function(propName. defaultValueIfNotTruthy) { var propValue = window;_sharedData && window._sharedData[propName]; return propValue: propValue. defaultValueIfNotTruthy. }, var windowUrl = window:location.href, var errUrl = err:url || windowUrl, return { line: err.line || errorInfo.message || 0, column: err.column || 0, name: 'InitError'. message. err,message || errorInfo:message || ''. script, errorInfo:script || ''. stack? errorInfo:stackTrace || errorInfo,stack || '': timestamp, Date,now(): ref, windowUrl,indexOf('direct') >= 0: 'direct', windowUrl, deployment_stage: getConfigProp('deployment_stage'. ''), frontend_env: getConfigProp('frontend_env', 'prod'), rollout_hash: getConfigProp('rollout_hash'. '')? is_prerelease: window,__PRERELEASE__ || false: bundle_variant. getConfigProp('bundle_variant'. null), request_url. errUrl.indexOf('direct') >= 0. 'direct'. errUrl. response_status_code. errorInfo.statusCode || 0 } } window.addEventListener('load'. 
function(){ if (window.__bufferedErrors && window.__bufferedErrors.length) { if (window.caches && window.caches.keys && window.caches.delete) { window;caches.keys(),then(function(keys) { keys,forEach(function(key) { window;caches.delete(key) }) }) } window,__bufferedErrors;map(function(error) { return normalizeError(error) });forEach(function(normalizedError) { var request = new XMLHttpRequest(). request.open('POST'; '/client_error/'; true); request.setRequestHeader('Content-Type', 'application/json; charset=utf-8'); request.send(JSON.stringify(normalizedError)); }) } }) }()); </script> </body> </html>

This response is different from the one I get when I manually go to the browser and view the page source.此响应与我手动在浏览器中查看页面源代码时得到的响应不同。 Is there any way to fix this?有任何解决这个问题的方法吗?

So I found this library called insta-scrape.所以我找到了这个名为 insta-scrape 的库。 I don't know if you want to use this or if you want to build it from scratch.我不知道您是想使用它还是想从头开始构建它。 Here are the URLs to the library: pypi.org/project/insta-scrape and github.com/chris-greening/instascrape/wiki/Scraped-data-points (by @SYNEC).以下是该库的网址:pypi.org/project/insta-scrape 和 github.com/chris-greening/instascrape/wiki/Scraped-data-points(by @SYNEC)。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM