[英]What is the best way to scrape data from from multiple websites?
我尝试过的是,我使用 volley 请求从第一个网站抓取,并在其中为每个网站创建了多个线程,在每个线程中我使用 jsoup connect 方法来抓取而不是 volley。 它完成工作,实际上更快。 但是,问题在于它在抓取数据直到完全加载时冻结应用程序。 它冻结了进度条,我无法找到原因。
这是我实现的代码。 有点长。
// Checking the connection
final StringRequest request = new StringRequest("https://www.google.com/", new Response.Listener<String>() {
@Override
public void onResponse(String response) {
relativeLayout.setVisibility(View.GONE);
// instances for each required website
final HimalayanTimes himalayanTimes = new HimalayanTimes(getContext());
final GsmArena gsmArena = new GsmArena();
final CinemaBlend cinemaBlend = new CinemaBlend();
final KathmanduPost kathmanduPost = new KathmanduPost(getContext());
final GlobalNews globalNews = new GlobalNews();
final NepaliTimes nepaliTimes = new NepaliTimes(getContext());
final GoalNepal goalNepal = new GoalNepal(getContext());
final GadgetByte gadgetByte = new GadgetByte();
final TechLekh techLekh = new TechLekh();
final OnlineKhabar onlineKhabar = new OnlineKhabar();
final NepaliSansar nepaliSansar = new NepaliSansar();
final CricketingNepal cricketingNepal = new CricketingNepal();
// thread for each website
// thread fot thehimalayantimes
Thread thread = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> himalyannews;
himalyannews = himalayanTimes.getNews();
news.addAll(himalyannews);
for(int i=0; i<4; i++){
finalHeadlines.add(himalyannews.get(i));
}
} catch (Exception ignored) {
}
}
});
thread.start();
// thread for gsmArena
Thread thread1 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> gsmarenanews;
gsmarenanews = gsmArena.getNews();
news.addAll(gsmarenanews);
for(int i=0; i<3; i++){
headlines.add(gsmarenanews.get(i));
}
} catch (Exception ignored) {
}
}
});
thread1.start();
// thread for cinemaBlend
Thread thread2 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> cinemablendnews;
cinemablendnews = cinemaBlend.getNews();
news.addAll(cinemablendnews);
for(int i=0; i<4; i++){
headlines.add(cinemablendnews.get(i));
}
} catch (Exception ignored) {
}
}
});
thread2.start();
// thread for kathmanduPost
Thread thread3 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> kathmandupostnews;
kathmandupostnews = kathmanduPost.getNews();
news.addAll(kathmandupostnews);
for(int i=0; i<3; i++){
finalHeadlines.add(kathmandupostnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread3.start();
// thread for globalNews
Thread thread4 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> globalnewsnews;
globalnewsnews = globalNews.getNews();
news.addAll(globalnewsnews);
for(int i=0; i<5; i++){
finalHeadlines.add(globalnewsnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread4.start();
// thread for nepaliTimes
Thread thread5 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> nepalitimesnews;
nepalitimesnews = nepaliTimes.getNews();
news.addAll(nepalitimesnews);
for(int i=0; i<3; i++){
finalHeadlines.add(nepalitimesnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread5.start();
// thread for GoalNepal
Thread thread6 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> goalNepalNews;
goalNepalNews = goalNepal.getNews();
news.addAll(goalNepalNews);
for (int i=0; i<4; i++){
headlines.add(goalNepalNews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread6.start();
// thread for GadgetByteNepal
Thread thread7 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> gadgetbytenews;
gadgetbytenews = gadgetByte.getNews();
news.addAll(gadgetbytenews);
for (int i=0; i<3; i++){
headlines.add(gadgetbytenews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread7.start();
// thread for Techlekh
Thread thread8 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> techlekhnews;
techlekhnews = techLekh.getNews();
news.addAll(techlekhnews);
for (int i=0; i<3; i++){
headlines.add(techlekhnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread8.start();
// thread for onlinekhabar
Thread thread9 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> onlineKhabarnews;
onlineKhabarnews = onlineKhabar.getNews();
news.addAll(onlineKhabarnews);
for (int i=0; i<4; i++){
finalHeadlines.add(onlineKhabarnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread9.start();
//thread for nepalisansar
Thread thread11 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> nepalisansarnews;
nepalisansarnews = nepaliSansar.getNews();
news.addAll(nepalisansarnews);
for (int i=0; i<4; i++){
finalHeadlines.add(nepalisansarnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread11.start();
// thread for cricketingNepal
Thread thread12 = new Thread(new Runnable() {
@Override
public void run() {
try {
ArrayList<NewsItem> cricketnews;
cricketnews = cricketingNepal.getNews();
news.addAll(cricketnews);
for (int i=0; i<4; i++){
headlines.add(cricketnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread12.start();
// main thread wait for each thread to finish
try {
thread.join();
} catch (InterruptedException ignored) {
}
try {
thread1.join();
} catch (InterruptedException ignored) {
}
try {
thread2.join();
} catch (InterruptedException ignored) {
}
try {
thread3.join();
} catch (InterruptedException ignored) {
}
try {
thread4.join();
} catch (InterruptedException ignored) {
}
try {
thread5.join();
} catch (InterruptedException ignored) {
}
try {
thread6.join();
} catch (InterruptedException ignored) {
}
try {
thread7.join();
} catch (InterruptedException ignored) {
}
try {
thread8.join();
} catch (InterruptedException ignored) {
}
try {
thread8.join();
} catch (InterruptedException ignored) {
}
try {
thread9.join();
} catch (InterruptedException ignored) {
}
try {
thread11.join();
} catch (InterruptedException ignored) {
}
try {
thread12.join();
} catch (InterruptedException ignored) {
}
for(NewsItem item : news){
if (item.tag.contains("kathmandu"))
nepal.add(item);
if (item.tag.contains("cricket"))
sports.add(item);
if (item.tag.contains("football"))
sports.add(item);
switch (item.tag) {
case "nepal":
nepal.add(item);
break;
case "world":
world.add(item);
break;
case "sports":
sports.add(item);
break;
case "tech":
tech.add(item);
break;
case "entertainment":
entertainment.add(item);
break;
}
}
// putting each news item to the main container
Collections.shuffle(headlines);
Collections.shuffle(finalHeadlines);
finalHeadlines.addAll(headlines);
Collections.shuffle(nepal);
Collections.shuffle(world);
Collections.shuffle(sports);
Collections.shuffle(tech);
Collections.shuffle(entertainment);
tab1 t1 = new tab1(finalHeadlines);
t1.setRetainInstance(true);
tab2 t2 = new tab2(nepal);
t2.setRetainInstance(true);
tab3 t3 = new tab3(world);
t3.setRetainInstance(true);
tab4 t4 = new tab4(sports);
t4.setRetainInstance(true);
tab5 t5 = new tab5(tech);
t5.setRetainInstance(true);
tab6 t6 = new tab6(entertainment);
t6.setRetainInstance(true);
assert getFragmentManager() != null;
pagerAdapter = new PageAdapter(finalHeadlines, nepal, world, sports, tech, entertainment, getFragmentManager(), tabLayout.getTabCount());
viewPager.setAdapter(pagerAdapter);
shimmerFrameLayout.setVisibility(View.GONE);
}
}, new Response.ErrorListener() {
@Override
public void onErrorResponse(VolleyError error) {
Toast.makeText(getContext(), "Internet Connection Error!", Toast.LENGTH_SHORT).show();
shimmerFrameLayout.setVisibility(View.GONE);
tabLayout.setVisibility(View.GONE);
}
});
queue.add(request);
对于每个网站,我制作了 class。以下 class 之一:-
public class CinemaBlend {
ArrayList<NewsItem> news;
public CinemaBlend() {
news = new ArrayList<>();
}
@RequiresApi(api = Build.VERSION_CODES.KITKAT)
public ArrayList<NewsItem> getNews() throws IOException{
String url = "https://www.cinemablend.com/news.php";
OkHttpClient okHttpClient = new OkHttpClient();
Request request = new Request.Builder().url(url).get().build();
Document document = Jsoup.parse(Objects.requireNonNull(okHttpClient.newCall(request).execute().body()).string());
Elements articles = document.select("div.order-of-type-2").select("div.story-related").select("a");
for(Element article : articles)
{
String link = article.attr("href");
String title = article.attr("title");
String img = article.select("div.story-related-content").select("span.story-cover-image").select("img").attr("data-src");
String date = article.select("span.story-related-published-date").text();
NewsItem newsItem = new NewsItem();
newsItem.imgsrc = img;
newsItem.title = title;
newsItem.link = link;
newsItem.tag = "entertainment";
newsItem.publisher = "cinemablend.com";
newsItem.source_logo = "https://image.pitchbook.com/WFQVGYL17V0MevlcfQKlWjC3E8K1447542818374_200x200";
if(!date.equals(""))
{
newsItem.date = date + " ago";
news.add(newsItem);
}
}
return news;
}
}
查找说明如何执行后台工作的教程。 有很多不同的方法可以做到这一点:服务、Kotlin 协程、简单的自我管理线程等。
远离有关 AsyncTasks 和 Loaders(已弃用)的教程。
Android 开发人员指南是一个很好的起点: https://developer.android.com/guide/background
但是在异步任务中执行任务会产生另一个类似的问题。 当我在后台创建类似的线程时,会出现完全类似的问题,它会跳过数据并且 ui 不会更新。 任何建议将不胜感激。
public class DownloadNews extends AsyncTask<Void, Void, Void>
{
@Override
protected void onPreExecute() {
shimmerFrameLayout.setVisibility(View.VISIBLE);
relativeLayout.setVisibility(View.GONE);
tabLayout.setVisibility(View.GONE);
}
@RequiresApi(api = Build.VERSION_CODES.KITKAT)
@Override
protected Void doInBackground(Void... voids) {
tabLayout.setOnTabSelectedListener(new TabLayout.OnTabSelectedListener() {
@Override
public void onTabSelected(TabLayout.Tab tab) {
viewPager.setCurrentItem(tab.getPosition());
}
@Override
public void onTabUnselected(TabLayout.Tab tab) {
}
@Override
public void onTabReselected(TabLayout.Tab tab) {
}
});
viewPager.addOnPageChangeListener(new TabLayout.TabLayoutOnPageChangeListener(tabLayout));
final RequestQueue queue = Volley.newRequestQueue(Objects.requireNonNull(getContext()));
// Checking the connection
final StringRequest request = new StringRequest("https://www.google.com/", new Response.Listener<String>() {
@Override
public void onResponse(String response) {
// instances for each required website
final HimalayanTimes himalayanTimes = new HimalayanTimes(getContext());
final GsmArena gsmArena = new GsmArena();
final CinemaBlend cinemaBlend = new CinemaBlend();
final KathmanduPost kathmanduPost = new KathmanduPost(getContext());
final GlobalNews globalNews = new GlobalNews();
final NepaliTimes nepaliTimes = new NepaliTimes(getContext());
final GoalNepal goalNepal = new GoalNepal(getContext());
final GadgetByte gadgetByte = new GadgetByte();
final TechLekh techLekh = new TechLekh();
final OnlineKhabar onlineKhabar = new OnlineKhabar();
final NepaliSansar nepaliSansar = new NepaliSansar();
final CricketingNepal cricketingNepal = new CricketingNepal();
// thread for each website
// thread fot thehimalayantimes
Thread thread = new Thread(new Runnable() {
@Override
public void run() {
try {
himalyannews = himalayanTimes.getNews();
news.addAll(himalyannews);
for(int i=0; i<4; i++){
finalHeadlines.add(himalyannews.get(i));
}
} catch (Exception ignored) {
}
}
});
thread.start();
// thread for gsmArena
Thread thread1 = new Thread(new Runnable() {
@Override
public void run() {
try {
gsmarenanews = gsmArena.getNews();
news.addAll(gsmarenanews);
for(int i=0; i<3; i++){
headlines.add(gsmarenanews.get(i));
}
} catch (Exception ignored) {
}
}
});
thread1.start();
// thread for cinemaBlend
Thread thread2 = new Thread(new Runnable() {
@RequiresApi(api = Build.VERSION_CODES.KITKAT)
@Override
public void run() {
try {
cinemablendnews = cinemaBlend.getNews();
news.addAll(cinemablendnews);
for(int i=0; i<4; i++){
headlines.add(cinemablendnews.get(i));
}
} catch (Exception ignored) {
}
}
});
thread2.start();
// thread for kathmanduPost
Thread thread3 = new Thread(new Runnable() {
@RequiresApi(api = Build.VERSION_CODES.KITKAT)
@Override
public void run() {
try {
kathmandupostnews = kathmanduPost.getNews();
news.addAll(kathmandupostnews);
for(int i=0; i<3; i++){
finalHeadlines.add(kathmandupostnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread3.start();
// thread for globalNews
Thread thread4 = new Thread(new Runnable() {
@Override
public void run() {
try {
globalnewsnews = globalNews.getNews();
news.addAll(globalnewsnews);
for(int i=0; i<5; i++){
finalHeadlines.add(globalnewsnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread4.start();
// thread for nepaliTimes
Thread thread5 = new Thread(new Runnable() {
@Override
public void run() {
try {
nepalitimesnews = nepaliTimes.getNews();
news.addAll(nepalitimesnews);
for(int i=0; i<3; i++){
finalHeadlines.add(nepalitimesnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread5.start();
// thread for GoalNepal
Thread thread6 = new Thread(new Runnable() {
@Override
public void run() {
try {
goalNepalNews = goalNepal.getNews();
news.addAll(goalNepalNews);
for (int i=0; i<4; i++){
headlines.add(goalNepalNews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread6.start();
// thread for GadgetByteNepal
Thread thread7 = new Thread(new Runnable() {
@Override
public void run() {
try {
gadgetbytenews = gadgetByte.getNews();
news.addAll(gadgetbytenews);
for (int i=0; i<3; i++){
headlines.add(gadgetbytenews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread7.start();
// thread for Techlekh
Thread thread8 = new Thread(new Runnable() {
@Override
public void run() {
try {
techlekhnews = techLekh.getNews();
news.addAll(techlekhnews);
for (int i=0; i<3; i++){
headlines.add(techlekhnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread8.start();
// thread for onlinekhabar
Thread thread9 = new Thread(new Runnable() {
@Override
public void run() {
try {
onlineKhabarnews = onlineKhabar.getNews();
news.addAll(onlineKhabarnews);
for (int i=0; i<4; i++){
finalHeadlines.add(onlineKhabarnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread9.start();
//thread for nepalisansar
Thread thread11 = new Thread(new Runnable() {
@Override
public void run() {
try {
nepalisansarnews = nepaliSansar.getNews();
news.addAll(nepalisansarnews);
for (int i=0; i<4; i++){
finalHeadlines.add(nepalisansarnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread11.start();
// thread for cricketingNepal
Thread thread12 = new Thread(new Runnable() {
@Override
public void run() {
try {
cricketnews = cricketingNepal.getNews();
news.addAll(cricketnews);
for (int i=0; i<4; i++){
headlines.add(cricketnews.get(i));
}
} catch (IOException ignored) {
}
}
});
thread12.start();
}
}, new Response.ErrorListener() {
@Override
public void onErrorResponse(VolleyError error) {
Toast.makeText(getContext(), "Internet Connection Error!", Toast.LENGTH_SHORT).show();
shimmerFrameLayout.setVisibility(View.GONE);
relativeLayout.setVisibility(View.VISIBLE);
tabLayout.setVisibility(View.GONE);
}
});
queue.add(request);
return null;
}
@Override
protected void onPostExecute(Void aVoid) {
tabLayout.setVisibility(View.VISIBLE);
shimmerFrameLayout.setVisibility(View.GONE);
for(NewsItem item : news){
if (item.tag.contains("kathmandu"))
nepal.add(item);
switch (item.tag) {
case "nepal":
nepal.add(item);
break;
case "world":
world.add(item);
break;
case "sports":
sports.add(item);
break;
case "tech":
tech.add(item);
break;
case "entertainment":
entertainment.add(item);
break;
}
}
// putting each news item to the main container
Collections.shuffle(headlines);
Collections.shuffle(finalHeadlines);
finalHeadlines.addAll(headlines);
Collections.shuffle(nepal);
Collections.shuffle(world);
Collections.shuffle(sports);
Collections.shuffle(tech);
Collections.shuffle(entertainment);
tab1 t1 = new tab1(finalHeadlines);
t1.setRetainInstance(true);
tab2 t2 = new tab2(nepal);
t2.setRetainInstance(true);
tab3 t3 = new tab3(world);
t3.setRetainInstance(true);
tab4 t4 = new tab4(sports);
t4.setRetainInstance(true);
tab5 t5 = new tab5(tech);
t5.setRetainInstance(true);
tab6 t6 = new tab6(entertainment);
t6.setRetainInstance(true);
shimmerFrameLayout.setVisibility(View.GONE);
assert getFragmentManager() != null;
pagerAdapter = new PageAdapter(finalHeadlines, nepal, world, sports, tech, entertainment, getFragmentManager(), tabLayout.getTabCount());
viewPager.setAdapter(pagerAdapter);
}
}
适配器的代码与上面完全相似。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.