How can I get the a[href] links of the selected value from crawler(): listModel
and use the link in openie(): Jsoup.connect()
?
Here is my code:
private void btnExtractActionPerformed(java.awt.event.ActionEvent evt) {
try {
openie();
} catch (IOException ex) {
Logger.getLogger(MainUI.class.getName()).log(Level.SEVERE, null, ex);
}
}
private void btnSearchActionPerformed(java.awt.event.ActionEvent evt) {
try {
crawler();
} catch (IOException ex) {
Logger.getLogger(MainUI.class.getName()).log(Level.SEVERE, null, ex);
}
}
private String subject, object, link;
private void crawler() throws IOException {
//replace space with "+"
String input = txtSearch.getText().replace(" ", "+");
int count = 0;
DefaultListModel<String> listModel = new DefaultListModel<>();
while (count <= 20) {
String url = "https://www.google.com/search?q=" + input + "&tbm=nws&source=lnm&start=" + count;
Document doc = Jsoup.connect(url).userAgent("Mozilla").timeout(10000).get();
Elements e = doc.select("div.g a[href]");
for (Element e1 : e) {
listModel.addElement(e1.text());
link = e1.absUrl("href");
}
count += 10;
}
newsList.setModel(listModel);
}
private void openie() throws IOException {
Properties props = new Properties();
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
List<String> list = newsList.getSelectedValuesList();
Document d = Jsoup.connect(link).userAgent("Mozilla").timeout(10000).get();
DefaultListModel<String> extractedList = new DefaultListModel<>();
for (String selected : list) {
extractedList.addElement(selected);
selected = d.body().text();
Annotation doc = new Annotation(selected);
pipeline.annotate(doc);
for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
// Get the OpenIE triples for the sentence
Collection<RelationTriple> triples
= sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
// Print the triples
for (RelationTriple triple : triples) {
subject = triple.subjectLemmaGloss().replace(" ", "_").toLowerCase();
object = triple.objectLemmaGloss().replace(" ", "_").toLowerCase();
extractedList.addElement(triple.confidence + "\t"
+ triple.subjectLemmaGloss() + "\t"
+ triple.relationLemmaGloss() + "\t"
+ triple.objectLemmaGloss());
}
}
}
tuplesList.setModel(extractedList);
}
With my code, I parse through an URL and able to get a [href]. But is the last element's a[href] of all elements that were traversed through. I am unable to store a[href] of each element that is retrieved.
You need List variable to store url like as below.
Elements links = doc.select("div.g a[href]");
List<String> urls = links.stream().map(i -> i.absUrl("href")).collect(Collectors.toList());
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.