简体   繁体   中英

I have a problem getting hyperlinks from IHTMLDocument2 in Delphi

I have a problem getting hyperlinks from IHTMLDocument2 in Delphi. For instance, instead of returning the full link " http://ena.ge/explanatory-online ", IHTMLDocument2 returns "about:/explanatory-online". The simple substitution of "about" with root URL is not working for all cases.

Here is the code I am using:

procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings; var MyInnerText,MyInnerHTML:widestring);
var
  resp: TMemoryStream;
  IdHTTP: TidHTTP;
  v: Variant;
  iDoc: IHTMLDocument2;
  links: OleVariant;
  MyHyperlink, aHref: string;
  i: integer;

begin
  resp := TMemoryStream.Create;
  IdHTTP := TidHTTP.Create(nil);
  iDoc := coHTMLDocument.Create as IHTMLDocument2;

  try
    IdHTTP.Get(MyURL, resp);

    resp.Position := 0;
    MyHTML.LoadFromStream(resp,TEncoding.UTF8);

  finally
    resp.Free;
    IdHTTP.Free;
  end;

  v := VarArrayCreate([0, 0], VarVariant);
  v[0] := MyHTML.text;
  iDoc.write(PSafeArray(System.TVarData(v).VArray));
  iDoc.designMode := 'off';

  while iDoc.readyState <> 'complete' do
    Application.ProcessMessages;

  showmessage(idoc.url);
  MyInnerText:=idoc.body.innerText;
  MyInnerHTML:=idoc.body.innerHTML;
  links := iDoc.all.tags('A');
  if links.Length > 0 then
  begin
    for i := 0 to -1 + links.Length do
    begin
      aHref := links.Item(i).href;

      MyHyperlinks.Add(aHref);
    end;
  end;

end;


Look at the source of the page and you will see what the links look like, for example: href="/explanatory-online" If you download the IdHttp page, IHTMLDocument2 does not have the original page address. You can use TWebBrowser or manually replace string or use IHTMLDocument4.

Example 1 (TWebBrowser):

procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
    var MyInnerText,MyInnerHTML:widestring);
var
  Flags: System.OleVariant;
  iDoc: IHTMLDocument2;
  links: OleVariant;
  MyHyperlink, aHref: string;
  i: integer;
begin
  Flags := Flags or navNoReadFromCache or navNoWriteToCache;
  Form1.WebBrowser1.Silent := True;
  Form1.WebBrowser1.Navigate(MyURL, Flags);
  while Form1.WebBrowser1.ReadyState <> READYSTATE_COMPLETE do
    Application.ProcessMessages;
  iDoc := Form1.WebBrowser1.Document as IHTMLDocument2;
  //showmessage(idoc.url);
  MyInnerText:=idoc.body.innerText;
  MyInnerHTML:=idoc.body.innerHTML;
  links := iDoc.all.tags('A');
  if links.Length > 0 then
  begin
    for i := 0 to -1 + links.Length do
    begin
      aHref := links.Item(i).href;
      MyHyperlinks.Add(aHref);
    end;
  end;
end;

Example 2 (replace string):

function GetDomain(URL: String): String;
var
  Pos1, Pos2: Integer;
begin
  Result := '';
  URL := Trim(URL);
  Pos1 := LastDelimiter('/', URL);
  Pos2 := Pos('/', URL, Pos1 + 1);
  if (Pos2 = 0) then
    Result := URL + '/'
  else if (Pos1 > 0) then
    Result := Copy(Url, 1, Pos1);
end;

procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
  var MyInnerText, MyInnerHTML: WideString);
var
  resp: TMemoryStream;
  IdHTTP: TidHTTP;
  v: Variant;
  iDoc: IHTMLDocument2;
  links: OleVariant;
  MyHyperlink, aHref, Domain: string;
  I, J: Integer;
begin
  resp := TMemoryStream.Create;
  IdHTTP := TidHTTP.Create(nil);
  iDoc := coHTMLDocument.Create as IHTMLDocument2;
  try
    IdHTTP.Get(MyURL, resp);
    resp.Position := 0;
    MyHTML.LoadFromStream(resp,TEncoding.UTF8);
  finally
    resp.Free;
    IdHTTP.Free;
  end;
  v := VarArrayCreate([0, 0], VarVariant);
  v[0] := MyHTML.text;
  iDoc.write(PSafeArray(System.TVarData(v).VArray));
  iDoc.designMode := 'off';
  while iDoc.readyState <> 'complete' do
    Application.ProcessMessages;
  //showmessage(idoc.url);
  MyInnerText:=idoc.body.innerText;
  MyInnerHTML:=idoc.body.innerHTML;
  links := iDoc.all.tags('A');
  Domain := GetDomain(MyURL);
  if links.Length > 0 then
  begin
    for i := 0 to -1 + links.Length do
    begin
      aHref := links.Item(i).href;
      if (Copy(aHref, 1, 6) = 'about:') and (Length(Domain) > 0) then
      begin
        J := Pos('/', aHref);
        if (J > 0) then
        begin
          Delete(aHref, 1, J);
          aHref := Domain + aHref;
        end;
      end;
      MyHyperlinks.Add(aHref);
    end;
  end;
end;

Example 3 (IHTMLDocument4):

function process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
    var MyInnerText,MyInnerHTML:widestring): Integer;
const
  RS_COMPLETE = 'complete';
  WaitMs1     = 3000;
  WaitMs2     = 8000;
var
  IDoc : IHTMLDocument2;
  IDoc4: IHTMLDocument4;
  Links: OleVariant;
  AHref: String;
  I    : Integer;
  Ms   : Int64;
begin
  Result := 1;
  try
    iDoc := coHTMLDocument.Create as IHTMLDocument2;
    if (iDoc = nil) then
      Exit(2);
    Result := 3;
    iDoc.Set_designMode('off');
    Ms := GetTickCount64;
    while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs1) do
    begin
      Sleep(10);
      Application.ProcessMessages;
    end;
    if not (iDoc.ReadyState = RS_COMPLETE) then
      Exit(4);
    Result := 5;
    iDoc4 := iDoc as IHTMLDocument4;
    iDoc := iDoc4.CreateDocumentFromUrl(MyUrl, 'null');
    Ms := GetTickCount64;
    while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs2) do
    begin
      Sleep(20);
      Application.ProcessMessages;
    end;
    if not (iDoc.ReadyState = RS_COMPLETE) then
      Exit(6);
    Result := 7;
    MyInnerText := iDoc.Body.InnerText;
    MyInnerHTML := iDoc.Body.InnerHTML;
    Links := iDoc.All.Tags('A');
    for I := 0 to Links.Length - 1 do
    begin
      aHref := links.Item(i).href;
      MyHyperlinks.Add(aHref);
    end;
    Result := 0;
  except
     on E : Exception do
     begin //ShowMessage('Exception: ' + E.ClassName + ',' + E.Message);
       Result := 8;
     end;
  end;
end;

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM