delphi - 我在 Delphi 中从 IHTMLDocument2 获取超链接时遇到问题
问题描述
我在 Delphi 中从 IHTMLDocument2 获取超链接时遇到问题。例如,IHTMLDocument2 不是返回完整链接“ http://ena.ge/explanatory-online ”,而是返回“about:/explanatory-online”。用根 URL 简单地替换“about”并不适用于所有情况。
这是我正在使用的代码:
procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings; var MyInnerText,MyInnerHTML:widestring);
var
resp: TMemoryStream;
IdHTTP: TidHTTP;
v: Variant;
iDoc: IHTMLDocument2;
links: OleVariant;
MyHyperlink, aHref: string;
i: integer;
begin
resp := TMemoryStream.Create;
IdHTTP := TidHTTP.Create(nil);
iDoc := coHTMLDocument.Create as IHTMLDocument2;
try
IdHTTP.Get(MyURL, resp);
resp.Position := 0;
MyHTML.LoadFromStream(resp,TEncoding.UTF8);
finally
resp.Free;
IdHTTP.Free;
end;
v := VarArrayCreate([0, 0], VarVariant);
v[0] := MyHTML.text;
iDoc.write(PSafeArray(System.TVarData(v).VArray));
iDoc.designMode := 'off';
while iDoc.readyState <> 'complete' do
Application.ProcessMessages;
showmessage(idoc.url);
MyInnerText:=idoc.body.innerText;
MyInnerHTML:=idoc.body.innerHTML;
links := iDoc.all.tags('A');
if links.Length > 0 then
begin
for i := 0 to -1 + links.Length do
begin
aHref := links.Item(i).href;
MyHyperlinks.Add(aHref);
end;
end;
end;
解决方案
查看页面的来源,您会看到链接是什么样的,例如: href="/explanatory-online" 如果您下载 IdHttp 页面,则 IHTMLDocument2 没有原始页面地址。您可以使用 TWebBrowser 或手动替换字符串或使用 IHTMLDocument4。
示例 1(TWebBrowser):
procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
var MyInnerText,MyInnerHTML:widestring);
var
Flags: System.OleVariant;
iDoc: IHTMLDocument2;
links: OleVariant;
MyHyperlink, aHref: string;
i: integer;
begin
Flags := Flags or navNoReadFromCache or navNoWriteToCache;
Form1.WebBrowser1.Silent := True;
Form1.WebBrowser1.Navigate(MyURL, Flags);
while Form1.WebBrowser1.ReadyState <> READYSTATE_COMPLETE do
Application.ProcessMessages;
iDoc := Form1.WebBrowser1.Document as IHTMLDocument2;
//showmessage(idoc.url);
MyInnerText:=idoc.body.innerText;
MyInnerHTML:=idoc.body.innerHTML;
links := iDoc.all.tags('A');
if links.Length > 0 then
begin
for i := 0 to -1 + links.Length do
begin
aHref := links.Item(i).href;
MyHyperlinks.Add(aHref);
end;
end;
end;
示例 2(替换字符串):
function GetDomain(URL: String): String;
var
Pos1, Pos2: Integer;
begin
Result := '';
URL := Trim(URL);
Pos1 := LastDelimiter('/', URL);
Pos2 := Pos('/', URL, Pos1 + 1);
if (Pos2 = 0) then
Result := URL + '/'
else if (Pos1 > 0) then
Result := Copy(Url, 1, Pos1);
end;
procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
var MyInnerText, MyInnerHTML: WideString);
var
resp: TMemoryStream;
IdHTTP: TidHTTP;
v: Variant;
iDoc: IHTMLDocument2;
links: OleVariant;
MyHyperlink, aHref, Domain: string;
I, J: Integer;
begin
resp := TMemoryStream.Create;
IdHTTP := TidHTTP.Create(nil);
iDoc := coHTMLDocument.Create as IHTMLDocument2;
try
IdHTTP.Get(MyURL, resp);
resp.Position := 0;
MyHTML.LoadFromStream(resp,TEncoding.UTF8);
finally
resp.Free;
IdHTTP.Free;
end;
v := VarArrayCreate([0, 0], VarVariant);
v[0] := MyHTML.text;
iDoc.write(PSafeArray(System.TVarData(v).VArray));
iDoc.designMode := 'off';
while iDoc.readyState <> 'complete' do
Application.ProcessMessages;
//showmessage(idoc.url);
MyInnerText:=idoc.body.innerText;
MyInnerHTML:=idoc.body.innerHTML;
links := iDoc.all.tags('A');
Domain := GetDomain(MyURL);
if links.Length > 0 then
begin
for i := 0 to -1 + links.Length do
begin
aHref := links.Item(i).href;
if (Copy(aHref, 1, 6) = 'about:') and (Length(Domain) > 0) then
begin
J := Pos('/', aHref);
if (J > 0) then
begin
Delete(aHref, 1, J);
aHref := Domain + aHref;
end;
end;
MyHyperlinks.Add(aHref);
end;
end;
end;
示例 3(IHTMLDocument4):
function process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
var MyInnerText,MyInnerHTML:widestring): Integer;
const
RS_COMPLETE = 'complete';
WaitMs1 = 3000;
WaitMs2 = 8000;
var
IDoc : IHTMLDocument2;
IDoc4: IHTMLDocument4;
Links: OleVariant;
AHref: String;
I : Integer;
Ms : Int64;
begin
Result := 1;
try
iDoc := coHTMLDocument.Create as IHTMLDocument2;
if (iDoc = nil) then
Exit(2);
Result := 3;
iDoc.Set_designMode('off');
Ms := GetTickCount64;
while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs1) do
begin
Sleep(10);
Application.ProcessMessages;
end;
if not (iDoc.ReadyState = RS_COMPLETE) then
Exit(4);
Result := 5;
iDoc4 := iDoc as IHTMLDocument4;
iDoc := iDoc4.CreateDocumentFromUrl(MyUrl, 'null');
Ms := GetTickCount64;
while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs2) do
begin
Sleep(20);
Application.ProcessMessages;
end;
if not (iDoc.ReadyState = RS_COMPLETE) then
Exit(6);
Result := 7;
MyInnerText := iDoc.Body.InnerText;
MyInnerHTML := iDoc.Body.InnerHTML;
Links := iDoc.All.Tags('A');
for I := 0 to Links.Length - 1 do
begin
aHref := links.Item(i).href;
MyHyperlinks.Add(aHref);
end;
Result := 0;
except
on E : Exception do
begin //ShowMessage('Exception: ' + E.ClassName + ',' + E.Message);
Result := 8;
end;
end;
end;
推荐阅读
- javascript - 用于从剪贴板内容制作 URL 的书签
- web-applications - Azure DevOps-Linux webapp 不显示
- javascript - 控制台对象中的换行符
- python - seaborn 更改 clustermap 可视化选项而不重做聚类
- hibernate - Kotlin Hibernate JPA Lazy fetch 无法通过控制器工作
- javascript - 与 TabBarIOS 一起使用时无法将组件正确传递给 NavigatorIOS
- java - Spring批处理JobRepository位置和缩放
- javascript - 在javascript / jquery中将2个数组与对象连接起来
- google-cloud-ml - Google ML Engine 是否支持贝叶斯超参数调整算法?
- html - 具有预先确定宽度的 Div 内具有背景颜色的 Div