首页 > 解决方案 > 如何使用 Visual Basic 从在线网站提取数据并将数据放入文本框或通过附加到 MSSQL 数据库的网格视图?

问题描述

<%@ Page Title="" Language="vb" AutoEventWireup="false" MasterPageFile="~/InjuryTracker.Master" CodeBehind="WebCrawler.aspx.vb" %>

<%-- Content page for the injury tracker: two output text boxes and one button. --%>
<asp:Content ID="Content1" ContentPlaceHolderID="head" runat="server">
    </asp:Content>
<asp:Content ID="Content2" ContentPlaceHolderID="ContentPlaceHolder1" runat="server">
    <br />
<div align="center">
</div>
<br />
<h2>Players' Injuries</h2>
<br />
<br />
<div align="center">
    <%-- MultiLine renders a <textarea>; the default single-line <input> cannot show multi-line text. --%>
    <asp:TextBox ID="TextBox1" runat="server" TextMode="MultiLine" Height="264px" Width="664px"></asp:TextBox>
</div>
<br />
<br />
<div align="center">
    <asp:Button ID="Button1" runat="server" Text="Button" />
</div> 
<br />
<br />
<br />
<div align="center">
    <%-- MultiLine for the same reason as TextBox1. --%>
    <asp:TextBox ID="TextBox2" runat="server" TextMode="MultiLine" Height="264px" Width="664px"></asp:TextBox>
</div>
    <br />
    <br />
    <br />
    <br />
    <br />
</asp:Content>


<WebCrawler.aspx.vb>

Imports System.Text
Imports System.Net
Imports System.IO
Imports System.Text.RegularExpressions

''' <summary>
''' Code-behind for WebCrawler.aspx. On a button click it downloads a fixed web page,
''' puts the raw HTML into TextBox1 and a markup-stripped version into TextBox2.
''' </summary>
Public Class WebCrawler
    Inherits System.Web.UI.Page

    ' Address of the page that Scrape() downloads.
    Private Const SourceUrl As String = "https://www.cbssports.com/nba/injuries/"

    Protected Sub Page_Load(ByVal sender As Object, ByVal e As System.EventArgs) Handles MyBase.Load

    End Sub

    ''' <summary>
    ''' Downloads <see cref="SourceUrl"/> and fills the two text boxes: TextBox1 gets the
    ''' raw response body, TextBox2 gets the body after <see cref="StripHtml"/>.
    ''' Failures are reported in TextBox1 instead of propagating, so the page still renders.
    ''' </summary>
    Private Sub Scrape()

        Try
            Dim rawHtml As String = DownloadHtml(SourceUrl)

            TextBox1.Text = rawHtml
            TextBox2.Text = StripHtml(rawHtml) 'write formatted output to separate text box

        Catch ex As Exception
            ' Console output is not visible from a web page, so surface the failure to the
            ' user and record the full exception for diagnostics.
            System.Diagnostics.Trace.TraceError(ex.ToString())
            TextBox1.Text = "Error: " & ex.Message
            TextBox2.Text = String.Empty
        End Try

    End Sub

    ''' <summary>
    ''' Performs a blocking GET of <paramref name="url"/> and returns the response body as text.
    ''' </summary>
    ''' <param name="url">Absolute URL to request.</param>
    ''' <returns>The complete response body read through a <see cref="StreamReader"/>.</returns>
    Private Shared Function DownloadHtml(ByVal url As String) As String
        Dim request As WebRequest = WebRequest.Create(url)

        ' Dispose the response as well as the reader; an undisposed WebResponse keeps
        ' its connection open.
        Using response As WebResponse = request.GetResponse()
            Using reader As New StreamReader(response.GetResponseStream())
                Return reader.ReadToEnd()
            End Using
        End Using
    End Function

    ''' <summary>
    ''' Removes script blocks, style blocks, comments, declarations and tags from
    ''' <paramref name="html"/> using regular expressions.
    ''' </summary>
    ''' <param name="html">HTML text; Nothing or empty yields an empty string.</param>
    ''' <returns>The text that remains once the markup is removed.</returns>
    Private Shared Function StripHtml(ByVal html As String) As String
        If String.IsNullOrEmpty(html) Then
            Return String.Empty
        End If

        Dim blockOptions As RegexOptions = RegexOptions.Singleline Or RegexOptions.IgnoreCase
        Dim result As String = html

        ' Script and style elements go first, together with their bodies; removing bare
        ' tags first would leave the JavaScript/CSS text behind.
        result = Regex.Replace(result, "<script.*?</script>", "", blockOptions)
        result = Regex.Replace(result, "<style.*?</style>", "", blockOptions)

        ' Comments before declarations: the declaration pattern stops at the first ">"
        ' and would otherwise cut a comment short if it contains one.
        result = Regex.Replace(result, "<!--.*?-->", "", RegexOptions.Singleline)

        ' Remove doctype and other "<!...>" declarations.
        result = Regex.Replace(result, "<![^>]*>", "")

        ' Remove the remaining opening and closing tags.
        result = Regex.Replace(result, "</?[a-z][a-z0-9]*[^<>]*>", "", RegexOptions.IgnoreCase)

        Return result
    End Function

    ''' <summary>
    ''' Click handler for Button1, the only button declared in WebCrawler.aspx.
    ''' </summary>
    Private Sub BtnExtract_Click(sender As Object, e As EventArgs) Handles Button1.Click

        Scrape() 'Scrape text from URL

    End Sub

End Class

标签: data-extraction

解决方案


推荐阅读