在VBA中进行网页刮擦表

我已经成功地导航到了包含我希望提取数据的表的页面。让我先声明一下,这是我第一次尝试这样的事情,我对自己能走到这一步感到非常高兴。我导航到我想要的网页,更新了用户名、密码,然后导航到包含我感兴趣的表的页面。

现在我试图从表中提取数据,我得到了以下错误,见下文。

enter image description here

我的代码如下:

'==============================================================
'
Public IE                   As New SHDocVw.InternetExplorer
'==============================================================
'                           HTML DOCUMENT
'
Public HTMLDoc              As MSHTML.HTMLDocument
'==============================================================
'                           BUTTON COLLECTION
'
Public HTMLButtons          As MSHTML.IHTMLElementCollection
Public HTMLButton           As MSHTML.IHTMLElement
'==============================================================
'                           ATTRIBUTE COLLECTION
'
Public HTMLAs               As MSHTML.IHTMLElementCollection3
Public HTMLA                As MSHTML.IHTMLElement3
'==============================================================
'                           TABLE COLLECTION
'
Public HTMLTables           As MSHTML.IHTMLElementCollection
Public HTMLTable            As MSHTML.IHTMLElement
'==============================================================
'                           TABLE ELEMENTS
'
Public TableBody            As MSHTML.IHTMLElementCollection2
Public TableRows            As MSHTML.IHTMLElementCollection3
Public TableCell            As MSHTML.IHTMLElementCollection4
'==============================================================
Public RowNum               As Long
Public ColNum               As Long
'==============================================================
'
'
Public Sub TableCollection()

Worksheets.Add
RowNum = 1
Set TableBody = HTMLDoc.getElementsByTagName("tbody")
Set TableRows = HTMLDoc.getElementsByTagName("tr")
Set TableCell = HTMLDoc.getElementsByTagName("td")
For Each TableRows In TableBody
    ColNum = 1
    For Each TableCell In TableRows
        Cells(RowNum, ColNum).Value = TableCell.innerText
        ColNum = ColNum + 1
    Next TableCell
RowNum = RowNum + 1
Next TableRows
End Sub

下面是一个表头和一个元素,我正试图刮取.我已经替换了URL的WEBADDRESS。

<html><head>
  <title>
    Transaction SpreadSheet for the Current Month to date - April 2020</title>
</head>
<body>
<style>
  td { font-family:arial,verdana,sans-serif;font-size:12px;color:#000000;line-height:16px;}
</style>
<table cellpadding="2">
  <tbody>
  <tr>
    <td>
      <b>Date</b>
    </td>
    <td>
      <b>Reference</b>
    </td>
        <td>
      <b>Item</b>
    </td>
    <td>
      <b>Particulars</b>
    </td>
    <td>
      <b>Buyer</b>
    </td>
        <td>
      <b>Order Id</b>
    </td>
    <td>
      <b>Note</b>
    </td>
    <td>
      <b>Transaction Amount</b>
    </td>
   </tr>
<tr>
  <td>
    04&nbsp;Apr&nbsp;2020</td>
    <td>
    239137532</td>  
  <td>
    <a href="https://WEBADDRESS">461619577</a></td>
  <td>
    Success Fee</td>
  <td>
  <a title="User profile for Joe" href="WEBADDRESS">RoySch2510</a>
  </td>
    <td>
  <a href="https://WEBADDRESS" rel="nofollow,noindex">17314294</a>
  </td>
  <td>
    </td>   
  <td>
    -62.55</td>
  </tr>
<tr>

请教我做错了什么

好了,这是我所有的代码,希望能给大家更多的启示。

Option Explicit

Public Sub GetHTMLDocument()
'===========================================================================
'                         ESTABLISH PUBLIC VARIABLES
'
Call PublicHTMLVariables
'===========================================================================
'                              NAVIGATE TO IE
'
Call NavigateToIE("https://old.bidorbuy.co.za/jsp/login/UserLogin.jsp")
'===========================================================================
'                                   LOGIN
'
Call LoginToWebsite("JoeCam9517", "********")
'===========================================================================
'                           NAVIGATE TO 1st PAGE
'
Call NavigateToFirstPage
'===========================================================================
'                      NAVIGATE TO ACCOUNT HISTORY PAGE
'
Call NavigateToAccountsPage
'===========================================================================
'               CHANGE THE DATE RANGE FOR TRANSACTION SELECTION
'
'Call ChangeDateRange
'===========================================================================
'                      NAVIGATE TO ACCOUNT TABLE PAGE
'
Call NavigateToTablesPage
'===========================================================================
'                     COLLECT TABLE ELEMENTS TO WORKSHEET
'
Call TableCollection
'===========================================================================

MsgBox "Pause"
'                       MORE CODE STILL TO BE DEVELOPED

End Sub

PUBLIC VARIABLES

Option Explicit
'==============================================================
'
Public IE                   As New SHDocVw.InternetExplorer
'==============================================================
'                           HTML DOCUMENT
'
Public HTMLDoc              As MSHTML.HTMLDocument
'==============================================================
'                           HTML ELEMENTS
'
Public HTMLInput            As MSHTML.IHTMLElement
Public FromDay              As MSHTML.IHTMLElement
Public FromYearMonth        As MSHTML.IHTMLElement
Public ToDay                As MSHTML.IHTMLElement
'==============================================================
'                           BUTTON COLLECTION
'
Public HTMLButtons          As MSHTML.IHTMLElementCollection
Public HTMLButton           As MSHTML.IHTMLElement
'==============================================================
'                           ATTRIBUTE COLLECTION
'
Public HTMLAs               As MSHTML.IHTMLElementCollection3
Public HTMLA                As MSHTML.IHTMLElement3
'==============================================================
'                           TABLE COLLECTION
'
Public HTMLTable            As MSHTML.IHTMLElement
Public HTMLTableRows        As MSHTML.IHTMLElementCollection
Public HTMLTableCells       As MSHTML.IHTMLElementCollection
'==============================================================
'                           DATE ELEMENTS
'
Public ToYearMonth          As MSHTML.IHTMLElement
'==============================================================
'                           TABLE ELEMENTS
'
'Public TableBody            As MSHTML.IHTMLElementCollection2
'Public TableRows            As MSHTML.IHTMLElementCollection3
'Public TableCell            As MSHTML.IHTMLElementCollection4
'==============================================================
Public H                    As Integer
Public RowNum               As Long
Public ColNum               As Long
'==============================================================

Public Sub PublicHTMLVariables()

End Sub

导航到网页

Option Explicit

Public Sub NavigateToIE(Destination As String)
IE.Visible = True
IE.Navigate Destination
Do Until IE.ReadyState = 4
    DoEvents
Loop
End Sub

准备登录

Option Explicit
Public Sub LoginToWebsite(UserID As String, PassWord As String)
Set HTMLDoc = IE.Document
Set HTMLInput = HTMLDoc.getElementById("username")
    HTMLInput.Value = UserID
Set HTMLInput = HTMLDoc.getElementById("password")
    HTMLInput.Value = PassWord
End Sub

导航到第一页

Option Explicit
'===========================================================================
'
'
Public Sub NavigateToFirstPage()
Set HTMLButtons = HTMLdoc.getElementsByTagName("button")
HTMLButtons(3).Click
Do While IE.ReadyState = 4: DoEvents: Loop
Do Until IE.ReadyState = 4: DoEvents: Loop
End Sub

导航到账户历史页面

Option Explicit

'===========================================================================
'                      NAVIGATE TO ACCOUNT HISTORY PAGE
'
Public Sub NavigateToAccountsPage()
H = 0
Set HTMLAs = HTMLdoc.getElementsByTagName("a")
For Each HTMLA In HTMLAs
    If HTMLA.href = "https://old.bidorbuy.co.za/jsp/fee/UserAccount.jsp" Then
        GoTo ButtonFound
    End If
    H = H + 1
Next HTMLA
ButtonFound:
HTMLAs(H).Click
Do While IE.ReadyState = 4: DoEvents: Loop
Do Until IE.ReadyState = 4: DoEvents: Loop
End Sub

更改日期范围–不工作–我将在以后的日期寻求帮助

导航到表格页面

Option Explicit
 

'=========================================================================
'
'                      NAVIGATE TO ACCOUNT TABLE PAGE
'
Public Sub NavigateToTablesPage()
Set HTMLButtons = HTMLdoc.getElementsByName("DetailSubmit")
HTMLButtons(1).Click
End Sub

这就涉及到了我有问题的程序。

Option Explicit
'===========================================================================
'
'
Public Sub TableCollection()
Worksheets.Add

Dim HTMLdoc         As New HTMLDocument
Dim trow            As Object
Dim tcel            As Object
Dim rowNum          As Long
Dim colNum          As Long

rowNum = 1

For Each trow In HTMLdoc.getElementsByTagName("tbody")(0).getElementsByTagName("tr")
    colNum = 1
    For Each tcel In trow.getElementsByTagName("td")
        Cells(rowNum, colNum).Value = tcel.innerText
        colNum = colNum + 1
    Next tcel
    rowNum = rowNum + 1
Next trow
End Sub

'Set HTMLTable = HTMLDoc.getElementsByTagName("body")
'Set HTMLTableRows = HTMLdoc.getElementsByTagName("tr")
'Set HTMLTableCells = HTMLdoc.getElementsByTagName("td")
'For Each HTMLTableCells In HTMLTableRows
'Debug.Print HTMLTableRows.innerText
'Next HTMLTableCells
'    ColNum = 1
'    For Each TableCell In TableRows
'        Cells(RowNum, ColNum).Value = TableCell.innerText
'        ColNum = ColNum + 1
'    Next TableCell
'RowNum = RowNum + 1
'Next TableRows

我知道有很多别人的代码要看,但是我在写代码的时候,也会考虑到别人可能要编辑它。另外,我很抱歉我没有遵循正常的惯例,但是当我看到一个变量以小写字母开始,然后在一半的时候你得到了一个大写字母,这让我很不舒服,这看起来很不优雅,抱歉:-)

我开始怀疑问题出在表的构造方式上,有可能吗?

在此,我想对所有试图解决我问题的人说声谢谢,但我还是停留在同样的结果上。使用上面的代码,我通到了这个表。HTMLTable然后我得到了这个错误。HTMLError

正如你从注释出来的代码中看到的,我已经尝试了几种不同的编码选项,但我一直得到一个错误。

解决方案:

我已经写了一些函数来读取任何HTML表格。试着使用它。HTMLTab作为函数的参数,当然必须是HTMLTable IHTMLTable对象。)

Function ReadTable(HTMLTab) As Variant
Dim myTable() As Variant

  rLen = HTMLTab.Rows.Length
  CLen = HTMLTab.Cells.Length / rLen
  ReDim myTable(0 To rLen - 1, 0 To CLen - 1)

  For Each myRow In HTMLTab.Rows
    j = 0
    For Each myCell In myRow.Cells
      myTable(i, j) = myCell.outerText
      j = j + 1
    Next myCell
    i = i + 1
  Next myRow

  ReadTable = myTable

End Function

给TA打赏
共{{data.count}}人
人已打赏
未分类

Xamarin.iOS中的iOS选择器

2022-9-9 1:28:32

未分类

获得LOCATIONS_ERROR_SERVICE_NOT_AVAILABLE for gps after about an hour in tizen wearable application.

2022-9-9 1:28:34

0 条回复 A文章作者 M管理员
    暂无讨论,说说你的看法吧
个人中心
购物车
优惠劵
今日签到
有新私信 私信列表
搜索