首页 > 解决方案 > 来自 B3/BM&F Bovespa 的网页抓取

问题描述

我正在尝试从 BM&FBOVESPA 参考费率页面下载一些数据。

他们的网页是...

http://www.b3.com.br/en_us/market-data-and-indices/data-services/market-data/reports/derivatives-market/reference-prices/bm-fbovespa-reference-rates/

页面中嵌入的框架(iframe)地址是……

http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-ptBR.asp

这是我的代码,它给了我一个错误:Error in out[j + k, ] : subscript out of bounds

# Address of the page that embeds the reference-rate table
url <- "http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-ptBR.asp"

# Download and parse the page
site <- read_html(url)

# Convert every <table> node found in the page into a data frame;
# the result is a list with one entry per table
lista_tabela <- html_table(html_nodes(site, "table"), fill = TRUE)

# Keep the first table of that list as the working data frame
CurvaDI <- lista_tabela[[1]]

我无法解决这个错误。我只是想从他们的网站下载表格,并将其保存为数据框(df)。

另外,我还想用同一段代码下载多个日期(时间段)的数据。如果有人能提供帮助,我将不胜感激!

非常感谢!

标签: r, web-scraping, finance, yield

解决方案


该页面原始源代码中的 HTML 似乎被有意写成了格式错误的形式,因此在解析表格之前必须先修复它。下面的代码通过一系列正则表达式替换来得到可解析的表格:

library(rvest)
library(httr)
library(stringr)

# enUS variant of the reference rates page.
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-enUS.asp'

# Download the page as raw bytes and stop with an informative error on an
# HTTP failure status, so an error page is never handed to the table parser.
response <- GET(url)
stop_for_status(response)
html <- rawToChar(content(response, as = "raw"))

# Patch the markup before parsing it:
#   1. "</tr>", blank line, "</tr>"  ->  the second tag becomes "<tr>"
#   2. "<thead>" and "</thead>" tags are removed
#   3. "</tr>", blank line, "<th"    ->  replaced by "</tr><tr>"
html <- str_replace_all(html, '(</tr>\r\n\r\n</tr>)', '</tr>\r\n\r\n<tr>')
html <- str_replace_all(html, '(<thead>|</thead>)', '')
html <- str_replace_all(html, '(</tr>\r\n\r\n<th)', '</tr><tr>')

# Parse every table of the repaired document into a list of data frames.
data <- html_table(read_html(html), fill = TRUE)

# Fail with a clear message instead of "subscript out of bounds" when the
# page did not contain any parsable table.
if (length(data) == 0) {
    stop("No table found in the downloaded page.", call. = FALSE)
}

# Drop the first parsed row of the first table
# (presumably a repeated header row; verify against the page).
dataframe <- tail(data[[1]], -1)

print(dataframe)

输出结果如下:

    Calendar Days ID x fixed rate ID x fixed rate
2               1            1.90            0.00
3               7            1.90            1.55
4               8            1.90            1.70
5               9            1.90            1.81
6              13            1.91            1.67
7              14            1.91            1.75
8              21            1.91            1.81
9              23            1.91            1.89
10             24            1.91            1.93
11             28            1.91            1.75
12             30            1.91            1.82
13             34            1.92            1.77
14             41            1.93            1.82
15             43            1.94            1.87
16             52            1.95            1.93
.................................................

要提交表单数据,您可以使用特定选项和日期格式构建 POST 请求。以下将获取选项并提示用户选择一个然后获取数据:

library(rvest)
library(httr)
library(stringr)

# Date sent with the form submission further down in this script.
date <- as.Date("2020-10-07")

# enUS variant of the reference rates page.
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-enUS.asp'

# Download the page as raw bytes and stop with an informative error on an
# HTTP failure status, so an error page is never handed to the parser.
response <- GET(url)
stop_for_status(response)
html <- rawToChar(content(response, as = "raw"))

# Repair the page markup and extract its first table.
#
# html: character string holding the downloaded page source.
#
# Returns a list with two elements:
#   data - the first parsed table without its first row, or NULL when the
#          document contains no table
#   body - the parsed document, so callers can query other nodes of the page
getData <- function(html){
    # Patch the markup with three successive replacements; only the first
    # element of the replaced vector is carried forward.
    markup <- str_replace_all(html, '(</tr>\r\n\r\n</tr>)', '</tr>\r\n\r\n<tr>')[[1]]
    markup <- str_replace_all(markup, '(<thead>|</thead>)', '')
    markup <- str_replace_all(markup, '(</tr>\r\n\r\n<th)', '</tr><tr>')

    body <- read_html(markup)
    tables <- html_table(body, fill = TRUE)

    # No table at all: hand back only the parsed document.
    if (length(tables) == 0){
        return(list(data = NULL, body = body))
    }

    list(data = tail(tables[[1]], -1), body = body)
}

# Parse the page downloaded above and show its table.
res <- getData(html)
print(res[[1]])

# Collect every <option> node of the page as c(key = <value attribute>,
# value = <visible label with line breaks removed>).
options <- res[[2]] %>% html_nodes("option")
optionKeys <- html_attr(options, "value")
optionLabels <- str_replace_all(html_text(options), '\r\n', '')
optionList <- lapply(seq_along(optionKeys), function(i) {
    c(key = optionKeys[[i]], value = optionLabels[[i]])
})

# Without any option there is nothing to choose from.
if (length(optionList) == 0) {
    stop("No <option> entries found in the page.", call. = FALSE)
}

# Print the numbered menu.
for (i in seq_along(optionList)) {
    print(paste0("[", i, "] ", optionList[[i]]["value"]))
}

# Read one line from the process's standard input.
# NOTE(review): "stdin" is the C-level standard input, which suits Rscript;
# an interactive session may need readline() instead - confirm for your setup.
cat("Choose option by index : ")
selected <- readLines("stdin", n = 1)

# Reject empty, non-numeric and out-of-range input with a clear message
# rather than continuing with a NULL selection or a cryptic subscript error.
selectedIndex <- suppressWarnings(as.integer(selected))
if (length(selectedIndex) != 1 || is.na(selectedIndex) ||
    selectedIndex < 1 || selectedIndex > length(optionList)) {
    stop(
        "Invalid selection: expected an index between 1 and ",
        length(optionList), ".",
        call. = FALSE
    )
}
selectedOption <- optionList[[selectedIndex]]
print(paste("you selected :", selectedOption["value"], sep=" "))

# Fields sent both in the query string and in the form body.
formFields <- list(
    Data = format(date, format="%m/%d/%Y"),
    Data1 = format(date, format="%Y%m%d"),
    slcTaxa = selectedOption["key"]
)
postUrl <- modify_url(url, query = formFields)

# Submit the form and stop with an informative error on an HTTP failure
# status before decoding the response body.
response <- POST(
    postUrl,
    body = c(formFields, list(nomexls = "", lQtdTabelas = "", IDIOM = 2)),
    encode = "form"
)
stop_for_status(response)
html <- rawToChar(content(response, as = "raw"))

# Parse the page returned for the chosen option and show its table.
res <- getData(html)
print(res[[1]])

推荐阅读