首页 > 解决方案 > 为什么在R中创建数据框时数据重复?

问题描述

我是 R 新手,现在正在尝试使用 R 进行网络抓取。我已经抓取到了需要的所有信息,并希望将它们组合成一个数据框。但是当我把这些信息汇总到一起时,我发现数据框中的每一项信息都重复了。这是我的代码和输出:

library(xml2)
library(rvest)
library(stringr)
# Question's reprex: scrape one Amazon product page and combine the extracted
# fields into a data frame. Lines starting with `#>` are the printed output.
url<-'https://www.amazon.in/OnePlus-Midnight-Black-256GB-Storage/dp/B077PWBC6V/ref=dp_prsubs_1?pd_rd_i=B077PWBC6V&psc=1'
webpage<- read_html(url)
title_html<- html_nodes(webpage,'h1#title')
title<-html_text(title_html)
head(title)
#> [1] "\n\n\n\n\n\n\n\n\nOnePlus 6 (Midnight Black, 8GB RAM, 256GB Storage)\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
# NOTE: the cleaned string is only printed here, never assigned back to
# `title`, so `title` still holds the raw text with its newlines (visible in
# the str() output at the end).
str_replace_all(title,"[\r\n]","")
#> [1] "OnePlus 6 (Midnight Black, 8GB RAM, 256GB Storage)"
price_html<-html_nodes(webpage,'span#priceblock_ourprice')
price<-html_text(price_html)
# NOTE: same as for `title` - the result is printed but not stored in `price`.
str_replace_all(price,"[\r\n]","")
#> [1] "\u20b9 43,999.00"
desc_html <- html_nodes(webpage, 'div#productDescription')
desc <- html_text(desc_html)
# Here the cleaned value IS assigned back, so `desc` ends up without newlines/tabs.
desc<-str_replace_all(desc,"[\r\n\t]","")
desc<-str_trim(desc)
head(desc)
#> [1] "Size name:256GB | Colour:blackThe OnePlus 6 comes with a 19:9 Full Optic AMOLED display, 20+16 MP dual primary camera, 6/8 GB of RAM; up to 256 GB memory, Snapdragon 845 processor and much more"
rate_html<-html_nodes(webpage,'span#acrPopover')
rate<-html_text(rate_html)
rate<- str_replace_all(rate,"[\r\n]","")
rate<- str_trim(rate)
head(rate)
# NOTE: the selector matched twice, so `rate` is a length-2 vector.
#> [1] "4.6 out of 5 stars" "4.6 out of 5 stars"
# Size and colour are read from the `span.selection` nested inside each
# variation <div>.
size_html<- html_nodes(webpage,'div#variation_size_name')
size_html<-html_nodes(size_html, 'span.selection')
size<- html_text(size_html)
size<-str_trim(size)
head(size)
#> [1] "256GB"
color_html <- html_nodes(webpage, 'div#variation_color_name')
color_html <- html_nodes(color_html, 'span.selection')
color <- html_text(color_html)
color <- str_trim(color)
head(color)
#> [1] "black"
# `rate` has length 2 while every other vector has length 1; data.frame()
# recycles the length-1 columns to match, which produces two identical rows.
product_data <- data.frame(Title = title, Price = price,Description = desc, Rating = rate, Size = size, Color = color)
str(product_data)
#> 'data.frame':    2 obs. of  6 variables:
#>  $ Title      : chr  "\n\n\n\n\n\n\n\n\nOnePlus 6 (Midnight Black, 8GB RAM, 256GB Storage)\n\n\n\n\n\n\n\n\n\n\n\n\n\n" "\n\n\n\n\n\n\n\n\nOnePlus 6 (Midnight Black, 8GB RAM, 256GB Storage)\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
#>  $ Price      : chr  "<U+20B9> 43,999.00" "<U+20B9> 43,999.00"
#>  $ Description: chr  "Size name:256GB | Colour:blackThe OnePlus 6 comes with a 19:9 Full Optic AMOLED display, 20+16 MP dual primary "| __truncated__ "Size name:256GB | Colour:blackThe OnePlus 6 comes with a 19:9 Full Optic AMOLED display, 20+16 MP dual primary "| __truncated__
#>  $ Rating     : chr  "4.6 out of 5 stars" "4.6 out of 5 stars"
#>  $ Size       : chr  "256GB" "256GB"
#>  $ Color      : chr  "black" "black"

谁能告诉我问题出在哪里?另外,为什么我明明已经删除了 \n,标题里却仍然有 \n\n?很抱歉在这里贴出了全部代码,因为我不确定错误出在哪里......提前谢谢大家!

标签: r dataframe

解决方案


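出现重复的原因是评分(rate)信息有重复的条目:该选择器匹配到了两个节点,所以 rate 的长度为 2,而 data.frame() 会把其余长度为 1 的列循环补齐,于是得到两行相同的数据。我为您创建了一个辅助函数,这样您就不必重复自己(DRY 原则)。在这个函数里,我加入了对 unique() 的调用,以确保结果不会重复。正如加文在评论中指出的那样,您没有把 str_replace_all() 的结果重新赋值给 title,这就是 \n 仍然保留在标题中的原因。如果以下内容对您有帮助,请告诉我。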

library(xml2)
library(rvest)
library(stringr)
library(dplyr)
# Address of the product page; written as two pieces only to keep the source
# lines short, then joined without a separator.
url <- paste0(
  'https://www.amazon.in/OnePlus-Midnight-Black-256GB-Storage/dp',
  '/B077PWBC6V/ref=dp_prsubs_1?pd_rd_i=B077PWBC6V&psc=1'
)

# Fetch and parse the page once; the parsed document is reused for every field.
webpage <- read_html(url)

#' Extract cleaned, de-duplicated text for a CSS selector
#'
#' @param webpage A parsed HTML document (the result of `read_html()`).
#' @param node A CSS selector string handed to `html_nodes()`.
#' @return A character vector holding the distinct text values of the matched
#'   nodes, with carriage returns, newlines and tabs removed and surrounding
#'   whitespace trimmed.
parseInfo <- function(webpage, node) {
  matched_nodes <- html_nodes(webpage, node)
  raw_text <- html_text(matched_nodes)
  # Remove line breaks and tabs everywhere, then trim the edges.
  cleaned_text <- str_trim(str_replace_all(raw_text, '[\r\n\t]', ''))
  # Keep each distinct value once so repeated matches do not create extra rows.
  unique(cleaned_text)
}

# Pull each product attribute out of the parsed page. parseInfo() returns the
# cleaned, de-duplicated text of the nodes matching a CSS selector.
title <- parseInfo(webpage, 'h1#title')
price <- parseInfo(webpage, 'span#priceblock_ourprice')
desc <- parseInfo(webpage, 'div#productDescription')
rate <- parseInfo(webpage, 'span#acrPopover')
# For size and colour, select the `span.selection` nested inside the variation
# <div> (a descendant selector, equivalent to the question's two-step
# html_nodes() calls). Selecting the <div> itself would return the text of the
# whole container instead of just the selected value.
size <- parseInfo(webpage, 'div#variation_size_name span.selection')
color <- parseInfo(webpage, 'div#variation_color_name span.selection')

# Combine the fields into one row; this relies on every vector above having
# length 1 once duplicates have been removed.
product_data <- data.frame(
  Title = title,
  Price = price,
  Description = desc,
  Rating = rate,
  Size = size,
  Color = color
)

推荐阅读