首页 > 解决方案 > 清理 DataFrame 输出 | Selenium | Python

问题描述

我已经设置了一个脚本来遍历一组 URL。该脚本运行良好,但我不知道如何调整以产生更清晰的 CSV 输出。

我会尽我所能,尽量减少清理格式、删除 Excel 单元格等所需的时间。

注意:我抓取搜索量(volume)文本的方式,是我目前唯一能拿到所需内容的办法。希望我们能找到改进最终输出的好方案,同时不影响脚本的这一部分。

这是我的脚本:

# Pages to scrape; each page yields up to 5 keywords, 5 URLs and 5 volume cells.
group_url = [
'https://www.example.com',
'https://www.example2.com',
'https://www.example3.com',
'https://www.example4.com',
]

# NOTE(review): every append below pushes a *separate* one-key dict into this
# list, so each keyword/URL/volume becomes its own DataFrame row with NaN in
# all other columns — this is the cause of the messy CSV output.
data = []

for group in group_url:
    driver.get(group)
    # Wait until the first volume cell is visible, then pad with a fixed sleep
    # (presumably for late JS rendering — TODO confirm the sleep is needed).
    wait = WebDriverWait(driver, 90)
    element = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')))
    time.sleep(3)

    # NOTE(review): find_elements_by_css_selector is deprecated and removed in
    # Selenium 4; prefer driver.find_elements(By.CSS_SELECTOR, ...).
    kws = driver.find_elements_by_css_selector(".css-hijzdp-base")
    counter = 1
    for kw in kws:
        # Keep only the first 5 matches. NOTE(review): no `break`, so the loop
        # still walks every remaining element after the 5th.
        if counter <= 5:
            try:
                data.append({
                    "Keyword": kw.text
                })
                counter = counter + 1
            # NOTE(review): .text on an already-located element does not raise
            # NoSuchElementException, so this handler is dead code.
            except NoSuchElementException:
                pass

    urls = driver.find_elements_by_css_selector(".css-a5m6co-text.css-p8ym46-fontFamily.css-11397xj-fontSize.css-18j1nfb-display")
    count = 1
    for url in urls:
        # Same cap-at-5 pattern (and same missing `break`) as the keyword loop.
        if count <= 5:
            try:
                data.append({
                    "URL": url.text
                })
                count = count + 1
            except NoSuchElementException:
                pass

    # Five copy-pasted blocks differing only in tr[1]..tr[5]: grab the volume
    # cell of each of the first five table rows, skipping rows that are absent.
    try:
        vol1 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume1": vol1.text
        })
    try:
        vol2 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[2]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume2": vol2.text
        })
    try:
        vol3 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[3]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume3": vol3.text
        })
    try:
        vol4 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[4]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume4": vol4.text
        })
    try:
        vol5 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[5]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume5": vol5.text
        })

driver.close()
print(data)
#print to csv
# One-key dicts become one row each, hence the sparse/messy CSV.
df = pd.DataFrame(data)
df.to_csv('testOutput 11_11_21.csv')

这是最终输出的屏幕截图: 最终输出截图

标签: pythondataframeselenium

解决方案


您把每个条目都单独附加到了 data 中。应当先在 for 循环内用一个字典收集它们,然后再把该字典附加到列表 data 中。

# Pages to scrape; each page contributes ONE row (dict) to `data`, holding the
# first 5 keywords, first 5 URLs, and up to 5 per-row volume values.
group_url = [
'https://www.example.com',
'https://www.example2.com',
'https://www.example3.com',
'https://www.example4.com',
]

data = []

# Template for the volume cell of table row {row} (only the tr index varies).
VOLUME_XPATH = '//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[{row}]/td[6]/div'

for group in group_url:
    tmp_dict = {}

    driver.get(group)
    # Wait for the first volume cell to render, then pad with a short sleep
    # (presumably for late JS updates — TODO confirm the sleep is still needed).
    wait = WebDriverWait(driver, 90)
    wait.until(EC.visibility_of_element_located((By.XPATH, VOLUME_XPATH.format(row=1))))
    time.sleep(3)

    # Selenium 4 removed find_elements_by_css_selector; the file already uses
    # `By`, so find_elements(By.CSS_SELECTOR, ...) works on both 3.x and 4.x.
    # Slicing [:5] replaces the counter-without-break pattern: same first-5
    # elements, without walking the rest of the list.
    kws = driver.find_elements(By.CSS_SELECTOR, ".css-hijzdp-base")
    tmp_dict["Keyword"] = [kw.text for kw in kws[:5]]

    urls = driver.find_elements(By.CSS_SELECTOR, ".css-a5m6co-text.css-p8ym46-fontFamily.css-11397xj-fontSize.css-18j1nfb-display")
    tmp_dict["URL"] = [url.text for url in urls[:5]]

    # Volume1..Volume5 come from table rows 1..5; a missing row simply leaves
    # that column absent for this page (NaN in the final DataFrame).
    for row in range(1, 6):
        try:
            vol = driver.find_element(By.XPATH, VOLUME_XPATH.format(row=row))
        except NoSuchElementException:
            pass
        else:
            tmp_dict["Volume{}".format(row)] = vol.text

    # One dict per page -> one tidy row per page in the CSV.
    data.append(tmp_dict)

driver.close()
print(data)
#print to csv
df = pd.DataFrame(data)
df.to_csv('testOutput 11_11_21.csv')

推荐阅读