首页 > 解决方案 > 如何使用python从网页中获取所有可见对象的坐标

问题描述

我正在尝试渲染一个网站,以便从 HTML 页面中获取人眼可见对象的坐标。我可以使用 Selenium 获取某个特定对象的坐标,但我需要的是所有对象的坐标。主要问题是如何筛选包含信息的 div 与透明的 div(我附上了图片),我该怎么做? 在此处输入图像描述

from time import sleep, strftime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Print the on-screen coordinates of the "Newest" link on the Stack Overflow
# questions page: the element's in-page position, with the y value shifted
# by the height of the browser's own UI.
driver = webdriver.Chrome()
try:
    driver.maximize_window()  # now screen top-left corner == browser top-left corner
    driver.get("http://stackoverflow.com/questions")

    # find_element(By.LINK_TEXT, ...) is the supported spelling of the
    # deprecated find_element_by_link_text() helper.
    question = driver.find_element(By.LINK_TEXT, "Newest")

    # Each access to .location is a WebDriver round trip, so read it once.
    location = question.location
    y_relative_coord = location['y']

    # Vertical space used by the browser's own UI. Adding all of it to y
    # assumes that UI sits entirely above the page -- TODO confirm.
    browser_navigation_panel_height = driver.execute_script(
        'return window.outerHeight - window.innerHeight;')

    y_absolute_coord = y_relative_coord + browser_navigation_panel_height
    x_absolute_coord = location['x']
    print(x_absolute_coord, y_absolute_coord)
finally:
    # Always shut the browser down, even when the page load or lookup fails,
    # so no orphaned Chrome/chromedriver process is left behind.
    driver.quit()

我做了一些补充,但由于某种原因代码无法正常工作

import cv2
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By

# Take a screenshot of the Stack Overflow questions page and draw a red
# rectangle around every <div> that is not hidden via an inline
# "display:none" style.
SCREENSHOT_PATH = "screenshot.png"

driver = webdriver.Chrome()
try:
    driver.set_window_size(1920, 1080)
    driver.get("http://stackoverflow.com/questions")
    driver.save_screenshot(SCREENSHOT_PATH)

    # NOTE(review): this XPath only inspects the inline style attribute; divs
    # hidden through a stylesheet are still returned.
    list_of_visible_elements = driver.find_elements(
        By.XPATH, "//div[not(contains(@style,'display:none'))]")

    contours = []
    for element in list_of_visible_elements:
        location = element.location
        size = element.size
        # The boxes are drawn on the screenshot, which contains only the
        # rendered page, so the in-page location is used as-is. Adding the
        # browser UI height (outerHeight - innerHeight) would shift every
        # box away from the element it belongs to.
        box = (location['x'], location['y'], size['width'], size['height'])
        # cv2.rectangle needs integer pixel coordinates; round in case the
        # driver reports fractional values.
        contours.append([int(round(value)) for value in box])
finally:
    # Release the browser as soon as the geometry has been collected, and
    # also when anything above raises.
    driver.quit()

img = cv2.imread(SCREENSHOT_PATH)
if img is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError("could not read " + SCREENSHOT_PATH)

result = img.copy()
for x, y, w, h in contours:
    cv2.rectangle(result, (x, y), (x + w, y + h), (0, 0, 255), 2)
cv2.imshow("bounding_box", result)
cv2.waitKey(0)
cv2.destroyAllWindows()

UPD 我对代码进行了一些改进,它似乎工作正常,但仍然不是很准确,如何改进结果?

import cv2
import os
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Headless variant: screenshot the Stack Overflow questions page, draw a thin
# red rectangle around every <div> that is not hidden via an inline
# "display:none" style and reports a non-zero position and size, show the
# result, then delete the temporary screenshot.
SCREENSHOT_PATH = "s.png"

options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1920,1080")

# "options=" replaces the deprecated "chrome_options=" keyword.
driver = webdriver.Chrome(options=options)
try:
    driver.set_window_size(1920, 1080)
    driver.get("https://stackoverflow.com/questions")
    driver.save_screenshot(SCREENSHOT_PATH)

    # NOTE(review): this XPath only inspects the inline style attribute; divs
    # hidden through a stylesheet are still returned.
    list_of_visible_elements = driver.find_elements(
        By.XPATH, "//div[not(contains(@style,'display:none'))]")

    contours = []
    for element in list_of_visible_elements:
        location = element.location
        size = element.size
        # The boxes are drawn on the screenshot, which contains only the
        # rendered page, so the in-page location is used as-is. Adding the
        # browser UI height (outerHeight - innerHeight) would shift every
        # box away from the element it belongs to.
        box = (location['x'], location['y'], size['width'], size['height'])
        # Skip anything with a zero coordinate or a zero dimension.
        if 0 in box:
            continue
        # cv2.rectangle needs integer pixel coordinates; round in case the
        # driver reports fractional values.
        contours.append([int(round(value)) for value in box])
finally:
    # Release the browser as soon as the geometry has been collected, and
    # also when anything above raises.
    driver.quit()

img = cv2.imread(SCREENSHOT_PATH)
if img is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError("could not read " + SCREENSHOT_PATH)

result = img.copy()
for x, y, w, h in contours:
    cv2.rectangle(result, (x, y), (x + w, y + h), (0, 0, 255), 1)
cv2.imshow("bounding_box", result)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Best-effort cleanup: a failure to delete the screenshot is not fatal, but
# only filesystem errors are ignored (not KeyboardInterrupt and friends).
try:
    os.remove(SCREENSHOT_PATH)
except OSError:
    pass

在此处输入图像描述

标签: javascript python html selenium parsing

解决方案


You can use driver.execute_script to run a recursive generator function in Javascript that will traverse all the visible DOM nodes and return only those coordinates which are within the browser window height:

from selenium import webdriver

# Collect [x, y, width, height] boxes for DOM elements by running a recursive
# JavaScript generator inside the page and returning its results to Python.
d = webdriver.Chrome('/path/to/chromedriver')
try:
    d.get('https://stackoverflow.com/questions/tagged/python')
    # get_nodes(root) walks the element children (nodeType === 1) of root,
    # skipping any subtree whose computed "display" is "none", and yields the
    # getBoundingClientRect() box of each node that passes the offsetTop test.
    # NOTE(review): that test compares offsetTop against
    # window.outerHeight - window.innerHeight (the size of the browser's own
    # UI), not against the viewport height -- confirm the intended cut-off.
    r = d.execute_script("""
function* get_nodes(root){
   var style = window.getComputedStyle(root)
   if (style.getPropertyValue('display') != 'none'){
       if (root.offsetTop <= (window.outerHeight - window.innerHeight)){
          var b_d = root.getBoundingClientRect()
          yield [b_d.x, b_d.y, b_d.width, b_d.height]
       }
       for (var i of root.childNodes){
          if (i.nodeType === 1){
             yield* get_nodes(i)
          }
       }
   }
}
return [...get_nodes(document.body)]
""")
finally:
    # Always shut the browser down so no Chrome/chromedriver process leaks.
    d.quit()
print(r)

Output:

[[0, 0, 1200, 3519.9375], [0, 0, 1200, 0], [0, 0, 1200, 50], [0, 3, 1200, 47], [0, 3, 166, 47], [0, 3, 166, 47], [8, 9.5, 150, 30], [166, 10, 261.59375, 33], [168, 12, 68.359375, 29], [168, 12, 68.359375, 29], [240.359375, 12, 86.484375, 29], [240.359375, 12, 86.484375, 29], [330.84375, 12, 94.75, 29], [330.84375, 12, 94.75, 29], [427.59375, 9.703125, 628.9375, 33.59375], [439.59375, 9.703125, 604.9375, 33.59375], [439.59375, 9.703125, 604.9375, 33.59375], [1056.53125, 3, 143.46875, 47], [1056.53125, 10, 143.46875, 33], [1060.53125, 10, 59.453125, 33], [1123.578125, 10, 68.421875, 33], [1200, 26.5, 0, 0], [0, 50, 1200, 3147.9375], [0, 50, 164, 3147.9375], [0, 50, 164, 605], [0, 74, 154, 573.171875], [0, 74, 154, 573.171875], [0, 74, 154, 34], [0, 74, 154, 34], [8, 78, 142, 26], [8, 78, 35.765625, 26], [0, 124, 154, 221], [0, 124, 154, 221], [8, 124, 146, 14], [0, 142, 154, 33], [0, 142, 154, 33], [30, 150, 65.515625, 17], [8, 361, 146, 14], [0, 383, 154, 264.171875], [13, 396, 120.6875, 33], [13, 472, 129, 105.796875], [164, 50, 1036, 3147.9375], [189, 74, 663, 2952.1875], [189, 74, 663, 59], [189, 74, 547.984375, 35], [189, 74, 311.1875, 35], [748.984375, 74, 103.015625, 59], [748.984375, 74, 103.015625, 37.78125], [189, 133, 663, 121], [189, 133, 663, 85], [407.65625, 282.59375, 42.265625, 13], [470.109375, 282.59375, 35.015625, 13], [525.3125, 282.59375, 50.53125, 13], [579.84375, 282.59375, 24.109375, 13], [624.140625, 282.59375, 70.125, 13], [189, 318.1875, 663, 0], [189, 318.1875, 663, 243.1875], [190, 319.1875, 661, 182], [186, 319.1875, 669, 182], [202, 335.1875, 161.40625, 150], [202, 331.1875, 161.40625, 104], [202, 331.1875, 57.8125, 19], [202, 358.1875, 161.40625, 19], [200, 358.1875, 165.40625, 19], [202, 361.1875, 13, 13], [202, 361.1875, 13, 13], [219, 358.1875, 83.859375, 19], [202, 385.1875, 161.40625, 19], [200, 385.1875, 165.40625, 19], [202, 388.1875, 13, 13], [202, 388.1875, 13, 13], [219, 385.1875, 144.40625, 19], [202, 412.1875, 161.40625, 
19], [200, 412.1875, 165.40625, 19], [202, 415.1875, 13, 13], [202, 415.1875, 13, 13], [219, 412.1875, 82.34375, 19], [395.40625, 335.1875, 157.53125, 150], [395.40625, 331.1875, 157.53125, 158], [395.40625, 331.1875, 69.796875, 19], [395.40625, 358.1875, 157.53125, 19], [393.40625, 358.1875, 161.53125, 19], [395.40625, 361.1875, 13, 13], [395.40625, 361.1875, 13, 13], [412.40625, 358.1875, 55.421875, 19], [395.40625, 385.1875, 157.53125, 19], [393.40625, 385.1875, 161.53125, 19], [395.40625, 388.1875, 13, 13], [395.40625, 388.1875, 13, 13], [412.40625, 385.1875, 105.40625, 19], [395.40625, 412.1875, 157.53125, 19], [393.40625, 412.1875, 161.53125, 19], [395.40625, 415.1875, 13, 13], [395.40625, 415.1875, 13, 13], [412.40625, 412.1875, 79.5625, 19], [395.40625, 439.1875, 157.53125, 19], [393.40625, 439.1875, 161.53125, 19], [412.40625, 439.1875, 100.5625, 19], [584.9375, 335.1875, 234.65625, 150], [584.9375, 331.1875, 234.65625, 77], [584.9375, 331.1875, 87.3125, 19], [584.9375, 358.1875, 234.65625, 19], [582.9375, 358.1875, 238.65625, 19], [584.9375, 361.1875, 13, 13], [584.9375, 361.1875, 13, 13], [601.9375, 358.1875, 121.03125, 19], [584.9375, 385.1875, 234.65625, 19], [582.9375, 385.1875, 238.65625, 19], [584.9375, 388.1875, 13, 13], [584.9375, 388.1875, 13, 13], [601.9375, 385.1875, 133.46875, 19], [608.9375, 412.1875, 210.65625, 37], [608.9375, 412.1875, 210.65625, 37], [611.9375, 421.6875, 73.171875, 16], [613.9375, 419.28125, 69.171875, 24], [662.109375, 423.28125, 16, 16], [685.109375, 415.1875, 23, 29], [708.109375, 421.6875, 0, 16], [189, 2942.1875, 0, 16], [876, 74, 300, 3084.9375], [876, 74, 300, 371], [877, 74, 298, 358], [877, 74, 298, 41], [877, 127, 298, 34], [893, 127, 22.15625, 34], [915.15625, 127, 243.84375, 34], [915.15625, 127, 208.8125, 33], [877, 173, 298, 51], [893, 173, 22.15625, 51], [915.15625, 173, 243.84375, 51], [915.15625, 173, 236.59375, 50], [877, 755, 298, 42], [889, 762, 274, 25], [889, 762, 67.984375, 25], [1163, 762, 0, 25], 
[877, 797, 298, 966], [877, 797, 298, 920], [877, 797, 298, 115], [877, 797, 298, 115], [889, 806, 40, 100], [889, 806, 32, 32], [929, 803, 234, 103], [929, 806, 234, 32], [929, 841, 234, 16], [929, 841, 33.234375, 15], [969.234375, 841, 86.09375, 16], [983.234375, 841, 72.09375, 16], [929, 860, 234, 16], [929, 860, 81.953125, 16], [929, 861, 81.953125, 13], [1017.953125, 860, 63.390625, 16], [1017.953125, 861, 63.390625, 13], [929, 879, 31.84375, 24], [962.84375, 879, 48.296875, 24], [929, 906, 234, 0], [1153, 803, 16, 16], [1071.640625, 776, 97.359375, 22], [877, 912, 298, 99], [889, 921, 40, 84], [889, 921, 32, 32], [929, 918, 234, 87], [929, 921, 234, 16], [929, 940, 234, 16], [929, 940, 57.96875, 15], [993.96875, 940, 114.015625, 16], [1007.96875, 940, 100.015625, 16], [929, 959, 234, 16], [929, 959, 78.109375, 16], [929, 960, 78.109375, 13], [1014.109375, 959, 63.390625, 16], [1014.109375, 960, 63.390625, 13], [929, 978, 48.296875, 24], [979.296875, 978, 51.125, 24], [929, 1005, 234, 0], [1153, 918, 16, 16], [1071.640625, 891, 97.359375, 22], [877, 1011, 298, 115], [889, 1020, 40, 100], [889, 1020, 32, 32], [929, 1017, 234, 103], [929, 1020, 234, 32], [929, 1055, 234, 16], [929, 1055, 82.40625, 15], [1018.40625, 1055, 114.015625, 16], [1032.40625, 1055, 100.015625, 16], [929, 1074, 234, 16], [929, 1074, 63.390625, 16], [929, 1075, 63.390625, 13], [929, 1093, 49.171875, 24], [980.171875, 1093, 32.234375, 24], [929, 1120, 234, 0], [1153, 1017, 16, 16], [1071.640625, 990, 97.359375, 22], [877, 1126, 298, 99], [889, 1135, 40, 84], [889, 1135, 32, 32], [929, 1132, 234, 87], [929, 1135, 234, 16], [929, 1154, 234, 16], [929, 1154, 58.796875, 15], [994.796875, 1154, 114.015625, 16], [1008.796875, 1154, 100.015625, 16], [929, 1173, 234, 16], [929, 1173, 81.984375, 16], [929, 1174, 81.984375, 13], [1017.984375, 1173, 63.390625, 16], [1017.984375, 1174, 63.390625, 13], [929, 1192, 51.125, 24], [982.125, 1192, 64.421875, 24], [929, 1219, 234, 0], [1153, 1132, 16, 16], 
[1071.640625, 1105, 97.359375, 22], [877, 1225, 298, 131], [889, 1234, 40, 116], [889, 1234, 32, 32], [929, 1231, 234, 119], [929, 1234, 234, 48], [929, 1285, 234, 16], [929, 1285, 82.921875, 15], [1018.921875, 1285, 114.015625, 16], [1032.921875, 1285, 100.015625, 16], [929, 1304, 234, 16], [929, 1304, 78.5625, 16], [929, 1305, 78.5625, 13], [1014.5625, 1304, 63.390625, 16], [1014.5625, 1305, 63.390625, 13], [929, 1323, 51.125, 24], [982.125, 1323, 67.234375, 24], [1153, 1231, 16, 16], [1071.640625, 1204, 97.359375, 22], [877, 1356, 298, 115], [889, 1365, 40, 100], [889, 1365, 32, 32], [929, 1362, 234, 103], [929, 1365, 234, 32], [929, 1400, 234, 16], [929, 1400, 52.46875, 15], [988.46875, 1400, 114.015625, 16], [1002.46875, 1400, 100.015625, 16], [929, 1419, 234, 16], [929, 1419, 77.3125, 16], [929, 1420, 77.3125, 13], [1013.3125, 1419, 63.390625, 16], [1013.3125, 1420, 63.390625, 13], [929, 1438, 48.984375, 24], [979.984375, 1438, 67.234375, 24], [929, 1465, 234, 0], [1153, 1362, 16, 16], [1071.640625, 1335, 97.359375, 22], [877, 1471, 298, 131], [889, 1480, 40, 116], [889, 1480, 32, 32], [929, 1477, 234, 119], [929, 1480, 234, 32], [929, 1515, 234, 32], [929, 1515, 150.71875, 15], [929, 1531, 95.8125, 16], [943, 1531, 81.8125, 16], [929, 1550, 234, 16], [929, 1550, 63.390625, 16], [929, 1551, 63.390625, 13], [999.390625, 1550, 88.953125, 16], [999.390625, 1551, 88.953125, 13], [929, 1569, 48.296875, 24], [979.296875, 1569, 64.421875, 24], [1153, 1477, 16, 16], [1071.640625, 1450, 97.359375, 22], [877, 1602, 298, 109], [889, 1611, 32, 32], [929, 1608, 234, 103], [929, 1611, 234, 16], [929, 1630, 234, 32], [929, 1630, 217.78125, 15], [929, 1646, 79.0625, 16], [943, 1646, 65.0625, 16], [929, 1665, 234, 16], [929, 1665, 63.390625, 16], [929, 1666, 63.390625, 13], [929, 1684, 49.171875, 24], [980.171875, 1684, 48.296875, 24], [929, 1711, 234, 0], [1153, 1608, 16, 16], [1071.640625, 1581, 97.359375, 22], [0, 0, 1200, 655], [487.9937438964844, 341.8999938964844, 
223.99685668945312, 18.600006103515625], [487.9937438964844, 368.8999938964844, 223.99685668945312, 55.556243896484375], [487.9937438964844, 370.1000061035156, 223.99685668945312, 30.600006103515625], [487.9937438964844, 370.1000061035156, 223.99685668945312, 30.600006103515625], [489.1937561035156, 382.70001220703125, 221.59689331054688, 18], [487.9937438964844, 403.1000061035156, 223.99685668945312, 20.15625], [697.5906372070312, 332.29998779296875, 24, 24.600006103515625], [665.859375, 3229.9375, 198, 171], [665.859375, 3229.9375, 198, 34], [665.859375, 3229.9375, 121.0625, 33], [786.921875, 3229.9375, 0, 16], [665.859375, 3275.9375, 198, 125], [665.859375, 3275.9375, 198, 25], [665.859375, 3275.9375, 87.84375, 25], [665.859375, 3300.9375, 198, 25], [665.859375, 3300.9375, 76.75, 25], [665.859375, 3325.9375, 198, 25], [665.859375, 3325.9375, 138.375, 25], [665.859375, 3350.9375, 198, 25], [665.859375, 3350.9375, 65.9375, 25], [0, 0, 0, 0], [0, 50, 3.59375, 17]]

推荐阅读