首页 > 解决方案 > 为什么我的网站抓取不适用于逗号前有 3 位或更多位的值?

问题描述

我的 Python 程序从一个网站抓取股价,但当数值大于 99 时就无法正常工作,而且我看不懂报错信息。该网站是一个 HTML 页面,我从页面中的表格里读取数值。

def loadFromWebsite(company,ISIN):
    """Print and collect historical share values scraped from boerse.de.

    Downloads the "historische-kurse" page for the given share once, then
    walks backwards one calendar day at a time, starting with yesterday,
    for 28 days. For every weekday it looks the date up in the page's
    table and records the first token of the 'Erster Schluss' cell.

    Args:
        company: Name used in the URL path in front of "-Aktie".
        ISIN: Identifier used as the last URL path segment.

    Returns:
        None. The collected [value, date] pairs are kept in a local list
        that is never returned; the values are only printed.

    Raises:
        KeyError: When the date filter matches no row, because
            ``row.loc[0, ...]`` is then evaluated on an empty frame.
    """

    # counter to load past 4 weeks (28 calendar days, weekends included)
    count = 28

    # generating URL to website
    URL = "https://www.boerse.de/historische-kurse/{0}-Aktie/{1}".format (company, ISIN)

    # collects one [value, date] pair per successfully read day
    shareValues = []

    # getting onto website (single request; the response is reused below)
    response = requests.get(URL)

    # get current date
    date_object = datetime.datetime.now()

    # website lists values starting from previous day
    start = date_object + datetime.timedelta(days = -1)

    # 'previous' is the day currently being looked up; it moves backwards
    previous = start

    # counting 4 weeks
    while count > 0:

        # Market is closed on saturdays (5) and sundays (6)
        if previous.weekday() < 5:

            # storing content of page
            src = response.content

            # create BeautifulSoup object based on src
            # NOTE(review): src never changes, yet it is re-parsed on every
            # iteration of the loop.
            soup = BeautifulSoup(src, 'html.parser')

            tables = soup.find_all("table")

            for table in tables:
                # Only tables whose text contains yesterday's date are used.
                # '%y' is the two-digit year, so this looks for e.g. '18.03.20'.
                if start.strftime('%d.%m.%y') in table.text:
                    df = pd.read_html(str(table))[0]

                    # get row of the requested date
                    row = df[df['Datum']== previous.strftime('%d.%m.%y')].reset_index()

                    # add value of share (beginning of day) to values array:
                    # first whitespace-separated token, decimal comma -> dot.
                    # Raises KeyError if 'row' is empty (no matching date).
                    value = (row.loc[0,'Erster Schluss'].split()[0]).replace(',','.')
                    print(value)

                    # add date of the current value to date array
                    date = (previous.strftime('%d.%m.%y'))
                    test = []
                    test.append(value)
                    test.append(date)

                    # save value and date into list
                    shareValues.append(test)
        count = count - 1

        # step one calendar day further into the past
        previous = previous + datetime.timedelta(days = -1)

# Example invocation from the question: company "TecDax", ISIN "DE0007203275".
loadFromWebsite("TecDax","DE0007203275")

错误信息:

  Traceback (most recent call last):
      File "C:\Users\hendr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\indexes\range.py", line 350, 
    in get_loc
        return self._range.index(new_key)
    ValueError: 0 is not in range

    During handling of the above exception, another exception occurred:

    Traceback (most recent call last):
      File "c:/Users/hendr/Documents/Python_Projects/loadShareValues.py", line 99, in <module>
        loadFromWebsite("TecDax","DE0007203275")
      File "c:/Users/hendr/Documents/Python_Projects/loadShareValues.py", line 82, in loadFromWebsite
        value = (row.loc[0,'Erster Schluss'].split()[0]).replace(',','.')
      File "C:\Users\hendr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\indexing.py", line 1762, in __getitem__
        return self._getitem_tuple(key)
      File 
    "C:\Users\hendr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\indexing.py", line 1272, in _getitem_tuple
        return self._getitem_lowerdim(tup)
      File 
    "C:\Users\hendr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\indexing.py", line 1389, in _getitem_lowerdim
        section = self._getitem_axis(key, axis=i)
      File 
    "C:\Users\hendr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\indexing.py", line 1965, in _getitem_axis
        return self._get_label(key, axis=axis)
      File 
    "C:\Users\hendr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\indexing.py", line 625, in _get_label
        return self.obj._xs(label, axis=axis)
      File 
    "C:\Users\hendr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\generic.py", line 3537, in xs
        loc = self.index.get_loc(key)
      File 
    "C:\Users\hendr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\indexes\range.py", line 352, 
    in get_loc
        raise KeyError(key)
    KeyError: 0
  row:    
  index     Datum Erster Schluss  Hoch Tief Tief Schluss Volumen
  0      0  18.03.20      4,95 5,10  5,45 4,91  491     510    2.197.259

标签: python html python-3.x pandas web-scraping

解决方案


问题出在日期的解析上:我们需要的是 '18.03.2020',而代码查找的却是 '18.03.20'。因此筛选得到的是一个空行,随后您又试图对空值进行拆分并取索引 0,于是报错。

所以只需把代码中所有出现 '%d.%m.%y' 的地方改为 '%d.%m.%Y' 即可:

def _parse_price_tables(html):
    """Return a DataFrame for every table in *html* that has the price columns."""
    import io

    soup = BeautifulSoup(html, 'html.parser')
    frames = []
    for table in soup.find_all("table"):
        # Cheap text check first so unrelated tables are never handed to pandas.
        if 'Datum' not in table.text:
            continue
        try:
            # Wrap in StringIO: newer pandas deprecates literal HTML strings.
            df = pd.read_html(io.StringIO(str(table)))[0]
        except ValueError:
            # pandas raises ValueError when it finds no parsable table.
            continue
        if 'Datum' in df.columns and 'Erster Schluss' in df.columns:
            frames.append(df)
    return frames


def _german_to_decimal(text):
    """Convert a German-formatted number such as '1.234,56' to '1234.56'."""
    # Drop the thousands separator before turning the decimal comma into a dot;
    # otherwise '1.234,56' would become the unparsable '1.234.56'.
    return text.replace('.', '').replace(',', '.')


def loadFromWebsite(company, ISIN):
    """Scrape historical share values from boerse.de.

    The "historische-kurse" page is downloaded and parsed once. Starting with
    yesterday, the past 28 calendar days are then looked up in the parsed
    table(s); weekends and days without a row (e.g. holidays) are skipped.

    Args:
        company: Name used in the URL path in front of "-Aktie".
        ISIN: Identifier used as the last URL path segment.

    Returns:
        A list of ``[value, date]`` pairs, newest first. ``value`` is the
        first token of the 'Erster Schluss' cell as a string with a dot as
        decimal separator; ``date`` is formatted as ``'%d.%m.%Y'``.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    # number of calendar days to look back (4 weeks, weekends included)
    days_back = 28
    date_format = '%d.%m.%Y'

    URL = "https://www.boerse.de/historische-kurse/{0}-Aktie/{1}".format(company, ISIN)

    # A timeout keeps the call from hanging forever on a stalled connection.
    response = requests.get(URL, timeout=30)

    # Parse the page once; its content does not change between lookups.
    frames = _parse_price_tables(response.content)

    # website lists values starting from previous day
    start = datetime.datetime.now() - datetime.timedelta(days=1)

    shareValues = []
    for offset in range(days_back):
        day = start - datetime.timedelta(days=offset)

        # Market is closed on saturdays (5) and sundays (6)
        if day.weekday() >= 5:
            continue

        date = day.strftime(date_format)
        for df in frames:
            matches = df[df['Datum'] == date]
            if matches.empty:
                # No row for this date (e.g. a holiday): skip it
                # instead of failing with KeyError on an empty frame.
                continue

            cell = matches.iloc[0]['Erster Schluss']
            if not isinstance(cell, str):
                # Missing or already-numeric cell: nothing to split.
                continue
            tokens = cell.split()
            if not tokens:
                continue

            # first token of the cell = value at the beginning of the day
            value = _german_to_decimal(tokens[0])
            print(value)

            # save value and date into list
            shareValues.append([value, date])

    return shareValues

if __name__ == "__main__":
    # Run the example only when executed as a script, so that importing this
    # module does not trigger a network request.
    loadFromWebsite("TecDax", "DE0007203275")

推荐阅读