python - 如何使用 Asyncio 避免错误 429 (Too Many Requests) python
问题描述
我正在使用以下代码向 aiohttp 客户端发出请求。我尝试发送请求的服务器每个 IP 每小时的请求限制为 30k。所以我收到 429 too many request 错误。我想在工作达到极限时让工作进入睡眠状态。
我可以从标题中提取 x_rateLimit_reset,所以我想我可以用它来让工作进入睡眠状态,但我观察到了非常奇怪的行为。有时工作的睡眠时间会变成负数,有时它会陷入睡眠模式。
例如,我上次运行该作业时,它首先休眠了 2000 秒,然后经过一段时间后,它再次尝试再休眠 2500 秒并陷入休眠模式。我想也许是其他并行进程导致了这个问题,所以想知道在使用 Asyncio 时如何处理太多的请求错误消息。
@backoff.on_exception(backoff.expo, (asyncio.TimeoutError, aiohttp.client_exceptions.ServerDisconnectedError,TooManyRequests),
max_time=300)
async def fetch(self, url, session, params):
try:
async with session.get(url, params=params) as response:
now = int(time.time())
print(response)
output = await response.read()
output = json.loads(output)
if 'X-RateLimit-Remaining' in response.headers:
rate = response.headers['X-RateLimit-Remaining']
if 'status' in output and output['status'] == 429:
x_rateLimit_reset = int(response.headers['X-RateLimit-Reset'])
print("sleep mode")
seconds = x_rateLimit_reset - now
LOGGER.info("The job will sleep for {} seconds".format(seconds))
time.sleep(max(seconds,0))
raise TooManyRequests()
return output
except (asyncio.TimeoutError, TypeError, json.decoder.JSONDecodeError,
aiohttp.client_exceptions.ServerDisconnectedError) as e:
print(str(e))
async def bound_fetch(self, sem, url, session, params):
# Getter function with semaphore.
async with sem:
output = await self.fetch(url, session, params)
return {"url": url, "output": output}
编辑:这就是我启动 bound_fetch 并定义 URL 的方式:
def get_responses(self, urls, office_token, params=None):
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(self.run(office_token, urls, params))
responses = loop.run_until_complete(future)
return responses
async def run(self, office_token, urls, params):
tasks = []
# create instance of Semaphore
sem = asyncio.BoundedSemaphore(200)
timeout = ClientTimeout(total=1000)
async with ClientSession(auth=BasicAuth(office_token, password=' '), timeout=timeout,
connector=TCPConnector(ssl=False)) as session:
for url in urls:
# pass Semaphore and session to every GET request
task = asyncio.ensure_future(self.bound_fetch(sem, url, session, params))
tasks.append(task)
responses = await asyncio.gather(*tasks)
return responses
urls = [
"{}/{}".format(self.base_url, "{}?page={}&api_key={}".format(object_name, page_number, self.api_keys))
for page_number in range(batch * chunk_size + 1, chunk_size * (1 + batch) + 1)]
解决方案
time.sleep()
您改为使用的主要原因await asyncio.sleep()
。
更新
这是最小的工作解决方案和一些评论它是如何工作的。
请使用它来采用您的解决方案。
import aiohttp
import asyncio
from datetime import datetime
async def fetch(session, task): # fetching urls and mark result of execution
async with session.get(task['url']) as response:
if response.status != 200:
# response.raise_for_status()
# Here you need to somehow handle 429 code if it acquired
# In my example I just skip it.
task['result'] = response.status
task['status'] = 'done'
await response.text() # just to be sure we acquire data
print(f"{str(datetime.now())}: Got result of {task['url']}") # logging
task['result'] = response.status
task['status'] = 'done'
async def fetch_all(session, urls, persecond):
# convert to list of dicts
url_tasks = [{'url': i, 'result': None, 'status': 'new'} for i in urls]
n = 0 # counter
while True:
# calc how many tasks are fetching right now
running_tasks = len([i for i in url_tasks if i['status'] in ['fetch']])
# calc how many tasks are still need to be executed
is_tasks_to_wait = len([i for i in url_tasks if i['status'] != 'done'])
# check we are not in the end of list n < len()
# check we have room for one more task
if n < len(url_tasks) and running_tasks < persecond:
url_tasks[n]['status'] = 'fetch'
#
# Here is main trick
# If you schedule task inside running loop
# it will start to execute sync code until find some await
#
asyncio.create_task(fetch(session, url_tasks[n]))
n += 1
print(f'Schedule tasks {n}. '
f'Running {running_tasks} '
f'Remain {is_tasks_to_wait}')
# Check persecond constrain and wait a sec (or period)
if running_tasks >= persecond:
print('Throttling')
await asyncio.sleep(1)
#
# Here is another main trick
# To keep asyncio.run (or loop.run_until_complete) executing
# we need to wait a little than check that all tasks are done and
# wait and so on
if is_tasks_to_wait != 0:
await asyncio.sleep(0.1) # wait all tasks done
else:
# All tasks done
break
return url_tasks
async def main():
urls = ['http://google.com/?1',
'http://google.com/?2',
'http://google.com/?3']*3
async with aiohttp.ClientSession() as session:
res = await fetch_all(session, urls, 3)
print(res)
if __name__ == '__main__':
asyncio.run(main())
# (asyncio.run) do cancel all pending tasks (we do not have them,
# because we check all task done)
# (asyncio.run) do await canceling all tasks
# (asyncio.run) do stop loop
# exit program
推荐阅读
- python - Pandas dataframe.plot 与 matplotlib.pyplot 不匹配
- django - Django:如何在不引用文件目标的情况下附加文件?
- elasticsearch - Elasticsearch 上的单个查询中的多个聚合
- android - 为什么我的 TextView 在 LinearLayout 中的先前 TextView 中以长文本不可见?
- jenkins - 推送 bitbucket 服务器后触发管道
- google-cloud-platform - 防火墙规则名称与命令中提供的不同
- javascript - 如何将文本文件中的数据读取到变量中,以便可以根据值更改颜色?
- json - 更新 hashcorp Vault 中的单个值
- github - 如何使用 Github API 检索组织的存储库?
- javascript - 如何通过表单外的按钮提交表单?