python - 检查文件是否已处理的最佳方法
问题描述
编辑:我最初的标题非常具有误导性。
我有一个带数据库的 SQL 服务器,一个目录中有大约 10,000 个 excel 文件。这些文件包含我需要复制到数据库中的值,并且每天都会添加新的 excel 文件。此外,每个文件都包含一个带有布尔值的“已完成”字段,表示文件是否已准备好复制到数据库。但是,文件名与文件是否已完成无关。只有文件的内容包含与数据库的键和字段名对应的主键和字段名。
通过反复比较主键来检查文件的内容是否已经在数据库中是不可行的,因为打开文件太慢了。但是,我可以先做一次初始检查,确认哪些文件已经在数据库中,并将结果写入一个文件(例如copyed.txt),使它只保存所有已复制文件的文件名。然后,真正的服务可以将此文件的内容加载到字典(dict1)中,文件名作为键,没有值(我认为哈希表对于比较操作来说是最快的),再将目录中所有现有 excel 文件的文件名存储在第二个字典(dict2)中,比较两个字典并创建一个列表,列出所有在 dict2 中但不在 dict1 中的文件。然后我会遍历列表(通常应该只包含大约 10-20 个文件),检查文件是否被标记为“已完成”;如果是,就把其中的值复制到数据库,并把文件名追加到 copyed.txt 中。
我的想法是将这个 python 脚本作为服务运行,只要还有文件可以处理,它就会一直循环。当它找不到要复制的文件时,它应该等待 x 秒(可能是 45 秒),而不是直接结束运行。
这是我迄今为止最好的概念。有更快/更有效的方法吗?
解决方案
我突然想到,集合只包含独特的元素,因此是进行此类比较的最佳数据类型。这是一种我几乎不知道的数据类型,但现在我可以看到它有多么有用。
与我的原始问题相关的代码部分在第 1-3 部分中。程序步骤: 1. 将文件名从文件加载到集合 2. 从文件系统/某个目录 + 子目录加载文件名到集合 3. 求两个集合的差集,得到尚未处理的文件 4. 逐一遍历这些文件(检查“finalized”状态,读取数值并写入数据库) 5. 将处理后的文件名添加到文件名文件中。
它每 5 分钟执行一次。这对我的目的来说完全没问题。
我对编码很陌生,很抱歉我的业余方法。至少到目前为止它有效。
#modules
import pandas as pd
import pyodbc as db
import xlwings as xw
import glob
import os
from datetime import datetime, date
from pathlib import Path
import time
import sys
# constants
# Seconds the service waits between two scans of the file system.
tick_time_seconds = 300
# Separator printed to the console and written at the top of each log entry.
line = "################################################################################### \n"
# Recursive glob prefix; the file extension is appended to it before globbing.
pathTodo = r"c:\myXlFiles\**\*"
# Directory that holds filesDone.txt (the list of already processed files).
pathDone = "c:\\Done\\"
# Directory that receives the per-day error logs.
pathError = "c:\\Error\\"
# Connection settings used to build the ODBC connection string.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a protected config file.
sqlServer = r"MyMachine\MySQLServer"
sqlDriver = "{SQL Server}"
sqlDatabase = "master"
sqlUID = "SA"
sqlPWD = "PWD"
#functions
def get_list_of_files_by_extension(path: str, extension: str) -> list:
    """Return all files matching the glob pattern ``path + extension``.

    Args:
        path: glob pattern prefix as a string or ``os.PathLike``; it may
            contain ``**`` to descend into sub-directories, because the search
            is run with ``recursive=True``.
        extension: suffix appended to ``path`` before globbing, e.g. ".xlsm".

    Returns:
        A list of the matching paths as strings (empty when nothing matches).
        A short status message is printed in either case.
    """
    # os.fspath lets callers pass a pathlib.Path as well as a plain string.
    pattern = os.fspath(path) + extension
    fileList = glob.glob(pattern, recursive=True)
    print("found files" if fileList else "no found files")
    return fileList
def write_error_to_log(description: str, errorString: str, optDetails=""):
    """Print an error report and append it to the log file of the current day.

    Args:
        description: what the caller was doing when the error occurred.
        errorString: the text of the error itself.
        optDetails: optional extra context (for example the SQL statement).

    The report consists of the separator line, the current date and time, the
    description, the optional details and the error text, one per line. It is
    appended to a file named after today's date (``str(date.today())`` plus
    ".txt") inside the error directory. Returns nothing.
    """
    logFileName = str(date.today()) + ".txt"
    # The description used to be accepted but silently dropped, which left the
    # log entries without any hint about the failing step.
    newError = "{0}\n{1}\n{2}\n{3}\n{4}\n".format(
        line, str(datetime.now()), description, optDetails, errorString
    )
    print(newError)
    with open(Path(pathError, logFileName), "a") as logFile:
        logFile.write(newError)
def sql_connector():
    """Open a connection to the SQL server.

    Takes no arguments; driver, server, database, user and password are read
    from the module-level constants and combined into one connection string.
    Returns whatever ``db.connect`` returns for that string.
    """
    settings = (
        ("DRIVER", sqlDriver),
        ("SERVER", sqlServer),
        ("DATABASE", sqlDatabase),
        ("UID", sqlUID),
        ("PWD", sqlPWD),
    )
    # Entries are separated by "; " and the string ends with a single ";".
    connectionString = "; ".join(key + "=" + value for key, value in settings) + ";"
    return db.connect(connectionString)
def sql_update_builder(dbField: str, dbValue: str, dbKey: str) -> str:
    """Build the SQL command that sets one field of one record in tbl_Main.

    Args:
        dbField: name of the column to update.
        dbValue: new value; it is written as a quoted string literal.
        dbKey: value of MyKey identifying the record; inserted unquoted.

    Returns:
        The UPDATE statement as a string.
    """
    # Escape the delimiters so a "]" in the column name or a "'" in the value
    # can neither break the statement nor inject additional SQL.
    field = str(dbField).replace("]", "]]")
    value = str(dbValue).replace("'", "''")
    # NOTE(review): dbKey is inserted verbatim because the statement treats it
    # as an unquoted literal; callers must only pass trusted keys. A
    # parameterized query would be the proper fix but changes the interface.
    return "UPDATE [tbl_Main] SET [{0}]='{1}' WHERE ((([tbl_Main].MyKey)={2}));".format(
        field, value, dbKey
    )
def sql_insert_builder(dbKey: str) -> str:
    """Build the SQL command that creates a new record in tbl_Main.

    Args:
        dbKey: value for the MyKey column of the new record; inserted
            unquoted. Non-string keys (for example an int taken straight from
            a dataframe) are converted with ``str``.

    Returns:
        The INSERT statement as a string.
    """
    # NOTE(review): the key is inserted verbatim, so only trusted keys may be
    # passed; a parameterized query would be the proper fix but changes the
    # interface.
    return "INSERT INTO [tbl_Main] ([MyKey])VALUES ({0})".format(dbKey)
def append_filename_to_fileNameFile(xlFilename):
    """Record ``xlFilename`` in the list of processed files.

    The argument may be of any type; its string form (the full path) is
    appended as one line to filesDone.txt in the done directory.
    Returns nothing.
    """
    doneListPath = Path(pathDone, "filesDone.txt")
    with doneListPath.open("a") as doneList:
        entry = str(xlFilename) + "\n"
        doneList.write(entry)
###################################################################################
###################################################################################
# main loop
def _read_done_files():
    """Step 1: return the set of paths listed in filesDone.txt (one per line)."""
    print(line+"reading filesDone.txt in "+pathDone)
    with open(Path(pathDone, "filesDone.txt"), "r") as filesDoneFile:
        doneFiles = {Path(entry.replace("\n", "")) for entry in filesDoneFile}
    # Only announce entries when there really are some; the file object itself
    # is always truthy, so testing it says nothing about its content.
    if doneFiles:
        print("file contains entries")
    return doneFiles


def _find_unprocessed_files(doneFiles):
    """Steps 2 and 3: return the .xlsm files on disk that are not in ``doneFiles``."""
    print(line+"trying to get list of files in filesystem...")
    foundFiles = {
        Path(name)
        for name in get_list_of_files_by_extension(path=pathTodo, extension=".xlsm")
    }
    print(line+"trying to compare done files and files in filesystem...")
    return foundFiles.difference(doneFiles)


def _finalized_state(filename):
    """Step 4.1: read the "finalized" flag of a workbook.

    Returns None when the workbook has no "finalized" sheet (old files),
    otherwise False when the first row's "finalized" value equals False and
    True in every other case.
    """
    print("{0}trying to read finalized state ... of {1}".format(line, filename))
    filenameClean = str(filename).replace("\n","")
    # The context manager closes the workbook handle again.
    with pd.ExcelFile(filenameClean) as xlFile:
        if "finalized" not in xlFile.sheet_names:
            return None
        dataframe = xlFile.parse("finalized")
    finalized = dataframe.iloc[0]["finalized"]
    print("finalized state ="+str(finalized))
    # Deliberately "== False" rather than "not ...": the cell may hold a numpy
    # bool or NaN, and only an explicit False marks the file as unfinished.
    return not (finalized == False)  # noqa: E712


def _copy_rows_to_database(dataframe, sql_connection, stuff):
    """Step 4.3: insert one record per dataframe row, then set its fields one by one.

    ``stuff`` is the cursor used to execute the statements; every statement is
    committed on ``sql_connection`` right after it ran. Failures are logged
    and do not stop the remaining rows or fields.
    """
    headers = list(dataframe)  # column names; column 0 holds the key
    for record in dataframe.values.tolist():
        dbKey = str(record[0])
        sqlCommandString = sql_insert_builder(dbKey=dbKey)
        # 4.3.1 first try to create (insert) the new record ...
        try:
            print("{0}Trying insert new record with the id {1}".format(line, dbKey))
            stuff.execute(sqlCommandString)
            sql_connection.commit()
            print(sqlCommandString)
        except Exception as err:
            sql_log_string = " ".join(sqlCommandString.split())  # collapse whitespace
            write_error_to_log(description="Failed to create new record in DB:", errorString=str(err), optDetails=sql_log_string)
            continue
        # 4.3.2 ... then add the values one by one (skipping the key column).
        print("{0}Trying to add values to record with the ID {1}".format(line, dbKey))
        for header, rawValue in zip(headers[1:], record[1:]):
            dbField = str(header)
            # Quotes are stripped to prevent trouble with the sql command.
            dbValue = str(rawValue).replace("\"","").replace("\'","")
            sqlCommandString = sql_update_builder(dbField, dbValue, dbKey)
            try:
                stuff.execute(sqlCommandString)
                sql_connection.commit()
                print(sqlCommandString)
            except Exception as err:
                sql_log_string = " ".join(sqlCommandString.split())  # collapse whitespace
                write_error_to_log(description="Failed to add values in DB:", errorString=str(err), optDetails=sql_log_string)


def _import_file(filename):
    """Step 4: copy one workbook into the database and record it as done.

    Workbooks without a "finalized" sheet are only recorded as done; workbooks
    whose flag is False are left alone so a later cycle picks them up again.
    Any failure is logged and ends the handling of this file.
    """
    # 4.1 look whether the file is marked as finalized
    try:
        finalized = _finalized_state(filename)
    except Exception as err:
        errorDescription = "failed to read finalized-state from {0} to dataframe".format(filename)
        write_error_to_log(description=errorDescription, errorString=str(err))
        return
    if finalized is None:
        append_filename_to_fileNameFile(filename)
        return
    if not finalized:
        return
    # 4.2 read values to dataframe
    try:
        dataframe = pd.read_excel(Path(filename), sheet_name=4)
    except Exception as err:
        errorDescription = "Failed to read values from {0} to dataframe".format(filename)
        write_error_to_log(description=errorDescription, errorString=str(err))
        return
    # 4.2 open connection to database
    print("{0}Trying to open connection to database {1} on {2}".format(line, sqlDatabase, sqlServer))
    sql_connection = None
    try:
        sql_connection = sql_connector()
        stuff = sql_connection.cursor()
    except Exception as err:
        write_error_to_log(description="Failed to open connection:", errorString=str(err))
        if sql_connection is not None:
            sql_connection.close()
        return
    # 4.3 write to database; always release the connection afterwards so the
    # long-running service does not pile up one open connection per file.
    try:
        _copy_rows_to_database(dataframe, sql_connection, stuff)
    finally:
        sql_connection.close()
    append_filename_to_fileNameFile(filename)
    print(line)


def _run_cycle():
    """Run steps 1-4 once; a failure in step 1 or 2 is logged and ends the cycle."""
    try:
        doneFiles = _read_done_files()
    except Exception as err:
        errorDescription = "failed to read filesDone.txt from {0}".format(pathDone)
        write_error_to_log(description=errorDescription, errorString=str(err))
        return
    try:
        pendingFiles = _find_unprocessed_files(doneFiles)
    except Exception as err:
        errorDescription = "failed to read file system "
        write_error_to_log(description=errorDescription, errorString=str(err))
        return
    for filename in pendingFiles:
        _import_file(filename)


def _wait_for_next_cycle():
    """Count down ``tick_time_seconds`` on the console, one second per step."""
    for remaining in range(tick_time_seconds, 0, -1):
        sys.stdout.write("\r" + str(remaining))
        sys.stdout.flush()
        time.sleep(1)
    sys.stdout.flush()
    print(line)


while __name__ == "__main__":
    _run_cycle()
    # Always wait, also after a failed cycle: retrying immediately would spin
    # at full speed and flood the error log (e.g. while filesDone.txt is missing).
    _wait_for_next_cycle()
    #break # this is for debugging
推荐阅读
- java - 调用 MediaStore 时,某些媒体提供程序会阻止 UI
- javascript - 从选择框的值重定向到外部 url 的问题
- blazor-server-side - 如何在 Blazor 中访问客户端的 IP 地址
- django - 在 django 中使用正确的 pk 值重定向时出现问题
- php - 如何通过ajax设置相同输入字段但在不同行中的值
- postgresql - Spring Boot Hibernate Update 语句需要 7 秒来持久化 10kb 实体
- reactjs - React Admin:我想路由到从列表中查看
- python - 从python中二维数组的每一行中随机选择N个元素
- javascript - 使用 firebase 函数的共享模块和打字稿别名
- azure - 在非交互式环境中验证和使用 Firebase