首页 > 解决方案 > 如何矢量化以下函数

问题描述

def get_project_details(pids: list, conn: pymssql._pymssql.Connection) -> dict:
    """Fetch the given projects and build one nested summary dict per row.

    Args:
        pids: Project ids to look up in VW_CSMS_PMT_ProjectDetails.
        conn: Open database connection handed to ``pd.read_sql`` and to the
            per-project helpers.

    Returns:
        A list with one dict per matching row (keys: id, type, name,
        start_date, target_date, estimate, status, timesheet_details,
        main_task), or ``None`` when ``pids`` is empty or the row-wise apply
        does not produce a Series.
        NOTE(review): the ``-> dict`` annotation is kept for interface
        stability but does not match the list/None actually returned.
    """
    # The ids are bound as strings because the previous query compared
    # Project_Id against quoted string literals.
    ids = tuple(str(pid) for pid in pids)
    if not ids:
        # An empty id list would render "IN ()", which the database rejects;
        # report "nothing found" the same way the no-Series case does.
        return None

    # Bind the ids as query parameters instead of splicing them into the SQL
    # text, so quotes inside an id cannot break or alter the statement.
    placeholders = ",".join(["%s"] * len(ids))
    query = (
        "SELECT * FROM VW_CSMS_PMT_ProjectDetails "
        f"WHERE Project_Id in ({placeholders})"
    )
    df = pd.read_sql(
        query,
        con=conn,
        params=ids,
        parse_dates=["StartDate", "TargetDate"],
    )

    # NOTE(review): each row triggers tdp() and get_main_task(), and
    # get_main_task() runs its own pd.read_sql per call. Those per-row round
    # trips, not the apply itself, are the likely cost here; batching them
    # would need changes to the helpers' interfaces.
    out = df.apply(
        lambda x: {
            "id": x["Project_Id"],
            "type": "project",
            "name": x["Project_Name"],
            "start_date": get_date_string(x["StartDate"]),
            "target_date": get_date_string(x["TargetDate"]),
            "estimate": x["EST_HRS"],
            "status": x["status"],
            "timesheet_details": tdp(x["Project_Id"], conn),
            "main_task": get_main_task(x["Project_Id"], conn),
        },
        axis=1,
    )

    # A Series of dicts is the normal result; any other result type is
    # reported as None.
    if isinstance(out, pd.Series):
        return out.tolist()
    return None

我想让这段代码运行得更快,因为它处理起来非常耗时。有人建议我使用矢量化操作来提速,请问我应该如何把它应用到这段代码上?

输出示例(抱歉全部挤在一行,格式为 JSON):

[{"estimate": "0.00", "id": "AAM31040", "main_task": [{"estimate": "200.00", "id": "CDM01", "name": "CDM01", "start_date": "28-Oct-2020", "status": "Open", "sub_task": [{"estimate": "200.00", "id": "BDM01", "name": "BDM01", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 16.0, "actual_start_date": "06-Jan-2021", "actual_target_date": "12-Jan-2021", "approved_hours": 16.0}, "type": "subtask"}, {"estimate": "200.00", "id": "BDM02", "name": "BDM02", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 142.0, "actual_start_date": "05-Jan-2021", "actual_target_date": "07-May-2021", "approved_hours": 136.5}, "type": "subtask"}, {"estimate": "200.00", "id": "BDM03", "name": "BDM03", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 557.5, "actual_start_date": "04-Jan-2021", "actual_target_date": "06-May-2021", "approved_hours": 541.5}, "type": "subtask"}, {"estimate": "200.00", "id": "BDM04", "name": "BDM04", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 20.5, "actual_start_date": "05-Jan-2021", "actual_target_date": "09-Mar-2021", "approved_hours": 20.5}, "type": "subtask"}, {"estimate": "200.00", "id": "BDM05", "name": "BDM05", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 22.0, "actual_start_date": "04-Jan-2021", "actual_target_date": "06-Apr-2021", "approved_hours": 21.0}, "type": "subtask"}, {"estimate": "200.00", "id": "BDM06", "name": "BDM06", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 13.5, "actual_start_date": "07-Jan-2021", "actual_target_date": "09-Feb-2021", "approved_hours": 13.5}, "type": "subtask"}, {"estimate": "200.00", "id": "BDM07", "name": "BDM07", 
"start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 6.0, "actual_start_date": "11-Jan-2021", "actual_target_date": "09-Feb-2021", "approved_hours": 6.0}, "type": "subtask"}, {"estimate": "200.00", "id": "BDM08", "name": "BDM08", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 1.0, "actual_start_date": "20-Jan-2021", "actual_target_date": "20-Jan-2021", "approved_hours": 1.0}, "type": "subtask"}, {"estimate": "200.00", "id": "BDM09", "name": "BDM09", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 15.5, "actual_start_date": "06-Jan-2021", "actual_target_date": "08-Feb-2021", "approved_hours": 15.5}, "type": "subtask"}, {"estimate": "200.00", "id": "CDM", "name": "CDM", "start_date": "28-Oct-2020", "status": "Open", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 4.0, "actual_start_date": "12-Jan-2021", "actual_target_date": "29-Mar-2021", "approved_hours": 4.0}, "type": "subtask"}], "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 798.0, "actual_start_date": "04-Jan-2021", "actual_target_date": "07-May-2021", "approved_hours": 775.5}, "type": "maintask"}], "name": "Digital Twin Benchmarking Data_EV and ICE Vehicles", "start_date": "28-Oct-2020", "status": "Active", "target_date": "30-Jun-2021", "timesheet_details": {"actual_hours": 798.0, "actual_start_date": "04-Jan-2021", "actual_target_date": "07-May-2021", "approved_hours": 775.5}, "type": "project"}]

主要任务:

def get_main_task(id: str, conn: pymssql._pymssql.Connection) -> dict:
    """Fetch the 'Main Task' rows of one project and build a dict per task.

    Args:
        id: Project id matched against ProjectID in VW_CSMS_PMT_TaskDetails.
        conn: Open database connection handed to ``pd.read_sql`` and to the
            per-task helpers.

    Returns:
        A list with one dict per main task (keys: id, type, name, start_date,
        target_date, estimate, status, timesheet_details, sub_task), or
        ``None`` when the row-wise apply does not produce a Series.
        NOTE(review): the ``-> dict`` annotation is kept for interface
        stability but does not match the list/None actually returned.
    """
    # Bind the project id as a query parameter instead of formatting it into
    # the SQL text, so quotes inside the id cannot break or alter the
    # statement. It is bound as a string because the previous query compared
    # ProjectID against a quoted literal.
    query = (
        "SELECT * FROM VW_CSMS_PMT_TaskDetails "
        "WHERE ProjectID = %s AND TaskType = 'Main Task'"
    )
    df = pd.read_sql(
        query,
        con=conn,
        params=(str(id),),
        parse_dates=["StartDate", "EndDate"],
    )

    out = df.apply(
        lambda x: {
            "id": x["TaskCode"],
            "type": "maintask",
            # TaskCode is used for both "id" and "name".
            "name": x["TaskCode"],
            "start_date": get_date_string(x["StartDate"]),
            "target_date": get_date_string(x["EndDate"]),
            "estimate": x["EstHours"],
            "status": x["Status"],
            "timesheet_details": tdmt(id, x["TaskCode"], conn),
            "sub_task": get_sub_task(id, x["TaskCode"], conn),
        },
        axis=1,
    )

    # A Series of dicts is the normal result; any other result type is
    # reported as None.
    if isinstance(out, pd.Series):
        return out.tolist()
    return None

标签: python pandas

解决方案


推荐阅读