首页 > 解决方案 > 如何将具有重复键的 JSON 文件导入数据框?

问题描述

例如:我有一个程序可以在 JSON 文件中生成这样的使用日志。JSON 文件日志包含许多称为“活动”的相同键,如下所示:

  "probe": "PROCESS_PROBE",
  "status": "ProcessCreated",
  "processName": "backgroundTaskHost.exe",
  "path": "C:\\WINDOWS\\system32\\backgroundTaskHost.exe",
  "creationClassName": "Win32_Process",
  "handle": "21632",
  "priority": "Normal",
  "commandLine": "\"C:\\WINDOWS\\system32\\backgroundTaskHost.exe\" -ServerName:CortanaUI.AppXy7vb4pc2dr3kc93kfc509b1d0arkfb2x.mca",
  "handleCount": 236,
  "processId": 21632,
  "parentProcessId": 112,
  "pageFileUsage": 4244,
  "creationDate": "20200410172922.614702+120",
  "annotations": {
    "userName": "datta",
    "timeSinceStartup": 259878750,
    "ticksOfEvent": 637221365629757593
  }
},
"activity":{
  "probe": "PROCESS_PROBE",
  "status": "ProcessDeleted",
  "processName": "RuntimeBroker.exe",
  "path": "C:\\Windows\\System32\\RuntimeBroker.exe",
  "creationClassName": "Win32_Process",
  "handle": "8504",
  "priority": "Normal",
  "handleCount": 285,
  "processId": 8504,
  "parentProcessId": 112,
  "pageFileUsage": 3180,
  "creationDate": "20200410172757.934567+120",
  "terminationDate": null,
  "annotations": {
    "userName": "datta",
    "timeSinceStartup": 259883953,
    "ticksOfEvent": 637221365681937472
  }
},
"activity":{
  "probe": "FILERESOURCE_PROBE",
  "status": "Changed",
  "path": "C:\\Users\\datta\\eclipse\\jee-2019-12",
  "entityName": "eclipse",
  "extension": "",
  "attributes": "Directory",
  "owner": "null",
  "length": 0,
  "isReadOnly": false,
  "creationTime": "2020-01-17T09:42:08.5092897+01:00",
  "lastWriteTime": "2020-03-25T10:56:10.7382329+01:00",
  "lastAccessTime": "2020-04-10T17:29:29.9811767+02:00",
  "annotations": {
    "userName": "datta",
    "timeSinceStartup": 259885750,
    "ticksOfEvent": 637221365699837331
  }
},
"activity":{
  "probe": "FILERESOURCE_PROBE",
  "status": "Changed",
  "path": "C:\\Users\\datta\\eclipse",
  "entityName": "jee-2019-12",
  "extension": "",
  "attributes": "Directory",
  "owner": "null",
  "length": 0,
  "isReadOnly": false,
  "creationTime": "2020-01-17T09:42:08.5083+01:00",
  "lastWriteTime": "2020-01-17T09:42:08.5092897+01:00",
  "lastAccessTime": "2020-04-10T17:29:29.9801436+02:00",
  "annotations": {
    "userName": "datta",
    "timeSinceStartup": 259885750,
    "ticksOfEvent": 637221365699906960
  }
},
"activity":{
  "probe": "FILERESOURCE_PROBE",
  "status": "Changed",
  "path": "C:\\Users\\datta",
  "entityName": "eclipse",
  "extension": "",
  "attributes": "Directory",
  "owner": "null",
  "length": 0,
  "isReadOnly": false,
  "creationTime": "2020-01-17T09:42:08.5083+01:00",
  "lastWriteTime": "2020-01-17T09:42:08.5083+01:00",
  "lastAccessTime": "2020-04-10T17:29:29.9922013+02:00",
  "annotations": {
    "userName": "datta",
    "timeSinceStartup": 259885765,
    "ticksOfEvent": 637221365699922013
  }
}
}

我想将活动键中的数据作为数据框的列加载。例如,每个活动都是数据框中的一行,列是“probe”、“status”、“processName”等。

问题是当我使用加载数据时logData = json.load(logfile),它只加载最后一个活动密钥,因为它因为重复而被覆盖。我尝试使用logData = json.load(logfile, object_pairs_hook=tuple)它加载数据作为一个巨大的元组加载数据。我不确定如何实现我试图获得的数据框。提前致谢。

标签: pythonjsonpandasdataframetuples

解决方案


请参阅JSON 语法是否允许对象中有重复键?

这里的问题不在于 JSON,而在于您使用的目标结构。Pythons 的json模块定义了将 JSON 对象导入字典,因此无法处理重复的属性(键)。

这里真正的问题在于这个 JSON 的生产者。制作一个记录列表,甚至是一个字典列表("activity"这是每个列表中的唯一键)都非常容易。由于他们自己的原因,生产者选择创建这种结构,它(几乎)合法,但大多数 JSON 消费者无法处理(我知道 Python、PHP,最重要的是 JavaScript,都会偶然发现这一点)。

假设生成您尝试读取的文件的程序没有通过 JSON 包生成它(至少不是整个文件)也是相当安全的。它可能会生成文本块并将它们附加到流中。


推荐阅读