java - 使用 Java 将大型 JSON 文件拆分为较小的 JSON 文件
问题描述
我有一个 JSON 格式的大型数据集,为了便于使用,我想将其拆分为多个 JSON 文件,同时仍保持原有结构。例如:
{"users": [
{
"userId": 1,
"firstName": "Krish",
"lastName": "Lee",
"phoneNumber": "123456",
"emailAddress": "krish.lee@learningcontainer.com"
},
{
"userId": 2,
"firstName": "racks",
"lastName": "jacson",
"phoneNumber": "123456",
"emailAddress": "racks.jacson@learningcontainer.com"
},
{
"userId": 3,
"firstName": "denial",
"lastName": "roast",
"phoneNumber": "33333333",
"emailAddress": "denial.roast@learningcontainer.com"
},
{
"userId": 4,
"firstName": "devid",
"lastName": "neo",
"phoneNumber": "222222222",
"emailAddress": "devid.neo@learningcontainer.com"
},
{
"userId": 5,
"firstName": "jone",
"lastName": "mac",
"phoneNumber": "111111111",
"emailAddress": "jone.mac@learningcontainer.com"
}
]
}
我希望能够按"每个用户 ID 写入一个单独文件"的方式来拆分它。到目前为止,我尝试过把数据放进 Map 再拆分 Map,也尝试过把它转换为数组再拆分数组,但都没有成功:生成的文件里确实包含用户 ID,但内容不再是 JSON 格式。关于如何在 Java 中实现这一点,有什么建议吗?
预期结果:{"users": [
{
"userId": 1,
"firstName": "Krish",
"lastName": "Lee",
"phoneNumber": "123456",
"emailAddress": "krish.lee@learningcontainer.com"
}
]
}
解决方案
处理大文件时,建议优先使用面向流/事件的解析方式。Gson 和 Jackson 都支持这种方式。下面只是一个使用小型 JSON 解析器 green-jelly(https://github.com/anatolygudkov/green-jelly)的示例:
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
/**
 * Example that splits a JSON document of the form {@code {"users": [ {...}, {...} ]}}
 * into one file per user object using the event-based green-jelly parser.
 *
 * <p>The green-jelly types used here ({@code JsonParser}, {@code JsonParserListenerAdaptor},
 * {@code JsonGenerator}, {@code AppendableWriter}, {@code JsonNumber}) must be on the
 * classpath and imported by the enclosing file.
 */
public class SplitMyJson {
    /** Output directory used when none is passed on the command line. */
    private static final String DEFAULT_OUTPUT_FOLDER = "/home/gudkov/mytest";

    /** Sample input: a root object whose "users" member is an array of user objects. */
    private static final String JSON_TO_SPLIT = "{\"users\": [\n" +
            " {\n" +
            " \"userId\": 1,\n" +
            " \"firstName\": \"Krish\",\n" +
            " \"lastName\": \"Lee\",\n" +
            " \"phoneNumber\": \"123456\",\n" +
            " \"emailAddress\": \"krish.lee@learningcontainer.com\"\n" +
            " },\n" +
            " {\n" +
            " \"userId\": 2,\n" +
            " \"firstName\": \"racks\",\n" +
            " \"lastName\": \"jacson\",\n" +
            " \"phoneNumber\": \"123456\",\n" +
            " \"emailAddress\": \"racks.jacson@learningcontainer.com\"\n" +
            " },\n" +
            " {\n" +
            " \"userId\": 3,\n" +
            " \"firstName\": \"denial\",\n" +
            " \"lastName\": \"roast\",\n" +
            " \"phoneNumber\": \"33333333\",\n" +
            " \"emailAddress\": \"denial.roast@learningcontainer.com\"\n" +
            " },\n" +
            " {\n" +
            " \"userId\": 4,\n" +
            " \"firstName\": \"devid\",\n" +
            " \"lastName\": \"neo\",\n" +
            " \"phoneNumber\": \"222222222\",\n" +
            " \"emailAddress\": \"devid.neo@learningcontainer.com\"\n" +
            " },\n" +
            " {\n" +
            " \"userId\": 5,\n" +
            " \"firstName\": \"jone\",\n" +
            " \"lastName\": \"mac\",\n" +
            " \"phoneNumber\": \"111111111\",\n" +
            " \"emailAddress\": \"jone.mac@learningcontainer.com\"\n" +
            " }\n" +
            " ]\n" +
            "}";

    /**
     * Parses the sample document and writes each user object to its own file.
     *
     * @param args optional; {@code args[0]} is the output directory, defaulting to
     *             {@value #DEFAULT_OUTPUT_FOLDER} when absent
     * @throws UncheckedIOException if the output directory or a user file cannot be written
     */
    public static void main(String[] args) {
        final File outputFolder = new File(args.length > 0 ? args[0] : DEFAULT_OUTPUT_FOLDER);
        final JsonParser parser = new JsonParser();
        parser.setListener(new Splitter(outputFolder));
        // When reading from a file, call parse() repeatedly, chunk by chunk, until EOF ...
        parser.parse(JSON_TO_SPLIT);
        // ... and only then signal the end of the JSON document.
        parser.eoj();
    }

    /**
     * Parser listener that re-emits every object found at object depth 2 (i.e. nested
     * directly below the root object) into its own file named {@code user-<index>.json},
     * with the index starting at 0.
     *
     * <p>Only object nesting depth is tracked: member names and array nesting are not
     * inspected, so any object sitting one level below the root is treated as a user.
     * Events received outside such an object are ignored.
     */
    static class Splitter extends JsonParserListenerAdaptor {
        private final JsonGenerator jsonGenerator = new JsonGenerator();
        private final AppendableWriter<Writer> appendableWriter = new AppendableWriter<>();
        private final File outputFolder;
        /** Number of currently open JSON objects; 1 = root object, 2 = a user object. */
        private int objectDepth;
        /** Index used in the name of the next user file. */
        private int userIndex;

        /**
         * @param outputFolder directory receiving the per-user files; created if missing
         * @throws UncheckedIOException if the directory does not exist and cannot be created
         */
        Splitter(final File outputFolder) {
            this.outputFolder = outputFolder;
            // Fail fast with a clear message instead of failing later on the first file open.
            if (!outputFolder.isDirectory() && !outputFolder.mkdirs() && !outputFolder.isDirectory()) {
                throw new UncheckedIOException(
                        new IOException("Cannot create output folder: " + outputFolder));
            }
            jsonGenerator.setOutput(appendableWriter);
        }

        /** True right after the depth counter was raised for a user object. */
        private boolean userJustStarted() {
            return objectDepth == 2;
        }

        /** True right after the depth counter was lowered for a finished user object. */
        private boolean userJustEnded() {
            return objectDepth == 1;
        }

        /** True while the parser is positioned outside of any user object. */
        private boolean notInUser() {
            return objectDepth < 2;
        }

        @Override
        public boolean onObjectStarted() {
            objectDepth++;
            if (notInUser()) {
                return true; // root object: nothing to emit
            }
            if (userJustStarted()) {
                final File userFile = new File(outputFolder, "user-" + userIndex + ".json");
                try {
                    // Explicit UTF-8: FileWriter would use the platform default charset.
                    appendableWriter.set(
                            Files.newBufferedWriter(userFile.toPath(), StandardCharsets.UTF_8));
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
                userIndex++;
            }
            jsonGenerator.startObject();
            return true;
        }

        @Override
        public boolean onObjectEnded() {
            final boolean outsideUser = notInUser(); // evaluated before the depth changes
            objectDepth--;
            if (outsideUser) {
                return true;
            }
            jsonGenerator.endObject();
            if (userJustEnded()) {
                // try-with-resources closes the user's file even if eoj() throws.
                try (Writer userFile = appendableWriter.output()) {
                    jsonGenerator.eoj();
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
            return true;
        }

        @Override
        public boolean onArrayStarted() {
            if (notInUser()) {
                return true;
            }
            jsonGenerator.startArray();
            return true;
        }

        @Override
        public boolean onArrayEnded() {
            if (notInUser()) {
                return true;
            }
            jsonGenerator.endArray();
            return true;
        }

        @Override
        public boolean onObjectMember(final CharSequence name) {
            if (notInUser()) {
                return true;
            }
            jsonGenerator.objectMember(name);
            return true;
        }

        @Override
        public boolean onStringValue(final CharSequence data) {
            if (notInUser()) {
                return true;
            }
            // NOTE(review): the meaning of the 'true' flag is not visible here;
            // presumably it enables escaping — confirm against the green-jelly docs.
            jsonGenerator.stringValue(data, true);
            return true;
        }

        @Override
        public boolean onNumberValue(final JsonNumber number) {
            if (notInUser()) {
                return true;
            }
            jsonGenerator.numberValue(number);
            return true;
        }

        @Override
        public boolean onTrueValue() {
            if (notInUser()) {
                return true;
            }
            jsonGenerator.trueValue();
            return true;
        }

        @Override
        public boolean onFalseValue() {
            if (notInUser()) {
                return true;
            }
            jsonGenerator.falseValue();
            return true;
        }

        @Override
        public boolean onNullValue() {
            if (notInUser()) {
                return true;
            }
            jsonGenerator.nullValue();
            return true;
        }
    }
}
通过这种方式,您可以轻松地对非常大的文件实现过滤、聚合等操作,并且在纯 Java 中获得很高的性能。
推荐阅读
- javascript - 有条件地公开访问 Firestore 集合文档
- position - 当我修复导航栏时内容消失
- javascript - 使用单选按钮禁用日期 (JQUERY)
- symfony - Symfony - 创建 ParamConverter - 无法注册转换器名称
- javascript - 选择文件时不会触发文件输入 onchange 功能
- python - 如何将行转换为熊猫数据框中的列
- angular - Ionic 5 中的 MatStepperModule 无法获取
- gatsby - Gatsby - DOC 网站的最小源代码
- javascript - 如何使用 javascript(不是 jQuery)将复选框的值传递给数组?
- docker - 是否需要将 bash cmd 添加到 dockerfile?