首页 > 解决方案 > JAVA从html脚本中的变量获取Json对象

问题描述

我有一个演示 html,如下所示:

<!DOCTYPE html>
<!-- Demo page: the inline script below publishes its data on a global SG object. -->
<title>Test</title>
<script>
    // Immediately-invoked function; `root` is the `this` value passed at the call site below.
    (function (root) {
        /* -- Data -- */
        // Create root.SG only if it does not exist yet.
        root.SG || (root.SG = {});
        root.SG.query = "Ana";
        root.SG.timePeriodType = "YEAR";
        root.SG.dataType = "SEARCH_ALL";
        // \u002F is the escape for "/", so the value is "/index/searchHeat".
        root.SG.path = "\u002Findex\u002FsearchHeat";
        // pvList holds one inner array of records, each with kwdId, pv, isPeak, date and id.
        root.SG.data = { "pvList": [[{ "kwdId": 0, "pv": 0, "isPeak": 0, "date": 20200517, "id": 0 }, { "kwdId": 27961, "pv": 150506, "isPeak": 0, "date": 20200518, "id": 13625908607 }, { "kwdId": 27961, "pv": 142961, "isPeak": 0, "date": 20200519, "id": 13625908608 }, { "kwdId": 27961, "pv": 170154, "isPeak": 0, "date": 20200520, "id": 13715544690 }, { "kwdId": 27961, "pv": 160490, "isPeak": 0, "date": 20200521, "id": 13715544691 }]] }
    })(this)</script>

<body>Hello
</body>

</html>

我想使用 Java 来获取 root.SG.data 中的对象。

这是我当前的测试代码,但正则表达式结果为空,找不到root.SG.data

// Fetch the page with Jsoup and take the contents of the first <script> element.
// NOTE(review): selectFirst returns only the first match; confirm that this is the
// script that assigns root.SG.data.
String url = "http://zhishu.sogou.com/index/searchHeat?kwdNamesStr=%E5%AE%A0%E7%89%A9&timePeriodType=YEAR&dataType=SEARCH_ALL&queryType=INPUT";
        Document doc = Jsoup.connect(url).get();
        String script = doc.selectFirst("script").html();
        //System.out.println(script);
        // The dots in the pattern are unescaped, so each one matches any character.
        // Without DOTALL, (.*) stops at the end of the line.
        Pattern p = Pattern.compile("root.SG.data = (.*)", Pattern.MULTILINE);
        Matcher m = p.matcher(script);
        // NOTE: group(1) is called without a preceding find()/matches(); Matcher.group
        // throws IllegalStateException when no match has been attempted.
        String wholedata = m.group(1);

标签: javascript java

解决方案


JavaFX 的 WebView 可以做到这一点。我使用 JavaFX 13 在 Java 15 上测试了以下内容:

import javafx.application.Application;
import javafx.concurrent.Worker.State;
import javafx.scene.Scene;
import javafx.scene.layout.BorderPane;
import javafx.scene.web.WebEngine;
import javafx.scene.web.WebView;
import javafx.stage.Stage;
import netscape.javascript.JSObject;

/**
 * JavaFX App
 */
/**
 * Minimal JavaFX application that loads {@code jsonvalue.html} into a {@link WebView} and,
 * once the document has finished loading, prints the page's {@code window.SG} object to
 * standard output: first as a {@link JSObject}, then as pretty-printed JSON.
 *
 * <p>{@code jsonvalue.html} is looked up with {@link Class#getResource(String)}, so it must be
 * available on the classpath relative to this class.
 */
public class App extends Application {

  private WebView webView;

  /**
   * Builds a scene containing a single {@link WebView} and shows the stage. Page loading is
   * deferred until the stage has actually been shown.
   *
   * @param stage the primary stage supplied by the JavaFX runtime
   */
  @Override
  public void start(Stage stage) {
    //Insert a WebView into scene
    this.webView = new WebView();
    final Scene scene = new Scene(new BorderPane(webView), 640, 480);

    stage.setScene(scene);
    stage.setOnShown(event -> stageReady()); //Continue when stage is rendered
    stage.show();
  }

  /**
   * Configures the web engine, registers a load-state listener and navigates to
   * {@code jsonvalue.html}.
   *
   * @throws IllegalStateException if {@code jsonvalue.html} cannot be found on the classpath
   */
  private void stageReady() {
    //Resolve the document first so a missing file fails with a clear message instead of an NPE
    final java.net.URL page = getClass().getResource("jsonvalue.html");
    if (page == null) {
      throw new IllegalStateException(
          "jsonvalue.html not found on the classpath next to " + getClass().getName());
    }

    //Prepare web engine
    final WebEngine web = webView.getEngine();
    web.setJavaScriptEnabled(true);
    web.getLoadWorker().stateProperty().addListener(
        (ov, oldState, newState) -> {
          if (newState == State.SUCCEEDED) { //Document is loaded
            //Maybe the operation should be postponed a little to leave time to Javascript to execute
            //It worked as is in the original author's tests though
            printSearchData(web);
          } else if (newState == State.FAILED) {
            //Report load failures instead of leaving an empty window with no explanation
            System.err.println(
                "Failed to load " + web.getLocation() + ": " + web.getLoadWorker().getException());
          }
        });

    //Navigate to document
    web.load(page.toExternalForm());
  }

  /**
   * Prints {@code window.SG} of the currently loaded page, both as a {@link JSObject} and as
   * a JSON string. If {@code window.SG} is not a JavaScript object, a diagnostic is written to
   * standard error and nothing else is printed.
   *
   * @param web the engine whose document has finished loading
   */
  private static void printSearchData(WebEngine web) {
    //executeScript returns the String "undefined" for a JavaScript undefined value, so a
    //blind cast to JSObject would throw ClassCastException when SG is missing
    final Object sg = web.executeScript("window.SG");
    if (!(sg instanceof JSObject)) {
      System.err.println("window.SG is not a JavaScript object: " + sg);
      return;
    }

    //Either get SG as an object, then use getMember/getSlot to read properties
    System.out.println(sg);
    //Or get it as a JSON String
    System.out.println(web.executeScript("JSON.stringify(window.SG, null, 2)"));
  }

  /**
   * Launches the JavaFX application.
   *
   * @param args command-line arguments (not used)
   */
  public static void main(String[] args) {
    launch();
  }

}

假设更正后的 HTML 保存为 jsonvalue.html,并与 App.java 位于同一目录中(代码通过 getClass().getResource 加载它,因此运行时该文件需要与编译后的 App 类位于类路径上的同一位置):

<!DOCTYPE html>
<html>
<head>
    <title>Test</title>
    <!-- Inline script that publishes the page data on a global SG object. -->
    <script>
        // Immediately-invoked function; `root` is the `this` value passed at the call site
        // below, which at the top level of a classic script is the global object (window).
        (function (root) {
            /* -- Data -- */
            // Create root.SG only if it does not exist yet.
            root.SG || (root.SG = {});
            root.SG.query = "Ana";
            root.SG.timePeriodType = "YEAR";
            root.SG.dataType = "SEARCH_ALL";
            // \u002F is the escape for "/", so the value is "/index/searchHeat".
            root.SG.path = "\u002Findex\u002FsearchHeat";
            // pvList holds one inner array of records, each with kwdId, pv, isPeak, date and id.
            root.SG.data = { "pvList": [[{ "kwdId": 0, "pv": 0, "isPeak": 0, "date": 20200517, "id": 0 }, { "kwdId": 27961, "pv": 150506, "isPeak": 0, "date": 20200518, "id": 13625908607 }, { "kwdId": 27961, "pv": 142961, "isPeak": 0, "date": 20200519, "id": 13625908608 }, { "kwdId": 27961, "pv": 170154, "isPeak": 0, "date": 20200520, "id": 13715544690 }, { "kwdId": 27961, "pv": 160490, "isPeak": 0, "date": 20200521, "id": 13715544691 }]] }
        })(this);
    </script>
</head>

<body>
    Hello
</body>
</html>

推荐阅读