首页 > 解决方案 > 处理嵌入在字符串中的换行符

问题描述

我正在处理以 jsonl 形式获取的 twitter 数据。我已将其转换为 json 并尝试将其转换为 csv(导入到接受 csv 或 MySQL 的程序中)。然而,有些人在他们的推文或简历中加入了强制换行符。这导致 csv 文件有多行条目,通常在推文中间中断。我已经尝试了一些漂浮在 github 上的 python json 到 csv 代码。

我尝试的最新尝试:

jq -s "." tiny00subset.jsonl > tiny00subset.json
json2csv -i tiny00subset.json -o tiny00subset.csv

部分示例推文(json 格式):

  {
"created_at": "Mon Aug 13 10:40:34 +0000 2018",
"id": 1028954459110555600,
"id_str": "1028954459110555649",
"full_text": "Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.\nhttps://REPLACED/DuBGmHCnG8\n#climatechange https://REPLACED/d5IBchM3Uk",
"truncated": false,
"display_text_range": [
  0,
  131
],
"entities": {
  "hashtags": [
    {
      "text": "climatechange",
      "indices": [
        117,
        131
      ]
    }
  ],
  "symbols": [],
  "user_mentions": [],
  "urls": [
    {
      "url": "https://REPLACED/DuBGmHCnG8",
      "expanded_url": "https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/",
      "display_url": "tamino.wordpress.com/2018/08/08/usa…",
      "indices": [
        93,
        116
      ]
    },
    {
      "url": "https://REPLACED/d5IBchM3Uk",
      "expanded_url": "https://twitter.com/Tony__Heller/status/1028672939753758720",
      "display_url": "twitter.com/Tony__Heller/s…",
      "indices": [
        132,
        155
      ]
    }
  ]
},

}

CSV 输出:

    "Mon Aug 13 10:40:34 +0000 2018",1028954459110555600,"1028954459110555649","Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.
    https://REPLACED/DuBGmHCnG8
    #climatechange https://REPLACED/d5IBchM3Uk",false,"[0,131]","{""hashtags"":[{""text"":""climatechange"",""indices"":[117,131]}],""symbols"":[],""user_mentions"":[],""urls"":[{""url"":""https://REPLACED/DuBGmHCnG8"",""expanded_url"":""https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/"",""display_url"":""tamino.wordpress.com/2018/08/08/usa…"",""indices"":[93,116]},{""url"":""https://REPLACED/d5IBchM3Uk"",""expanded_url"":""https://twitter.com/Tony__Heller/status/1028672939753758720"",""display_url"":""twitter.com/Tony__Heller/s…"",""indices"":[132,155]}]}","<a href=""https://about.twitter.com/products/tweetdeck"" rel=""nofollow"">TweetDeck</a>",,,,,,"{""id"":59806323,""id_str"":""59806323"",""name"":""Daniel"",""screen_name"":""sleeksorrow"",""location"":""Karlsruhe, Germany"",""description"":""Politik, IT, Blödsinn und deren Schnittmenge. Ebenfalls: Hochmittelalter Darstellung, Falknerei, Greifvogelschutz - profile picture by @herrkausk"",""url"":""https://REPLACED/E8aNHIhCtg"",""entities"":{""url"":{""urls"":[{""url"":""https://REPLACED/E8aNHIhCtg"",""expanded_url"":""http://sleeksorrow.blogspot.com/"",""display_url"":""sleeksorrow.blogspot.com"",""indices"":[0,23]}]},""description"":{""urls"":[]}},""protected"":false,""followers_count"":572,""friends_count"":392,""listed_count"":47,""created_at"":""Fri Jul 24 15:15:25 +0000 2009"",""favourites_count"":13259,""utc_offset"":null,""time_zone"":null,""geo_enabled"":false,""verified"":false,""statuses_count"":48861,""lang"":null,""contributors_enabled"":false,""is_translator"":false,""is_translation_enabled"":false,""profile_background_color"":""1A1B1F"",""profile_background_image_url"":""http://abs.twimg.com/images/themes/theme9/bg.gif"",""profile_background_image_url_https"":""https://abs.twimg.com/images/themes/theme9/bg.gif"",""profile_background_tile"":false,""profile_image_url"":""http://pbs.twimg.com/profile_images/877219681513480192/1rj4xqpK_normal.jpg"",""profile_image_url_https"":""https://pbs.twimg.com/profile_images/877219681513480192/1rj4xqpK_normal.jpg"",""profile_banner_url"":""https://pbs.twimg.com/profile_banners/59806323/1397029131"",""profile_image_extensions_alt_text"":null,""profile_banner_extensions_alt_text"":null,""profile_link_color"":""2FC2EF"",""profile_sidebar_border_color"":""181A1E"",""profile_sidebar_fill_color"":""252429"",""profile_text_color"":""666666"",""profile_use_background_image"":true,""has_extended_profile"":false,""default_profile"":false,""default_profile_image"":false,""can_media_tag"":true,""followed_by"":false,""following"":false,""follow_request_sent"":false,""notifications"":false,""translator_type"":""none""}",,,,,true,1028672939753758700,"1028672939753758720","{""url"":""https://REPLACED/d5IBchM3Uk"",""expanded"":""https://twitter.com/Tony__Heller/status/1028672939753758720"",""display"":""twitter.com/Tony__Heller/s…""}","{""created_at"":""Sun Aug 12 16:01:55 +0000 2018"",""id"":1028672939753758700,""id_str"":""1028672939753758720"",""full_text"":""@DeanFieldingF1 It is very difficult or impossible for climate alarmists to deal with reality. https://REPLACED/wOJTptxIqH"",""truncated"":false,""display_text_range"":[16,94],""entities"":{""hashtags"":[],""symbols"":[],""user_mentions"":[{""screen_name"":""DeanFieldingF1"",""name"":""Dean Fielding"",""id"":797295219825897500,""id_str"":""797295219825897472"",""indices"":[0,15]}],""urls"":[],""media"":[{""id"":1028672868849090600,""id_str"":""1028672868849090560"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}}}]},""extended_entities"":{""media"":[{""id"":1028672868849090600,""id_str"":""1028672868849090560"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}},""ext_alt_text"":null},{""id"":1028672883986333700,""id_str"":""1028672883986333697"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUibAVAAEaQt0.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUibAVAAEaQt0.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}},""ext_alt_text"":null}]},""source"":""<a href=\""http://twitter.com\"" rel=\""nofollow\"">Twitter Web Client</a>"",""in_reply_to_status_id"":1028671170802081800,""in_reply_to_status_id_str"":""1028671170802081793"",""in_reply_to_user_id"":797295219825897500,""in_reply_to_user_id_str"":""797295219825897472"",""in_reply_to_screen_name"":""DeanFieldingF1"",""user"":{""id"":435704007,""id_str"":""435704007"",""name"":""Tony Heller"",""screen_name"":""Tony__Heller"",""location"":""Colorado"",""description"":""https://REPLACED/j5CaDNyIqE"",""url"":""https://REPLACED/Pyn117xXna"",""entities"":{""url"":{""urls"":[{""url"":""https://REPLACED/Pyn117xXna"",""expanded_url"":""http://realclimatescience.com"",""display_url"":""realclimatescience.com"",""indices"":[0,23]}]},""description"":{""urls"":[{""url"":""https://REPLACED/j5CaDNyIqE"",""expanded_url"":""https://realclimatescience.com/who-is-tony-heller/"",""display_url"":""realclimatescience.com/who-is-tony-he…"",""indices"":[0,23]}]}},""protected"":false,""followers_count"":44955,""friends_count"":374,""listed_count"":886,""created_at"":""Tue Dec 13 10:44:34 +0000 2011"",""favourites_count"":3740,""utc_offset"":null,""time_zone"":null,""geo_enabled"":true,""verified"":false,""statuses_count"":165165,""lang"":null,""contributors_enabled"":false,""is_translator"":false,""is_translation_enabled"":false,""profile_background_color"":""185370"",""profile_background_image_url"":""http://abs.twimg.com/images/themes/theme1/bg.png"",""profile_background_image_url_https"":""https://abs.twimg.com/images/themes/theme1/bg.png"",""profile_background_tile"":false,""profile_image_url"":""http://pbs.twimg.com/profile_images/1175541923508916225/0qEi4yIj_normal.jpg"",""profile_image_url_https"":""https://pbs.twimg.com/profile_images/1175541923508916225/0qEi4yIj_normal.jpg"",""profile_banner_url"":""https://pbs.twimg.com/profile_banners/435704007/1469798959"",""profile_image_extensions_alt_text"":null,""profile_banner_extensions_alt_text"":null,""profile_link_color"":""0084B4"",""profile_sidebar_border_color"":""FFFFFF"",""profile_sidebar_fill_color"":""DDEEF6"",""profile_text_color"":""333333"",""profile_use_background_image"":true,""has_extended_profile"":false,""default_profile"":false,""default_profile_image"":false,""can_media_tag"":false,""followed_by"":false,""following"":false,""follow_request_sent"":false,""notifications"":false,""translator_type"":""none""},""geo"":null,""coordinates"":null,""place"":null,""contributors"":null,""is_quote_status"":false,""retweet_count"":16,""favorite_count"":27,""favorited"":false,""retweeted"":false,""possibly_sensitive"":false,""lang"":""en""}",0,0,false,false,false,"en"

标签: jsoncsvnewlinejq

解决方案


starting from

{
  "created_at": "Mon Aug 13 10:40:34 +0000 2018",
  "id": 1028954459110555600,
  "id_str": "1028954459110555649",
  "full_text": "Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.\nhttps://REPLACED/DuBGmHCnG8\n#climatechange https://REPLACED/d5IBchM3Uk",
  "truncated": false,
  "display_text_range": [
    0,
    131
  ],
  "entities": {
    "hashtags": [
      {
        "text": "climatechange",
        "indices": [
          117,
          131
        ]
      }
    ],
    "symbols": [],
    "user_mentions": [],
    "urls": [
      {
        "url": "https://REPLACED/DuBGmHCnG8",
        "expanded_url": "https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/",
        "display_url": "tamino.wordpress.com/2018/08/08/usa…",
        "indices": [
          93,
          116
        ]
      },
      {
        "url": "https://REPLACED/d5IBchM3Uk",
        "expanded_url": "https://twitter.com/Tony__Heller/status/1028672939753758720",
        "display_url": "twitter.com/Tony__Heller/s…",
        "indices": [
          132,
          155
        ]
      }
    ]
  }
}

and running (it's https://github.com/johnkerl/miller)

mlr --j2c unsparsify input.json >input.csv

you have this kind of output https://gist.github.com/aborruso/6e0361923a3c45b9fe55ebf7590953de#file-output-csv

If you open it as raw you have the carriage return. And a spreasheet read it properly.

Then, using properly the import process you need to use, the \n is not a problem.

enter image description here


推荐阅读