首页 > 解决方案 > 将json转换为excel时在python中出现类型错误

问题描述

我正在尝试将 json 转换为 excel 文件,但我遇到类型错误并且我无法解决该错误。该代码首先将 json 数据转换为字典(我也需要用于剪辑事实)而不是将其转换为 excel。任何帮助,将不胜感激。先感谢您。以下是我的代码:

    from io import BytesIO
    import sys
    sys.path.append('/usr/lib/python3/dist-packages')
    import json
    import openpyxl
    from openpyxl import load_workbook
            
    import logging
    logger = logging.getLogger('ExtractJSON')
    
    
    class ExtractJSON(object):
    
    def __init__(self):
        """ Initialisierung """
        pass

    def convert(self, data):
        """ PDF Extract API """
        #jsondata = self.json2excel(data)
        jsondata = self.json2csv(data)
        return jsondata
        
    #=======================================================================
    #   JSON to Excel
    #=======================================================================
        
    def json2excel(self, data):
        """ JSON to Excel """
        header = ["Seite","ID","Text","Pfad","Fett","Kursiv","Unterstrichen","Versal","PT","Alignment","BlockAlign","InlineAlign","Col","Row","Width",\
                  "Height","LineHeight","SpaceAfter","Placement","Bemerkung","Bounds","ClipBounds","BBox","Font","HasClip","Lang"]
        jsondata = json.loads(data)
        elements = jsondata['elements']
        excelrow = 1
        sid = 0
        hd = 1
        wb = openpyxl.Workbook()
        sheet = wb.active
        
        for head in header:
            sheet.cell(excelrow, hd).value = head
            hd += 1

        for elm in elements:
            sid += 1
            excelrow += 1
            cl = 0
            row = self.createRow(elm, sid)
            if row is None:
                sheet.cell(excelrow, 1).value = "....."
                continue
            for column in header:  # TODO: enumerate!
                cl += 1
                sheet.cell(excelrow, cl).value = row.get(column, '?????')
            kids = elm.get('Kids', [])
            kid = 0
            for kidelm in kids:
                kid += 1
                excelrow += 1
                cl = 0
                sidstr = f"{sid}.{kid}"
                row = self.createRow(kidelm, sidstr)
                for column in header:  # TODO: enumerate! # TODO: refactor
                    cl += 1
                    sheet.cell(excelrow, cl).value = row.get(column, '?????')
        # FIXME: Check: Do we have recursive 'Kids' in the JSON file?
        wbresult = BytesIO()
        wb.save(wbresult)
        return wbresult.getvalue()

    def createRow(self, elm, sid):
        """ For Excel """
        try:
            page = elm.get('Page', '')
            if page or page == 0: 
                page += 1
            xpath = elm['Path']
            text = elm.get('Text', elm.get('text', ''))
            font = elm.get('Font', {})
            if not text and not font and xpath.find("Figure") > -1:  # TODO temporary
                logger.info("Figure found, no font and no text, continue..")
                return ""
            bold = "x" if font.get('weight', '') == 700 else ''
            italic = "x" if font.get('italic', '') else ''
            underline = font.get('underline', '')  # TODO No example yet
            versal = font.get('capital', '')  # TODO No example yet
            fontsize = elm.get('TextSize', '')
            if fontsize and (abs(round(fontsize) - fontsize) < 0.10):
                fontsize = round(fontsize)
            attributes = elm.get('attributes', {})
            alignment = attributes.get('TextAlign', '')
            lineHeight = attributes.get('LineHeight', '')
            spaceAfter = attributes.get('SpaceAfter', '')
            placement = attributes.get('Placement', '')
            note = ""
            bounds = elm.get('Bounds', '')
            clipBounds = elm.get('ClipBounds', '')
            if bounds:
                bounds = ", ".join([str(round(bd)) for bd in bounds])
            if clipBounds:
                clipBounds = ", ".join([str(round(bd)) for bd in clipBounds])
            font = str(font)
            hasClip = elm.get('HasClip', '')
            lang = elm.get('Lang', '')
            # Tabelle
            blockAlign = attributes.get('BlockAlign', '')
            inlineAlign = attributes.get('InlineAlign', '')
            col = attributes.get('ColIndex', '')
            row = attributes.get('RowIndex', '')
            if col or col == 0: 
                col += 1
            if row or row == 0:
                row += 1
            width = attributes.get('width', '')
            height = attributes.get('height', '')
            bBox = attributes.get('BBox', '')
            if bBox:
                bBox = ", ".join([str(round(bd)) for bd in bBox])
            #
            result = {'Seite':page, 'ID':sid, 'Text':text, 'Pfad':xpath, 'Fett':bold, 'Kursiv':italic, 'Unterstrichen':underline,\
                     'Versal':versal, 'PT':fontsize, 'Alignment':alignment, 'BlockAlign':blockAlign, 'InlineAlign':inlineAlign,\
                     'Col':col, 'Row':row, 'Width':width, 'Height':height, 'LineHeight':lineHeight, 'SpaceAfter':spaceAfter,\
                     'Placement':placement, 'Bemerkung':note, 'Bounds':bounds, 'ClipBounds':clipBounds, 'BBox':bBox,\
                     'Font':font, 'HasClip':hasClip, 'Lang':lang}
        except Exception as e:
            logger.exception(e)
            logger.info(elm)
            return ""
        return result
        
    #=======================================================================
    #   JSON to CSV  (Alternative for JSON to Excel)
    #=======================================================================
        
    def json2csv(self, data):
        """ JSON to CSV """
        result = "Seite;ID;Text;Pfad;Fett;Kursiv;Unterstrichen;Versal;PT;Alignment;BlockAlign;InlineAlign;Col;Row;Width;Height;LineHeight;SpaceAfter;Placement;"\
                 "Bemerkung;Bounds;ClipBounds;BBox;Font;HasClip;Lang\n"
        jsondata = json.loads(data)
        elements = jsondata['elements']
        sid = 0
        for elm in elements:
            sid += 1
            result += self.createRow(elm, sid)
            kids = elm.get('Kids', [])
            kid = 0
            for kidelm in kids:
                kid += 1
                sidstr = f"{sid}.{kid}"
                result += self.createRowForCSV(kidelm, sidstr)
        filename=json2excel.csv")
        return result
        
    def createRowForCSV(self, elm, sid):
        """ """
        try:
            page = elm.get('Page', '')
            if page or page == 0: 
                page += 1
            xpath = elm['Path']
            text = elm.get('Text', elm.get('text', ''))
            font = elm.get('Font', {})
            if not text and not font and xpath.find("Figure") > -1:  # TODO temporary
                logger.info("Figure found, no font and no text, continue..")
                return ""
            bold = "x" if font.get('weight', '') == 700 else ''
            italic = "x" if font.get('italic', '') else ''
            underline = font.get('underline', '')  # TODO No example yet
            versal = font.get('capital', '')  # TODO No example yet
            fontsize = elm.get('TextSize', '')
            if fontsize and (abs(round(fontsize) - fontsize) < 0.10):
                fontsize = round(fontsize)
            attributes = elm.get('attributes', {})
            alignment = attributes.get('TextAlign', '')
            lineHeight = attributes.get('LineHeight', '')
            spaceAfter = attributes.get('SpaceAfter', '')
            placement = attributes.get('Placement', '')
            note = ""
            bounds = elm.get('Bounds', '')
            clipBounds = elm.get('ClipBounds', '')
            if bounds:
                bounds = ", ".join([str(round(bd)) for bd in bounds])
            if clipBounds:
                clipBounds = ", ".join([str(round(bd)) for bd in clipBounds])
            font = str(font)
            hasClip = elm.get('HasClip', '')
            lang = elm.get('Lang', '')
            # Tabelle
            blockAlign = attributes.get('BlockAlign', '')
            inlineAlign = attributes.get('InlineAlign', '')
            col = attributes.get('ColIndex', '')
            row = attributes.get('RowIndex', '')
            if col or col == 0: 
                col += 1
            if row or row == 0:
                row += 1
            width = attributes.get('width', '')
            height = attributes.get('height', '')
            bBox = attributes.get('BBox', '')
            if bBox:
                bBox = ", ".join([str(round(bd)) for bd in bBox])
            #
            result = f"{page};'{sid}';'{text}';{xpath};{bold};{italic};{underline};{versal};{fontsize};{alignment};{blockAlign};{inlineAlign};{col};{row};\
                        {width};{height};'{lineHeight}';'{spaceAfter}';{placement};{note};{bounds};{clipBounds};{bBox};{font};{hasClip};{lang}\n"
        except Exception as e:
            logger.exception(e)
            logger.info(elm)
            return ""
        return result

INPUTPATH = "/home/abc/Clips/JSON/structuredData.json"
OUTPUTPATH = "/home/abc/Clips/JSON/result.xlsx"
        
if __name__ == '__main__':
    converter = ExtractJSON()
    f = open(INPUTPATH, "r"); data = f.read(); f.close()
    print ("JSON converting..")
    result = converter.convert(data)
    g = open(OUTPUTPATH, "wb"); g.write(result); g.close()
    print (f"Excel stored on {OUTPUTPATH}")
    sys.exit(0)

我得到的错误是:

JSON converting..
Traceback (most recent call last):
  File "/home/abc/Clips/JSON/json2excel.py", line 222, in <module>
    result = converter.convert(data)
  File "/home/abc/Clips/JSON/json2excel.py", line 23, in convert
    jsondata = self.json2csv(data)
  File "/home/abc/Clips/JSON/json2excel.py", line 146, in json2csv
    result += self.createRow(elm, sid)
TypeError: can only concatenate str (not "dict") to str

标签: pythonjson

解决方案


长话短说,这是您的代码的最小可重现示例

class ExtractJSON:
    def createRowForCSV(self):
        return {
            'Seite':1, 'ID':2, 'Text':3, 'Pfad':4, 'Fett':5, 'Kursiv':6, 'Unterstrichen':7,
            'Versal':9, 'PT':0, 'Alignment':10, 'BlockAlign':11, 'InlineAlign':12,
            'Col':13, 'Row':14, 'Width':15, 'Height':16, 'LineHeight':17, 'SpaceAfter':18,
            'Placement':19, 'Bemerkung':20, 'Bounds':21, 'ClipBounds':22, 'BBox':23,
            'Font':2.4, 'HasClip':"twenty-five", 'Lang':"twenty-6"
        }

    def json2csv(self):
        result = "Seite;ID;Text;Pfad;Fett;Kursiv;Unterstrichen;Versal;PT;Alignment;BlockAlign;InlineAlign;Col;Row;Width;Height;LineHeight;SpaceAfter;Placement;Bemerkung;Bounds;ClipBounds;BBox;Font;HasClip;Lang\n"
        result += self.createRowForCSV()
        return result

输出

Traceback (most recent call last):
  File "Main.py", line 28, in <module>
    print(ExtractJSON().json2csv())
  File "Main.py", line 17, in json2csv
    result += self.createRowForCSV()
TypeError: can only concatenate str (not "dict") to str

现在,我们可以清楚地看到错误的原因是它result是一个str但你正在附加到它的输出createRowForCSV是一个字典。为了进一步可视化这一点,您就像这样做:

"Some text" + {"another": "one"}

这显然会失败。

解决方案1:

手动迭代dict并按顺序获取值。

class ExtractJSON:
    ...
    def json2csv(self):
        result = "Seite;ID;Text;Pfad;Fett;Kursiv;Unterstrichen;Versal;PT;Alignment;BlockAlign;InlineAlign;Col;Row;Width;Height;LineHeight;SpaceAfter;Placement;Bemerkung;Bounds;ClipBounds;BBox;Font;HasClip;Lang\n"
        result_field_names = result.strip().split(';')  # Get the order of the keys

        result_dict = self.createRowForCSV()  # Get the row in dict format
        result_string = ";".join(map(str, [result_dict[key] for key in result_field_names]))  # Get the values in order and combine them into a string with ';' as the separator
        result += result_string + "\n"  # Append to the result

        return result
...

解决方案2:

使用csv.DictWriter将响应字典转换createRowForCSV为 csv 字符串。

import csv
from io import StringIO

class ExtractJSON:
    ...
    def json2csv(self):
        result = "Seite;ID;Text;Pfad;Fett;Kursiv;Unterstrichen;Versal;PT;Alignment;BlockAlign;InlineAlign;Col;Row;Width;Height;LineHeight;SpaceAfter;Placement;Bemerkung;Bounds;ClipBounds;BBox;Font;HasClip;Lang\n"
        result_field_names = result.strip().split(';')  # Get the order of the keys

        result_dict = self.createRowForCSV()  # Get the row in dict format
        result_string = StringIO()  # Create the string that will hold the result
        csv.DictWriter(result_string, result_field_names, delimiter=';').writerow(result_dict)  # Convert the dict to str
        result_string.seek(0)  # Since result_string is a file-like object, go back to the beginning
        result += result_string.read()  # Read the result string from the beginning to end and append to the result

        return result
...

输出

>>> print(ExtractJSON().json2csv())
Seite;ID;Text;Pfad;Fett;Kursiv;Unterstrichen;Versal;PT;Alignment;BlockAlign;InlineAlign;Col;Row;Width;Height;LineHeight;SpaceAfter;Placement;Bemerkung;Bounds;ClipBounds;BBox;Font;HasClip;Lang
1;2;3;4;5;6;7;9;0;10;11;12;13;14;15;16;17;18;19;20;21;22;23;2.4;twenty-five;twenty-6

推荐阅读