首页 > 技术文章 > pandas(二):在pandas中搜索包含关键词的行

zhangxianrong 2021-05-14 18:43 原文

一、代码

# -*- coding: UTF-8 -*-
import json
import pandas as pd

"""获得所有的文本"""
def get_all_text(file_path: str = "../datas/format/primary.json",
                 file_out: str = "../datas/format/all_text.csv") -> None:
    """Flatten a JSON-lines file into a tab-separated table of texts.

    Every non-blank line of ``file_path`` is parsed as a JSON object that
    holds a ``"file_name"`` entry and a ``"datas"`` mapping.  Each key/value
    pair of ``"datas"`` becomes one output row with the columns ``names``
    (the file name), ``roles`` (the key) and ``texts`` (the value).

    Args:
        file_path: JSON-lines input file, read as UTF-8.
        file_out: Destination file, written tab-separated without the index.

    Raises:
        KeyError: If a record lacks ``"file_name"`` or ``"datas"``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    names = []
    roles = []
    texts = []
    with open(file_path, "r", encoding="utf8") as f:
        # Iterate the file lazily; readlines() would hold every line in memory.
        for data_line in f:
            # A blank (often trailing) line is not a record and would make
            # json.loads raise, so skip it.
            if not data_line.strip():
                continue
            json_data = json.loads(data_line)
            file_name = json_data["file_name"]
            for role, text in json_data["datas"].items():
                names.append(file_name)
                roles.append(role)
                texts.append(text)
    dataframe = pd.DataFrame({'names': names, 'roles': roles, "texts": texts})
    dataframe.to_csv(file_out, index=False, sep='\t')

"""从csv搜索数据"""
def search_text(key: str,
                file_path: str = "../datas/format/all_text.csv",
                file_out=None,
                regex: bool = False) -> None:
    """Write the rows whose ``texts`` column contains ``key`` to a new file.

    The input is read as a tab-separated table; the matching rows are
    written tab-separated, without the index, to ``file_out``.

    Args:
        key: Keyword to look for inside the ``texts`` column.
        file_path: Tab-separated input file that has a ``texts`` column.
        file_out: Destination path.  When ``None`` it defaults to
            ``"../datas/classes/" + key + ".csv"``.
        regex: When ``False`` (default) ``key`` is matched literally, so
            keywords containing characters such as ``(`` or ``.`` are safe.
            Pass ``True`` to have ``key`` interpreted as a regular expression.

    Raises:
        KeyError: If the input has no ``texts`` column.
    """
    if file_out is None:
        file_out = "../datas/classes/" + key + ".csv"
    data = pd.read_csv(file_path, sep="\t")
    # na=False: empty cells are read as NaN; without it the mask would contain
    # NaN and boolean indexing would raise instead of dropping those rows.
    mask = data["texts"].str.contains(key, regex=regex, na=False)
    data[mask].to_csv(file_out, index=False, sep='\t')

"""提取带有婚字的数据"""
def data_annotate(keyword: str = "婚",
                  file_in: str = "../datas/format/primary.json",
                  file_out: str = "../datas/annotate/label.json") -> str:
    """Copy the records that mention ``keyword`` into a file awaiting labels.

    Every non-blank line of ``file_in`` is parsed as a JSON object with a
    ``"file_name"`` entry and a ``"datas"`` mapping.  A record is kept when at
    least one value of ``"datas"`` contains ``keyword``; it is then written to
    ``file_out`` as one JSON line of the form
    ``{"name": <file_name>, "label": "", "datas": <datas>}``.

    Args:
        keyword: Text that must occur in at least one ``"datas"`` value.  The
            previous code tested for the empty string, which is contained in
            every string and therefore filtered nothing; passing ``""``
            reproduces that match-everything behavior.
        file_in: JSON-lines input file, read as UTF-8.
        file_out: JSON-lines output file, written as UTF-8.

    Returns:
        The string ``"success"``.

    Raises:
        KeyError: If a record lacks ``"file_name"`` or ``"datas"``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Open the input first so that a missing input file does not leave a
    # freshly truncated output file behind.
    with open(file_in, "r", encoding="utf8") as f, \
            open(file_out, "w", encoding="utf8") as fo:
        for line in f:
            if not line.strip():
                continue  # blank lines are not records
            json_data = json.loads(line)
            datas = json_data["datas"]
            if not any(keyword in value for value in datas.values()):
                continue
            item = {
                "name": json_data["file_name"],
                "label": "",  # left empty; filled in later by the annotator
                "datas": datas,
            }
            fo.write(json.dumps(item, ensure_ascii=False) + "\n")
    return "success"

"""提取标注过的数据"""
def annotate(file_in: str = "../datas/annotate/label.json",
             file_labeled: str = "../datas/annotate/labeled.json",
             file_unlabeled: str = "../datas/annotate/unlabel.json") -> str:
    """Split a JSON-lines file into labeled and unlabeled records.

    Every non-blank line of ``file_in`` is parsed as a JSON object.  Records
    whose ``"label"`` value is truthy are re-serialized into ``file_labeled``;
    all others (e.g. an empty-string label) go to ``file_unlabeled``.

    Args:
        file_in: JSON-lines input file, read as UTF-8.
        file_labeled: Output file for records with a truthy ``"label"``.
        file_unlabeled: Output file for the remaining records.

    Returns:
        The string ``"success"``.

    Raises:
        KeyError: If a record has no ``"label"`` entry.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_in, "r", encoding="utf8") as f_in, \
            open(file_labeled, "w", encoding="utf8") as f_labeled, \
            open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
        for line in f_in:
            if not line.strip():
                continue  # blank lines are not records
            json_data = json.loads(line)
            target = f_labeled if json_data["label"] else f_unlabeled
            target.write(json.dumps(json_data, ensure_ascii=False) + "\n")
    return "success"

def label_to_csv():
    file_path = "../datas/annotate/labeled.json"
    labels = []
    datas = []
    data_dict = []
    with open(file_path, "r", encoding="utf8") as f:
        for data_line in f.readlines():
            json_data = json.loads(data_line)
            _label = json_data["label"]
            _data = "|".join(json_data["datas"].values())
            labels.append(_label)
            datas.append(_data)
            data_dict.append(data_line.replace("\n", ""))
    file_out = "../datas/annotate/labeled.csv"
    dataframe = pd.DataFrame({'labels': labels, 'datas': datas, "data_dict": data_dict})
    dataframe.to_csv(file_out, index=False, sep='\t')

"""提取带工作的数据"""
def get_work():
    """Run the keyword search for work-related rows via ``search_text``."""
    keyword = "工作"
    search_text(keyword)

# Script entry point: when the file is executed directly, run label_to_csv().
if __name__ == "__main__":
    label_to_csv()

 

推荐阅读