python - 从python中的迭代和混合行中提取字符串
问题描述
我有一个如下数据集;
"birth_date_1:25 birth_date_2:august birth_date_3:1945 birth_place_1:france death_date:<none> "
"birth_date_1:14 birth_date_2:june birth_date_3:1995 birth_place_1:dvůr birth_place_2:králové birth_place_3:nad birth_place_4:labem birth_place_5:, birth_place_6:czech birth_place_7:republic "
"birth_date_1:21 birth_date_2:february birth_date_3:1869 birth_place_1:blackburn birth_place_2:, birth_place_3:england death_date_1:12 death_date_2:march death_date_3:1917 "
"birth_date_1:07 birth_date_2:july birth_date_3:1979 birth_place_1:ghana birth_place_2:, birth_place_3:accra "
"birth_date_1:27 birth_date_2:february birth_date_3:1979 birth_place_1:durban birth_place_2:, birth_place_3:south birth_place_4:africa "
"birth_date_1:1989 birth_place_1:lima birth_place_2:, birth_place_3:peru "
"birth_date_1:5 birth_date_2:september birth_date_3:1980 birth_place_1:angola death_date:<none> "
"birth_date_1:1 birth_date_2:february birth_date_3:1856 birth_place_1:hampstead birth_place_2:, birth_place_3:london death_date_1:14 death_date_2:august death_date_3:1905 "
"birth_date_1:28 birth_date_2:december birth_date_3:1954 birth_place_1:hickory birth_place_2:, birth_place_3:north birth_place_4:carolina death_date:<none> "
"birth_date:<none> "
"birth_date:<none> birth_place:<none> death_date:<none> "
"birth_date:<none> birth_place_1:belfast birth_place_2:, birth_place_3:northern birth_place_4:ireland "
"birth_date:<none> birth_place:<none> death_date:<none> "
"birth_date_1:28 birth_date_2:february birth_date_3:1891 birth_place_1:carberry birth_place_2:, birth_place_3:manitoba death_date_1:20 death_date_2:september death_date_3:1968 "
"birth_date_1:4 birth_date_2:november birth_date_3:1993 birth_place_1:portim√£o birth_place_2:, birth_place_3:portugal "
在这些数据集中,我试图提取如下信息;
25.08.1945 \t France \t NA
14.06.1995 \t Dvůr Králové nad Labem,Czech Republic \t
21.02.1869 \t Blackburn,England \t 12.03.1917
.
.
.
1989 \t Lima,Peru \t NA
.
.
.
NA \t NA \t NA
NA \t NA \t NA
NA \t Belfast, Northern Ireland \t NA
.
.
04.11.1993 \t Portimeo,Portugal \t NA
我编写了下面的代码来实现这一点,但是我的数据集中会出现多种情况(例如 birth_date_1 的值可能为空,也可能是月份名称或年份),所以我觉得下面这个循环迟早会在某个地方出错,并不可行。
# Asker's attempt, reproduced as posted. It does not run: the loop/branch
# indentation is missing in this copy and several statements raise (see the
# NOTE(review) remarks below).
# Open the output file for writing and the input file for reading, both UTF-8.
outputfile = open('ornek_box_seperated_update.csv','w',encoding="utf-8")
inputfile = open('ornek_box_seperated.csv','r',encoding="utf-8")
import numpy as np
# 20000 rows x 9 columns nested list pre-filled with NaN.
birthDatePlace = [[ np.nan for i in range(9) ] for j in range(20000)]
for line in inputfile:
d = line.split(":")
print(d)
# NOTE(review): d is a list at this point; str.split() only accepts a str (or
# None) separator, so this call raises TypeError.
d = line.split(d)
d = "\t".join(d)
print(d)
# NOTE(review): after the join d is a str, so d[1] is a single character;
# ordering comparisons between str and int raise TypeError on Python 3.
if(d[1]<40 and d[1]>0):
# NOTE(review): birthDatePlace is a list of lists and line is a str; a list
# cannot be indexed with the tuple (line, 1).
birthDatePlace[line,1] = d[1]
elif(d[1]<2020):
birthDatePlace[line,3] = d[1]
# NOTE(review): isinstance() requires two arguments (object, type); the
# one-argument calls below raise TypeError.
if(d[1]<40 and d[1]>0 and isinstance(d[3])==str):
birthDatePlace[line,2] = d[3]
elif(d[1]<2020 and isinstance(d[3])==int):
birthDatePlace[line,4] = d[3]
# this code planned to continue from here until cover the all birth place and death date information in required format
# Write the tab-joined text followed by a newline, then close the output file.
# NOTE(review): inputfile is never closed.
outputfile.write(d)
outputfile.write('\n')
outputfile.close()
感谢您提供的任何帮助。我是 python 的新手,尤其是正则表达式或字符串提取方法。
预先感谢您的支持。
解决方案
如果您想避免代码出错,最好进行显式的检查。请看下面的代码。我已经解析了这些信息并将其存储在一个类对象中。该类提供了一些辅助函数来修改解析后的数据。
# -*- coding: utf-8 -*-
# Class for storing parsed information
class Info(object):
    """Hold one parsed record as three printable columns.

    The constructor receives each date as three string parts and the birth
    place as a single string, and stores ``birth_date``, ``birth_place`` and
    ``death_date`` as strings. A column with no usable data is stored as
    ``"NA"``.
    """

    # Month number keyed by the first three lowercase letters of its name.
    _MONTHS = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12,
    }

    def __init__(self, birth_date_1, birth_date_2, birth_date_3, birth_place,
                 death_date_1, death_date_2, death_date_3):
        """Build the three output columns from the raw string parts.

        Args:
            birth_date_1, birth_date_2, birth_date_3: Parts of the birth
                date; empty strings for parts that are absent.
            birth_place: Birth place text; empty or comma-only text is
                stored as ``"NA"``.
            death_date_1, death_date_2, death_date_3: Parts of the death
                date; empty strings for parts that are absent.
        """
        self.birth_date = self._format_date(
            birth_date_1, birth_date_2, birth_date_3)
        self.birth_place = birth_place if birth_place.strip(",") else "NA"
        self.death_date = self._format_date(
            death_date_1, death_date_2, death_date_3)
        self.sanitize()

    def _format_date(self, part_1, part_2, part_3):
        """Join three date parts with dots, or return "NA" if nothing is left.

        An alphabetic second part is treated as a month name and replaced by
        its number (or by an empty string when it is not recognised).
        """
        if not (part_1 or part_2 or part_3):
            return "NA"
        if part_2 and part_2.isalpha():
            part_2 = self.month_string_to_number(part_2)
        # strip(".") drops the dots left behind by empty leading or trailing
        # parts, e.g. "1989.." -> "1989".
        joined = '.'.join([part_1, part_2, part_3]).strip(".")
        # If the only part was an unrecognised month word, nothing remains;
        # report the column as missing rather than as an empty string.
        return joined or "NA"

    def print_req_format(self):
        """Print birth date, birth place and death date separated by tabs."""
        # The parenthesised call form works on both Python 2 and Python 3;
        # the original print statement is a SyntaxError on Python 3.
        print('\t'.join([self.birth_date, self.birth_place, self.death_date]))

    def sanitize(self):
        """Replace any column containing the "<none>" marker with "NA"."""
        if "<none>" in self.birth_date:
            self.birth_date = "NA"
        if "<none>" in self.birth_place:
            self.birth_place = "NA"
        if "<none>" in self.death_date:
            self.death_date = "NA"

    def month_string_to_number(self, month):
        """Return the month number as a string, or "" if unrecognised.

        Only the first three letters of the stripped, lower-cased input are
        used for the lookup, so "aug", "August" and " AUGUST " all give "8".
        """
        key = month.strip()[:3].lower()
        number = self._MONTHS.get(key)
        if number is None:
            return ""
        return str(number)
# Sample records. Each string is a space-separated list of key:value tokens.
# Multi-word fields are spread over numbered keys (birth_place_1,
# birth_place_2, ...), and a field with no data appears as key:<none>.
dataset = [
"birth_date_1:25 birth_date_2:august birth_date_3:1945 birth_place_1:france death_date:<none>",
"birth_date_1:14 birth_date_2:june birth_date_3:1995 birth_place_1:dvůr birth_place_2:králové birth_place_3:nad birth_place_4:labem birth_place_5:, birth_place_6:czech birth_place_7:republic",
"birth_date_1:21 birth_date_2:february birth_date_3:1869 birth_place_1:blackburn birth_place_2:, birth_place_3:england death_date_1:12 death_date_2:march death_date_3:1917",
"birth_date_1:07 birth_date_2:july birth_date_3:1979 birth_place_1:ghana birth_place_2:, birth_place_3:accra",
"birth_date_1:27 birth_date_2:february birth_date_3:1979 birth_place_1:durban birth_place_2:, birth_place_3:south birth_place_4:africa",
"birth_date_1:1989 birth_place_1:lima birth_place_2:, birth_place_3:peru",
"birth_date_1:5 birth_date_2:september birth_date_3:1980 birth_place_1:angola death_date:<none>",
"birth_date_1:1 birth_date_2:february birth_date_3:1856 birth_place_1:hampstead birth_place_2:, birth_place_3:london death_date_1:14 death_date_2:august death_date_3:1905",
"birth_date_1:28 birth_date_2:december birth_date_3:1954 birth_place_1:hickory birth_place_2:, birth_place_3:north birth_place_4:carolina death_date:<none>",
"birth_date:<none>",
"birth_date:<none> birth_place:<none> death_date:<none>",
"birth_date:<none> birth_place_1:belfast birth_place_2:, birth_place_3:northern birth_place_4:ireland",
"birth_date:<none> birth_place:<none> death_date:<none>",
"birth_date_1:28 birth_date_2:february birth_date_3:1891 birth_place_1:carberry birth_place_2:, birth_place_3:manitoba death_date_1:20 death_date_2:september death_date_3:1968",
"birth_date_1:4 birth_date_2:november birth_date_3:1993 birth_place_1:portim√£o birth_place_2:, birth_place_3:portugal",
]
# Keys whose value is stored as-is; birth_place tokens are accumulated instead.
date_keys = (
    "birth_date_1", "birth_date_2", "birth_date_3",
    "death_date_1", "death_date_2", "death_date_3",
)

for record in dataset:
    # Every field starts out empty for each record.
    dates = dict.fromkeys(date_keys, "")
    place = ""

    for token in record.split():
        pieces = token.split(":")
        if len(pieces) < 2:
            # Not a key:value token; nothing to extract.
            continue
        value = pieces[1]

        if token.startswith("birth_place"):
            # Words are joined with a single space, except that the first
            # word and a bare comma are appended without a leading space.
            glue = "" if (not place or value == ",") else " "
            place += glue + value
            continue

        # Un-numbered keys such as "birth_date" match none of the prefixes
        # and are therefore ignored.
        for key in date_keys:
            if token.startswith(key):
                dates[key] = value
                break

    info = Info(
        dates["birth_date_1"], dates["birth_date_2"], dates["birth_date_3"],
        place,
        dates["death_date_1"], dates["death_date_2"], dates["death_date_3"],
    )
    info.print_req_format()
根据您提供的数据,此代码的输出是:
25.8.1945 france NA
14.6.1995 dvůr králové nad labem, czech republic NA
21.2.1869 blackburn, england 12.3.1917
07.7.1979 ghana, accra NA
27.2.1979 durban, south africa NA
1989 lima, peru NA
5.9.1980 angola NA
1.2.1856 hampstead, london 14.8.1905
28.12.1954 hickory, north carolina NA
NA NA NA
NA NA NA
NA belfast, northern ireland NA
NA NA NA
28.2.1891 carberry, manitoba 20.9.1968
4.11.1993 portim√£o, portugal NA
代码简单易懂。希望这对你有用。干杯。
推荐阅读
- php - 为什么不这样创建对象?
- c# - 如何区分写入异常和阅读异常?
- java - 交换单个字母单词的大小写
- javascript - 刷新 Angular 中的特定组件
- python - 使用不同数量的占位符格式化字符串 python
- javascript - 如何修复功能已在 Jasmine 中发现错误
- jenkins - 詹金斯致命:org.jenkinsci.plugins.tokenmacro.MacroEvaluationException:无法识别的宏
- sql-server - 使用 case 语句修复除以零错误
- xml - 从 XML 合并特定节点并在 XSLT 中添加属性
- php - 如何从 URI 路径参数中获取关联数组?