首页 > 技术文章 > 字符串操作、文件操作,英文词频统计预处理

trojans 2019-03-11 21:44 原文

一、字符串操作

  • 解析身份证号:生日、性别、出生地等。
 1 provinces = {'11': '北京市', '12': '天津市', '13': '河北省', '14': '山西省', '15': '内蒙古自治区', '21': '辽宁省',\
 2            '22': '吉林省', '23': '黑龙江省', '31': '上海市', '32': '江苏省', '33': '浙江省', '34': '安徽省', \
 3            '35': '福建省', '36': '江西省', '37': '山东省', '41': '河南省', '42': '湖北省', '43': '湖南省', \
 4            '44': '广东省', '45': '广西壮族自治区', '46': '海南省', '50': '重庆市', '51': '四川省', '52': '贵州省', \
 5            '53': '云南省', '54': '西藏自治区', '61': '陕西省', '62': '甘肃省', '63': '青海省', '64': '宁夏回族自治区', \
 6            '65': '新疆维吾尔自治区', '71': '台湾省','81': '香港特别行政区', '82': '澳门特别行政区'}
 7 city = {"01": "广州市", "02": "韶关市", "03": "深圳市", "04": "珠海市", "05": "汕头市", "06": "佛山市", "07": "江门市", \
 8         "08": "湛江市", "09": "茂名市", "12": "肇庆市", "13": "惠州市", "14": "梅州市", "15": "汕尾市", "16": "河源市", \
 9         "17": "阳江市", "18": "清远市", "19": "东莞市", "20": "中山市", "51": "潮州市", "52": "揭阳市", "53": "云浮市"}
10 id = input('请输入十八位身份证号码: ')
11 id_check = id[17]
12 id_add = id[0:6]
13 id_birth = id[6:14]
14 id_sex = id[14:17]
15 birth_year = id_birth[0:4]
16 birth_moon = id_birth[4:6]
17 birth_day = id_birth[6:8]
18 W = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
19 ID_num = [18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2]
20 id_check_num = ['1', '0', 'X', '9', '8', '7', '6', '5', '4', '3', '2']
21 ID_aXw = 0
22 for i in range(len(W)):
23     ID_aXw = ID_aXw + int(id[i]) * W[i]
24 
25 id_check_index = ID_aXw % 11
26 if id_check == id_check_num[id_check_index]:
27     if len(id) == 18:
28         if int(id_sex) % 2 == 0:
29             print('性别:女')
30         else:
31             print('性别:男')
32         print("出生地为:" + provinces.get(id[0:2]) + city.get(
33             id[2:4]) + "\n" + "出生日期为: " + birth_year + '' + birth_moon + '' + birth_day + '')
34         print("你的身份证号码是:" + id)
35         print("\n身份证号检验:这是一个正确的身份证号码")
36     else:
37         print("\n身份证号检验:" + id + "是一个错误的身份证号码")
38 else:
39     print("\n身份证号检验:" + id + "是一个错误的身份证号码")
解析身份证号

  执行效果如下:

  • 凯撒密码编码与解码
# Shift each character three code points forward to encode, then three
# code points back to decode; output is written without a trailing newline.
code = input("请输入要编码的信息:")
print("编码后为:")
for code_point in map(ord, code):
    print(chr(code_point + 3), end="")
decode = input("\n请输入要解码的信息:")
print("解码后为:")
for code_point in map(ord, decode):
    print(chr(code_point - 3), end="")

  执行效果如下:

 

  • 网址观察与批量生成
import webbrowser as web  # aliased as web

# The listing pages share one URL pattern that differs only in the page
# number; print the URLs for pages 2 through 21.
page_template = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
for URL in map(page_template.format, range(2, 22)):
    print(URL)
# Hand page 2 to the browser.
URL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/2.html'
web.open(URL)

   执行效果如下:

 

二、英文词频统计预处理

  • 下载一首英文的歌词或文章或小说。
  • 将所有大写转换为小写
  • 将所有其他用作分隔符的标点符号(,.?!)替换为空格
  • 分隔出一个一个的单词
  • 并统计单词出现的次数。

 

 1 def read_text():
 2     fo=open("C:The_Spectre.txt", "r", encoding="UTF-8-sig")
 3     line = fo.read()
 4     print(line)
 5     print("\n")
 6     fo.close()
 7     return line
 8 
 9 
def modify_text(text=None):
    """Expand two contractions and turn separators into spaces.

    Args:
        text: Text to clean. When omitted, the text is loaded with
            read_text().

    Returns:
        The text with "don't" and "I've" expanded, and every ',', '.',
        '?', '!' and newline replaced by a single space. Letter case is
        left untouched.
    """
    if text is None:
        text = read_text()
    # A tuple keeps the replacement order fixed; the original iterated a set.
    for short, full in (("don't", "do not"), ("I've", "I have")):
        text = text.replace(short, full)
    # One pass over the text for all single-character separators.
    separators = str.maketrans(dict.fromkeys(",.?!\n", " "))
    return text.translate(separators)
21 
22 
def lowercase_and_count(text=None):
    """Lower-case the text, split it on spaces, and print each word's count.

    Counts whole words. The previous substring count reported, for example,
    every "a" inside "cat" as an occurrence of the word "a".

    Args:
        text: Already-cleaned text to analyse. When omitted, the text is
            obtained from modify_text().

    Returns:
        None. The word list and one line per distinct word are printed,
        in order of first appearance.
    """
    from collections import Counter

    string = (modify_text() if text is None else text).lower()
    words = string.split(' ')
    print(words)
    print("\n")
    # Consecutive spaces produce empty strings when splitting; skip them.
    counts = Counter(word for word in words if word != "")
    for word, times in counts.items():
        print(word + " 这个词在歌曲中出现了" + str(times))
    return


# Run the report only when executed as a script, not on import.
if __name__ == '__main__':
    lowercase_and_count()
词频统计

  执行效果如下:

 

三、文件操作

  • 同一目录、绝对路径、相对路径
  • 凯撒密码:从文件读入密函,进行加密或解密,保存到文件。
  • 词频统计:下载一首英文的歌词或文章或小说,保存为utf8文件。从文件读入文本进行处理。

 

 1 #同一目录
 2 fo=open('text.txt','r',encoding='UTF-8-sig')
 3 content=fo.read()
 4 fo.close()
 5 print(content,end='')
 6 #绝对路径
 7 fo=open(r'C:\Users\Administrator\PycharmProjects\MadLibs\text.txt','r',encoding="UTF-8-sig")
 8 content=fo.read()
 9 fo.close()
10 print(content,end='')
11 #相对路径
12 fo=open(r'./text.txt','r',encoding="UTF-8-sig")
13 content=fo.read()
14 fo.close()
15 print(content,end='')

 

 1 def encode():
 2     fo = open(r'C:\Users\Administrator\PycharmProjects\MadLibs\The_Spectre.txt', 'r', encoding="UTF-8-sig")
 3     s = fo.read()
 4     str = ''
 5     for i in s:
 6         str = str +chr(ord(i)+3)
 7         print(chr(ord(i) + 3), end='')
 8     fo.close()
 9     fo = open(r'C:\Users\Administrator\PycharmProjects\MadLibs\The_Spectre.txt', 'w', encoding="UTF-8-sig")
10     fo.write(str)
11     fo.close()
12     return
13 
14 
15 def decode():
16     fo = open(r'C:\Users\Administrator\PycharmProjects\MadLibs\The_Spectre.txt', 'r', encoding="UTF-8-sig")
17     s = fo.read()
18     str = ''
19     for i in s:
20         str = str + chr(ord(i) - 3)
21         print(chr(ord(i) - 3), end='')
22     fo.close()
23     fo = open(r'C:\Users\Administrator\PycharmProjects\MadLibs\The_Spectre.txt', 'w', encoding="UTF-8-sig")
24     fo.write(str)
25     fo.close()
26     return
27 
28 
def main():
    """Run the encrypt/decrypt menu until the user chooses to quit.

    Choice "1" calls encode(), "2" calls decode(), and "0" or end of input
    leaves the loop. Any other input prints an error and shows the menu
    again. The original loop had no exit, so its trailing return was
    unreachable.
    """
    while True:
        print("1. 加密,2. 解密,0. 退出")
        try:
            choice = input("请选择:").strip()
        except EOFError:
            # Closed or exhausted stdin: stop instead of crashing.
            break
        if choice == "1":
            encode()
            print("")
        elif choice == "2":
            decode()
            # decode() prints without a trailing newline, like encode().
            print("")
        elif choice == "0":
            break
        else:
            print("您的输入有误!")
    return


if __name__ == '__main__':
    main()

  执行效果如下:

 

四、函数定义

  • 加密函数
def bian_ma():
    """Prompt for a message and print it shifted three code points forward.

    The shifted characters are written without a trailing newline.
    Returns None.
    """
    plain = input("请输入要编码的信息:")
    print("编码后为:")
    for code_point in map(ord, plain):
        print(chr(code_point + 3), end="")

 

  • 解密函数
def jie_ma():
    """Prompt for a message and print it shifted three code points back.

    The shifted characters are written without a trailing newline.
    Returns None.
    """
    cipher = input("\n请输入要解码的信息:")
    print("解码后为:")
    for code_point in map(ord, cipher):
        print(chr(code_point - 3), end="")

 

  • 读文本函数
def read_text(path="C:The_Spectre.txt"):
    """Read a text file, echo it to stdout, and return its contents.

    Args:
        path: File to read. Defaults to the path the original script used.

    Returns:
        The whole file decoded as text.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "UTF-8-sig" drops a leading byte-order mark if the file has one.
    # The with-block closes the file even when read() raises.
    with open(path, "r", encoding="UTF-8-sig") as fo:
        line = fo.read()
    print(line)
    print("\n")
    return line

 

 

推荐阅读