python - Python - 保存网络抓取的文件 - 波兰语字符编码错误
问题描述
我创建了一个代码块,该代码块在网络上抓取波兰网站的房产列表信息。
import bs4
import csv
from urllib.request import urlopen as Open
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# Scrape apartment sale offers from otodom.pl and store them in a CSV file.
#
# The script walks the search-result pages, opens every non-promoted offer,
# extracts price, location and a fixed set of property features, and writes
# one CSV row per offer.

# Browser-like User-Agent header sent with every request.
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

# Search-results URL; the 1-based page number is appended to it.
RESULTS_URL = "https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page="

# Feature labels looked up on every offer page, in CSV column order.
# This single tuple drives both the header row and the data rows so the
# two can never drift apart.
FEATURE_NAMES = (
    "Forma własności",
    "Liczba pięter",
    "Liczba pokoi",
    "Materiał budynku",
    "Ogrzewanie",
    "Okna",
    "Piętro",
    "Powierzchnia",
    "Rodzaj zabudowy",
    "Rok budowy",
    "Rynek",
    "Stan wykończenia",
)


def build_header():
    """Return the CSV header row: price, location, every feature, link."""
    return ["Price", "Location", *FEATURE_NAMES, "Link"]


def extract_features(item_texts):
    """Return one value per entry of FEATURE_NAMES.

    item_texts is an iterable of "label: value" strings (the text of the
    overview list items). For each feature the first item containing the
    feature label is used; the value is the part between the first and the
    second colon with a trailing " m²" unit removed. Features that are not
    present yield "N/A".
    """
    item_texts = list(item_texts)
    values = []
    for feature in FEATURE_NAMES:
        for text in item_texts:
            if feature in text:
                values.append(text.split(":")[1].replace(" m²", ""))
                break
        else:  # no item mentioned this feature
            values.append("N/A")
    return values


def write_csv(rows, path):
    """Write rows to path as CSV, keeping non-ASCII characters intact.

    "utf-8-sig" prepends a byte-order mark; without it, spreadsheet
    applications commonly decode the file with a legacy code page and show
    Polish letters as mojibake. newline="" is what the csv module requires
    so that no extra blank lines appear on Windows.
    """
    with open(path, "w", encoding="utf-8-sig", newline="") as file:
        csv.writer(file).writerows(rows)


def fetch_soup(url):
    """Download url and return it parsed with BeautifulSoup's html.parser."""
    req = Request(url=url, headers=REQUEST_HEADERS)
    # Close the HTTP response as soon as the body has been read.
    with Open(req) as response:
        html = response.read()
    return soup(html, "html.parser")


def scrape_offer(listing):
    """Return the CSV row for the offer page at URL listing."""
    offer_soup = fetch_soup(listing)
    # get location
    location = offer_soup.find("a", {"href": "#map"}).text.split("}")[2]
    # get price
    price = offer_soup.find("div", {"class": "css-1vr19r7"}).text.replace(" ", "").replace("zł", "")
    # get property features
    container = offer_soup.find("section", {"class": "section-overview"}).findNext("div").ul.findAll("li")
    features = extract_features(item.text for item in container)
    return [price, location, *features, listing]


def main(max_pages=1, output_path='filename.csv'):
    """Scrape up to max_pages result pages and save the offers as CSV.

    max_pages defaults to 1, matching the single page the script has
    always fetched; it is capped by the page count reported by the site.
    """
    first_page = fetch_soup(RESULTS_URL + "1")
    total_pages = int(first_page.find("div", {"class": "after-offers clearfix"}).find("ul", {"class": "pager"}).findAll("li")[4].text)

    offer_list = [build_header()]
    for page in range(1, min(max_pages, total_pages) + 1):
        print(page)
        # Page 1 has already been downloaded to read the pager.
        page_soup = first_page if page == 1 else fetch_soup(RESULTS_URL + str(page))
        listings = page_soup.findAll("article", {"data-featured-name": "listing_no_promo"})
        for article in listings:
            offer_list.append(scrape_offer(article.a.get("href")))

    write_csv(offer_list, output_path)
    print("data saved")


if __name__ == "__main__":
    main()
我已经到了保存文件的阶段,但是波兰语字符被破坏了(出现乱码),例如 Åódź, łódzkie
有没有办法将波兰语字符转换为纯拉丁字母(例如把 ó 转换为 o),或者直接让它们保持原样不变?
解决方案
推荐阅读
- java - “无法解析构造函数标签(java.lang.String)”
- c# - Visual Studio 安装程序项目使用旧的安装程序
- javascript - Django Rest 框架:HTTP 401 未经授权的错误
- python - 将混合类型的 DataFrame 拆分为两列?
- facebook - 尝试使用 Buffer REST API 将多个图像上传到 Facebook 和 Twitter
- python-3.x - 检查一串单词是否是一个句子
- lua - 将坐标转换为不同的坐标系?
- typescript - 将 typescript 接口映射到相关接口
- scala - 无法初始化类 com.datastax.spark.connector.types.TypeConverter$
- java - Java 最佳实践 - 对象实例化与对 null 的对象引用