首页 > 解决方案 > 无法解析 Bing 搜索结果

问题描述

我在 Python 中有一个问题。无法解析 Bing 搜索结果。我不知道代码有什么问题。什么都没有出来。该程序只是崩溃了(我也尝试使用其他库 - 但也没有使用它们。

对于解决此问题的任何帮助,我将不胜感激!!!

这是界面本身发生的事情: imgur

代码:

import re, sys
import urllib.request
from urllib import request
from urllib.parse import quote
import html2text
from poisk import *
from PyQt5 import QtCore, QtGui, QtWidgets

class MyWin(QtWidgets.QMainWindow):
    def __init__(self, parent=None):
        QtWidgets.QWidget.__init__(self, parent)
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)

        self.ui.pushButton.clicked.connect(self.mySearch)
        self.ui.pushButton_2.clicked.connect(self.nextSearch)
        self.ui.pushButton_3.clicked.connect(self.previosSearch)

    def previosSearch(self):
        if((self.flagok-1) >= 0):
            self.flagok=self.flagok - 1
            self.ui.textEdit.setText(self.texts[self.flagok])

    def nextSearch(self):
        if((self.flagok+1) < self.maxflagok):
            self.flagok=self.flagok + 1
            self.ui.textEdit.setText(self.texts[self.flagok])

    def mySearch(self):
        self.ui.textEdit.setText("")
        z = self.ui.lineEdit.text()
        s = 'https://www.bing.com/search?q='+quote(z)
        doc = urllib.request.urlopen(s).read().decode('cp1251',errors='ignore')

        o=re.compile('"url":"(.*?)"')
        l=o.findall(doc)
        sp=[]
        for x in l:
            if((x.rfind('youtube')==-1) and(x.rfind('yandex')==-1) and(x.rfind('mail.ru')==-1) and(x.rfind('.jpg')==-1) and(x.rfind('.png')==-1) and(x.rfind('.gif')==-1)):
                sp.append(x)

        sp = dict(zip(sp, sp)).values()
        sp1=[]
        for s in sp:
            sp1.append(s)
        kolotvetov=int(self.ui.lineEdit_2.text())
        if(kolotvetov<2):
            kolotvetov=2
        sp1=sp1[:kolotvetov]

        sp=sp1
        self.progresscount=int(100/len(sp))
        self.progresscount2=int(100/len(sp))
        self.texts=[]

        self.ui.progressBar.setValue(0)

        for s in sp:
            try:
                self.ui.progressBar.setValue(self.progresscount2)
                self.progresscount2 = self.progresscount2 + self.progresscount
                # Now we will take turns receiving the texts of each page from the search results in the doc variable
                doc = urllib.request.urlopen(s).read().decode('utf-8',errors='ignore')
                h = html2text.HTML2Text()
                h.ignore_links = True
                h.body_width = False
                h.ignore_images = True
                doc = h.handle(doc)
                summa=""
                # Divide the text of the page into paragraphs
                ss=doc.split("\n")
                for xx in ss:
                    xx=xx.strip()
                    # We filter paragraphs so that they do not start with incorrect characters and end with the correct ones - a point or !?;
                    if((len(xx)>50) and (xx.startswith('&')==False) and (xx.startswith('>')==False) and (xx.startswith('*')==False) and (xx.startswith('\\')==False) and (xx.startswith('<')==False) and (xx.startswith('(')==False) and (xx.startswith('#')==False) and (xx.endswith('.') or xx.endswith('?') or xx.endswith('!') or xx.endswith(';'))):
                        summa = summa + xx + "\n \n"
                if(len(summa)>500):
                    self.texts.append(summa)
            except Exception:
                print('Something went wrong')
        self.flagok=0
        self.maxflagok=len(self.texts)
        self.ui.textEdit.setText(self.texts[0])
        self.ui.progressBar.setValue(100)


if __name__=="__main__":
    app = QtWidgets.QApplication(sys.argv)
    myapp = MyWin()
    myapp.show()
    sys.exit(app.exec_())

没有 Qt:

import re, sys
import urllib.request
from urllib import request
from urllib.parse import quote
import html2text

# We will enter a search query and get a page with search results from Bing in the doc variable

print("\n---------\n")
z=input("Enter your question: ")
print("\n---------\n")
s = 'https://www.bing.com/search?q='+quote(z)
doc = urllib.request.urlopen(s).read().decode('cp1251',errors='ignore')

# The sp list will receive all links to search results from this page.

o=re.compile('"url":"(.*?)"')
l=o.findall(doc)
sp=[]
for x in l:
    if((x.rfind('youtube')==-1) and(x.rfind('yandex')==-1) and(x.rfind('mail.ru')==-1) and(x.rfind('.jpg')==-1) and(x.rfind('.png')==-1) and(x.rfind('.gif')==-1)):
        sp.append(x)

sp = dict(zip(sp, sp)).values()

for s in sp:
    try:
        # Now we will take turns receiving the texts of each page from the search results in the doc variable
        doc = urllib.request.urlopen(s).read().decode('utf-8',errors='ignore')
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.body_width = False
        h.ignore_images = True
        doc = h.handle(doc)
        summa=""
        # Divide the text of the page into paragraphs
        ss=doc.split("\n")
        for xx in ss:
            xx=xx.strip()
            # We filter paragraphs so that they do not start with incorrect characters and end with the correct ones - a point or !?;
            if((len(xx)>50) and (xx.startswith('&')==False) and (xx.startswith('>')==False) and (xx.startswith('*')==False) and (xx.startswith('\\')==False) and (xx.startswith('<')==False) and (xx.startswith('(')==False) and (xx.startswith('#')==False) and (xx.endswith('.') or xx.endswith('?') or xx.endswith('!') or xx.endswith(';'))):
                summa = summa + xx + "\n \n"
        if(len(summa)&rt > 500):
            print(summa+"\n----------------------------------------\n")
    except Exception:
        print('Something went wrong')

标签: pythonparsingurllib

解决方案


推荐阅读