ddxiami

 找回密码
 立即注册
搜索
热搜: 活动 交友 discuz
查看: 2577|回复: 0

应用场景05-搜索引擎判断收录

[复制链接]
发表于 2021-5-12 08:51:10 | 显示全部楼层 |阅读模式
#注意蜘蛛的封杀 (note: the search engines may ban your spider)
#返回403则表示被封杀 (a return value of 403 means the spider was blocked)
#提前安装组件 (install the dependency beforehand)
pip3 install web.py

================================
import web
import urllib
import requests
import time
import sys
import random

# Python 2 compatibility shim: force the process-wide default encoding to
# UTF-8. On Python 3, getdefaultencoding() is already 'utf-8', so this branch
# never executes (reload() and sys.setdefaultencoding() do not exist there).
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # Py2-only trick: reload(sys) restores setdefaultencoding, which the
    # interpreter startup normally removes from the sys module.
    reload(sys)
    sys.setdefaultencoding(default_encoding)


class UnicodeStreamFilter:
    """Stream wrapper that transcodes writes to the target stream's encoding.

    Characters the target encoding cannot represent are substituted
    (errors='replace') instead of raising UnicodeEncodeError — used below to
    keep output readable on GBK (cp936) Windows consoles.
    """

    def __init__(self, target):
        self.target = target            # underlying stream (e.g. sys.stdout)
        self.encoding = 'utf-8'         # encoding we report to callers
        self.errors = 'replace'         # never raise on unencodable chars
        self.encode_to = self.target.encoding  # target console encoding

    def write(self, s):
        # BUGFIX: the original (Python 2 style) tested `type(s) == str` and
        # then called s.decode("utf-8"); on Python 3 `str` has no .decode,
        # so every write raised AttributeError. Only bytes need decoding.
        if isinstance(s, bytes):
            s = s.decode("utf-8")
        # Round-trip through the target encoding so unrepresentable
        # characters become replacement chars instead of raising.
        s = s.encode(self.encode_to, self.errors).decode(self.encode_to)
        self.target.write(s)


# Windows consoles using code page 936 (GBK) cannot render many UTF-8
# characters; route stdout through the replacing transcoder in that case.
console_encoding = sys.stdout.encoding
if console_encoding == 'cp936':
    sys.stdout = UnicodeStreamFilter(sys.stdout)

# Randomized User-Agent to reduce the chance of the spider being banned.
def get_rand_ua():
    """Return a curl-style User-Agent with a random numeric suffix (0-99999999)."""
    suffix = random.randint(0, 99999999)
    return "curl/7.29.%d" % suffix

# Baidu indexing check, step 1: fetch the JSON search results.
def baidu_html(baiduURL, max_retries=4):
    """Fetch *baiduURL* and return the parsed JSON response.

    Retries up to *max_retries* times (default 4, same as the original
    `while x < 5` loop). On total failure, returns a sentinel payload whose
    'entry' list has exactly one item — get_baidu_html() maps that shape
    to 403.

    BUGFIX: the original used a bare `except:`, which also swallowed
    KeyboardInterrupt/SystemExit; only network and JSON-decoding errors
    are retryable, so catch exactly those.
    """
    for _ in range(max_retries):
        try:
            headers = {"User-Agent": get_rand_ua()}
            resp = requests.get(baiduURL, headers=headers, timeout=30)
            # .json() raises ValueError (JSONDecodeError) on a non-JSON body.
            return resp.json()
        except (requests.RequestException, ValueError):
            continue
    # Timeout/failure sentinel — single-entry feed, kept byte-identical
    # to the original so downstream behavior is unchanged.
    return {"feed": {"all": "0", "entry": [{"title": "开水网络", "url": "超时,请重查"}]}}

# Baidu indexing check, step 2: interpret the JSON payload.
def get_baidu_html(r, url):
    """Compare the first Baidu result URL against *url*.

    Returns 403 when the feed holds exactly one entry (the blocked /
    timeout-sentinel shape), 1 when the first result matches *url*
    after scheme stripping, 0 otherwise.
    """
    entries = r.get('feed').get('entry')
    # A single entry means the spider was blocked (or the fetch timed out).
    if len(entries) == 1:
        return 403

    def _bare(link):
        # Same scheme stripping as the original: str.replace removes the
        # substring wherever it occurs, not just as a prefix.
        return link.replace("http://", "").replace("https://", "")

    return 1 if _bare(entries[0].get('url')) == _bare(url) else 0

# 360 (so.com) indexing check.
def get_so(url):
    """Query so.com for *url* and classify the result page.

    Returns 0 (not indexed), 1 (indexed), or 403 (blocked / unrecognized
    page layout).

    FIX: added timeout=30 for consistency with baidu_html(); without a
    timeout a stalled connection would hang this worker forever.
    """
    myurl = "http://www.so.com/s?q=%s" % url
    headers = {"User-Agent": get_rand_ua()}
    r = requests.get(myurl, headers=headers, timeout=30)
    ret = r.text
    if "找不到该URL" in ret:   # "URL not found" marker → not indexed
        return 0
    if "找到相关结果约" in ret:  # "found about N results" marker → indexed
        return 1
    return 403

# Sogou indexing check.
def get_sogou(url):
    """Query sogou.com for *url* and classify the result page.

    Returns 0 (not indexed), 1 (indexed), or 403 (captcha/verify page or
    unrecognized layout).

    FIXES: added timeout=30 for consistency with baidu_html() (otherwise a
    stalled connection hangs the worker); removed the redundant
    `elif "verify_page"` branch — it returned 403, exactly like the
    fall-through, so behavior is unchanged.
    """
    myurl = "http://www.sogou.com/web?query=%s" % url
    headers = {"User-Agent": get_rand_ua()}
    r = requests.get(myurl, headers=headers, timeout=30)
    ret = r.text
    if "您是不是想直接访问" in ret:  # "did you mean to visit" → not indexed
        return 0
    if "找到约" in ret:            # "found about" → indexed
        return 1
    # Covers the "verify_page" captcha case and any unknown layout.
    return 403

# Define the web.py URL routing table and application.
# Each pair maps a regex path to a handler class name; the captured group
# (the URL being checked) is passed to the handler's GET method.
urls = (
    '/baidu/(.*)', 'baidu',
    '/so/(.*)', 'so',
    '/sougou/(.*)', 'sougou',  # NOTE: "sougou" spelling is the public route — keep as-is
)
app = web.application(urls, globals())

class baidu:
    """Handler for GET /baidu/<url>: 1 indexed, 0 not, 403 blocked/timeout."""

    def GET(self, url):
        # Build the JSON-mode Baidu search URL for the (stripped) query.
        query_url = 'http://www.baidu.com/s?wd=%s&tn=json' % url.strip()
        payload = baidu_html(query_url)
        # Note: the raw (unstripped) url is compared, as in the original.
        return get_baidu_html(payload, url)

class so:
    """Handler for GET /so/<url>: delegates to the 360-search checker."""

    def GET(self, url):
        return get_so(url.strip())


class sougou:
    """Handler for GET /sougou/<url>: delegates to the Sogou checker."""

    def GET(self, url):
        return get_sogou(url.strip())


if __name__ == "__main__":
    # Start the web.py built-in server when run as a script.
    app.run()


================================



回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|技术文档库 ( 闽ICP备15017263号-2 )|网站地图

GMT+8, 2025-5-18 19:58 , Processed in 0.034353 second(s), 16 queries .

Powered by Discuz! X3.4

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表