#注意蜘蛛的封杀
#返回403则表示被封杀
#提前安装组件
#pip3 install web.py
#================================
import web
import urllib
import requests
import time
import sys
import random
# Python 2 only: make UTF-8 the interpreter-wide default codec.
# ``reload`` (as a builtin) and ``sys.setdefaultencoding`` do not exist on
# Python 3, whose default encoding is already UTF-8, so the explicit version
# check keeps this branch from ever being entered there.
default_encoding = 'utf-8'
if sys.version_info[0] == 2 and sys.getdefaultencoding() != default_encoding:
    # site.py deletes sys.setdefaultencoding at startup; reloading restores it.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
class UnicodeStreamFilter:
    """Wrap a text stream so that un-encodable characters are replaced.

    Every chunk written is round-tripped through the target stream's own
    encoding with the ``'replace'`` error handler, so characters the target
    cannot represent are substituted instead of raising
    ``UnicodeEncodeError``.
    """

    def __init__(self, target):
        """Store *target*, a file-like object providing ``write``."""
        self.target = target
        self.encoding = 'utf-8'   # codec assumed for incoming byte strings
        self.errors = 'replace'   # error handler used when re-encoding
        # Fall back to UTF-8 when the target reports no encoding
        # (``io.StringIO().encoding`` is ``None``, for example).
        self.encode_to = getattr(self.target, 'encoding', None) or self.encoding

    def write(self, s):
        """Write *s* to the target, substituting characters it cannot encode."""
        # Only byte strings need decoding (``str`` on Python 2, ``bytes`` on
        # Python 3); Python 3 text strings have no ``decode`` method.
        if isinstance(s, bytes):
            s = s.decode(self.encoding)
        s = s.encode(self.encode_to, self.errors).decode(self.encode_to)
        self.target.write(s)

    def flush(self):
        """Flush the wrapped stream if it supports flushing."""
        flush = getattr(self.target, 'flush', None)
        if flush is not None:
            flush()
# When the console reports code page 936, route stdout through
# UnicodeStreamFilter, which re-encodes output with the 'replace' handler.
if sys.stdout.encoding == 'cp936':
    sys.stdout = UnicodeStreamFilter(sys.stdout)
# Randomised User-Agent, to reduce the chance of being blocked.
def get_rand_ua():
    """Return a curl-style User-Agent with a random final version component.

    The result is ``"curl/7.29."`` followed by a random integer drawn from
    the inclusive range 0..99999999.
    """
    return "curl/7.29.%d" % random.randint(0, 99999999)
# Baidu index check, step 1: fetch the JSON result feed.
def baidu_html(baiduURL):
    """Fetch *baiduURL* and return its decoded JSON body.

    The request is attempted up to four times, each with a fresh random
    User-Agent and a 30-second timeout.  If every attempt fails (network
    error, timeout, or a body that is not valid JSON) a placeholder feed
    holding a single entry is returned; ``get_baidu_html`` reports a
    single-entry feed as 403.
    """
    for _attempt in range(4):
        try:
            headers = {"User-Agent": get_rand_ua()}
            response = requests.get(baiduURL, headers=headers, timeout=30)
            return response.json()
        except Exception:
            # Best-effort retry on any ordinary failure.  Unlike a bare
            # ``except:``, KeyboardInterrupt and SystemExit still propagate.
            continue
    return {"feed": {"all": "0", "entry": [{"title": "开水网络", "url": "超时,请重查"}]}}
# Baidu index check, step 2: interpret the JSON result feed.
def _strip_scheme(address):
    """Return *address* with every ``http://`` and ``https://`` removed."""
    return address.replace("http://", "").replace("https://", "")


def get_baidu_html(r, url):
    """Decide from the Baidu JSON feed *r* whether *url* is indexed.

    Args:
        r: decoded JSON, expected to look like ``{"feed": {"entry": [...]}}``
            where each entry is a dict carrying a ``"url"`` key.
        url: the address being checked, with or without a scheme.

    Returns:
        1 if the first result's URL equals *url* (schemes ignored);
        0 if it does not, or if the feed contains no results;
        403 if the feed has exactly one entry (the case the original code
        treats as "blocked") or if the feed is missing or malformed.
    """
    feed = r.get('feed') if isinstance(r, dict) else None
    entries = feed.get('entry') if isinstance(feed, dict) else None
    if not isinstance(entries, list):
        # No usable feed at all: report it like the blocked case.
        return 403
    # Blocked / placeholder case.
    if len(entries) == 1:
        return 403
    # An empty result list cannot contain the URL.
    if not entries:
        return 0
    head = entries[0]
    first = (head.get('url') if isinstance(head, dict) else None) or ''
    # Compare with the URL scheme removed from both sides.
    return 1 if _strip_scheme(first) == _strip_scheme(url) else 0
# 360 Search (so.com) index check.
def get_so(url):
    """Query so.com for *url* and classify the result page.

    Returns:
        0 if the page contains the "URL not found" marker,
        1 if it contains the "about N results found" marker,
        403 if neither marker is present or the request itself fails.
    """
    headers = {"User-Agent": get_rand_ua()}
    try:
        # ``params`` makes requests percent-encode the query value, so
        # characters such as ``&`` or ``#`` in *url* cannot break the query.
        # The timeout matches the Baidu lookup; without one a stalled
        # connection would block the handler forever.
        response = requests.get(
            "http://www.so.com/s",
            params={"q": url},
            headers=headers,
            timeout=30,
        )
    except requests.RequestException:
        # Same outcome as a failed Baidu lookup: no usable answer.
        return 403
    page = response.text
    if "找不到该URL" in page:
        return 0
    if "找到相关结果约" in page:
        return 1
    return 403
# Sogou index check.
def get_sogou(url):
    """Query sogou.com for *url* and classify the result page.

    Returns:
        0 if the page contains the "did you mean to visit directly" marker,
        1 if it contains the "about N found" marker,
        403 otherwise (including pages containing ``verify_page``) or when
        the request itself fails.
    """
    headers = {"User-Agent": get_rand_ua()}
    try:
        # ``params`` makes requests percent-encode the query value; the
        # timeout keeps a stalled connection from blocking the handler.
        response = requests.get(
            "http://www.sogou.com/web",
            params={"query": url},
            headers=headers,
            timeout=30,
        )
    except requests.RequestException:
        return 403
    page = response.text
    if "您是不是想直接访问" in page:
        return 0
    if "找到约" in page:
        return 1
    # A "verify_page" response and any unrecognised page both end up here;
    # the original tested for "verify_page" separately but returned the same
    # value either way.
    return 403
# Web service definition: URL routing table.
# Entries alternate between a path pattern and the name of the handler class
# serving it; each handler's GET takes the text captured by ``(.*)``.
urls = (
    '/baidu/(.*)', 'baidu',
    '/so/(.*)', 'so',
    '/sougou/(.*)', 'sougou',
)
# The handler names above are looked up in this module's globals.
app = web.application(urls, globals())
class baidu:
    """Handler for ``/baidu/<url>``: Baidu index lookup."""

    def GET(self, url):
        """Return the ``get_baidu_html`` verdict (1, 0 or 403) for *url*."""
        # Imported here because the module only does ``import urllib``,
        # which does not guarantee ``urllib.parse`` is loaded on Python 3.
        from urllib.parse import quote

        # Strip once and use the same value for both the query and the
        # comparison; previously only the query used the stripped form.
        target = url.strip()
        # Percent-encode the value so ``&``, ``#`` or ``%`` inside the
        # address cannot corrupt the query string.
        baiduURL = 'http://www.baidu.com/s?wd=%s&tn=json' % quote(target, safe='')
        feed = baidu_html(baiduURL)
        return get_baidu_html(feed, target)
class so:
    """Handler for ``/so/<url>``: 360 Search index lookup."""

    def GET(self, url):
        """Return the ``get_so`` result for *url* with outer whitespace removed."""
        target = url.strip()
        return get_so(target)
class sougou:
    """Handler for ``/sougou/<url>``: Sogou index lookup."""

    def GET(self, url):
        """Return the ``get_sogou`` result for *url* with outer whitespace removed."""
        target = url.strip()
        return get_sogou(target)
# Run the web.py application only when this file is executed as a script.
if __name__ == "__main__":
    app.run()
#================================
#|