import urllib.robotparser
import urllib.parse
import requests
import time

'''
Using robots.txt
'''

urls = ['http://www.baidu.com', 'http://www.jingdong.com']

# RobotFileParser() makes it easy to check which pages may be crawled
# and which may not.
rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')
rp.read()
print(rp.can_fetch('Googlebot', 'https://www.baidu.com/baidu'))
print(rp.can_fetch('Baiduspider', 'https://www.baidu.com/cpro'))


def get_data(url, num_retries=3):
    try:
        data = requests.get(url, timeout=5)
        print(data.status_code)
    except requests.exceptions.ConnectionError as e:
        print('Request failed, url:', url)
        print('Error details:', e)
        data = None
    except Exception as e:
        print('Unexpected error! url:', url)
        print('Error details:', e)
        data = None
    # Retry on 5xx server errors, up to num_retries times;
    # return the retried result instead of discarding it.
    if (data is not None) and (500 <= data.status_code <= 599):
        if num_retries > 0:
            print('Server error, retrying...')
            time.sleep(1)
            return get_data(url, num_retries - 1)
    return data


def robots_check(robots_txt_url, headers, url):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_txt_url)
    rp.read()
    return rp.can_fetch(headers['User-Agent'], url)


headers = {'User-Agent': 'Mozilla/5.0'}
for url in urls:
    # robots.txt rules only apply to the host that serves them, so build
    # the robots.txt URL from each target URL rather than hard-coding one.
    parts = urllib.parse.urlsplit(url)
    robots_txt_url = '{}://{}/robots.txt'.format(parts.scheme, parts.netloc)
    if robots_check(robots_txt_url, headers, url):
        data = get_data(url)
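

# A minimal sketch of also honoring the crawl delay a site declares in its
# robots.txt. crawl_delay() and request_rate() are part of
# urllib.robotparser (Python 3.6+); both return None when robots.txt does
# not declare a value, so a fallback delay is assumed here. polite_delay()
# is a hypothetical helper name, not part of the original script.
def polite_delay(robots_txt_url, user_agent, default_delay=1.0):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_txt_url)
    rp.read()
    delay = rp.crawl_delay(user_agent)
    if delay is not None:
        return float(delay)
    # request_rate() returns a named tuple, e.g. RequestRate(requests=3, seconds=20).
    rate = rp.request_rate(user_agent)
    if rate is not None:
        return rate.seconds / rate.requests
    return default_delay

# Example usage: sleep between requests according to the site's declared policy.
# time.sleep(polite_delay('https://www.baidu.com/robots.txt', 'Mozilla/5.0'))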