Jiepai icon indicating copy to clipboard operation
Jiepai copied to clipboard

小白看书时候发现parameter里多了timestamp,代码小改动了下

Open liaxiang opened this issue 6 years ago • 1 comments

加了headers, 加了timestamp, 但是跑的时候偶尔出现OSError: [Errno 22] The filename, directory name, or volume label syntax is incorrect,我把这个名字直接自己创建却又是可以的,大神们帮忙看看

import requests import os from urllib.parse import urlencode from hashlib import md5 from multiprocessing.pool import Pool from datetime import datetime

def get_page(offset):
    """Fetch one page of Toutiao search results for the keyword '街拍'.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` when the status is not 200, the request fails, or the body
        is not valid JSON.
    """
    # Millisecond Unix timestamp. Computed arithmetically: slicing the string
    # form of the float gives a wrong value whenever the fractional part does
    # not print with exactly six digits (e.g. '1561363200.5').
    timestamp = str(int(datetime.now().timestamp() * 1000))
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/74.0.3729.169 Safari/537.36',
        'cookie': 'tt_webid=6705372327364445699; WEATHER_CITY=%E5%8C%97%E4%BA%AC; '
                  'UM_distinctid=16b7fbc4c5f2f3-055c8cad207e35-3e385b04-144000-16b7fbc4c601fb;'
                  ' tt_webid=6705372327364445699; csrftoken=565955f383dfff6e64e1fcaf538414be;'
                  ' CNZZDATA1259612802=429684378-1561215529-%7C1561296529; s_v_web_id=4b402c5aa53e24a17fca9d68bd6eb7ff',
        'x-requested-with': 'XMLHttpRequest'
    }
    # NOTE(review): the timestamp is sent under the key 'time'; confirm that
    # this is the parameter name the endpoint actually expects.
    params = {
        'aid': 24,
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'time': timestamp,
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        # A timeout keeps a stalled connection from blocking a pool worker
        # forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts; ValueError
        # covers a 200 response whose body is not JSON.
        return None
    return None

def get_images(json_data):
    """Yield one record per image found in a search-result page.

    Args:
        json_data: Decoded search response (a dict that may hold a ``'data'``
            list of items), or ``None`` when the page could not be fetched.

    Yields:
        ``{'image': <url>, 'title': <title>}`` for every entry in the
        ``'image_list'`` of every item whose ``'cell_type'`` is absent or
        ``None``. Nothing is yielded for a missing page, missing ``'data'``
        or an item without images.
    """
    if not json_data:
        return
    for item in json_data.get('data') or []:
        # Items carrying a cell_type are skipped.
        if item.get('cell_type') is not None:
            continue
        title = item.get('title')
        # 'image_list' can be missing; treat that as "no images".
        for image in item.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title
            }

# Characters that Windows rejects in file and directory names, plus the ASCII
# control characters; _safe_dirname maps each of them to '_'.
_UNSAFE_NAME_CHARS = str.maketrans(
    {ch: '_' for ch in '\\/:*?"<>|' + ''.join(map(chr, range(32)))}
)


def _safe_dirname(title):
    """Return *title* as a directory name that is also valid on Windows."""
    # Windows also rejects names ending in a space or a dot.
    cleaned = str(title or '').translate(_UNSAFE_NAME_CHARS).rstrip(' .')
    return cleaned or 'untitled'


def save_image(item):
    """Download one image into a directory named after its article title.

    The file is stored as ``<title>/<md5 of content>.jpg``, so the same image
    is never written twice.

    Args:
        item: Dict with the keys ``'title'`` (directory name) and ``'image'``
            (URL to download), as yielded by ``get_images``.
    """
    # Raw titles may contain characters such as '?', ':' or '|', which make
    # mkdir/open fail with "OSError: [Errno 22]" on Windows.
    directory = _safe_dirname(item.get('title'))
    # exist_ok avoids the check-then-create race between pool workers that
    # handle images of the same article.
    os.makedirs(directory, exist_ok=True)
    try:
        response = requests.get(item.get('image'), timeout=10)
        if response.status_code == 200:
            file_name = '{0}.{1}'.format(md5(response.content).hexdigest(), 'jpg')
            file_path = os.path.join(directory, file_name)
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.RequestException:
        # Covers connection errors, timeouts and malformed/missing URLs.
        print('Failed to save image')

def main(offset):
    """Crawl one result page: fetch it, then print and save every image.

    Args:
        offset: Search-result offset handed to ``get_page``.
    """
    page = get_page(offset)
    # get_page returns None on failure; there is nothing to download then.
    if not page:
        return
    for item in get_images(page):
        print(item)
        save_image(item)

# Inclusive range of page groups to crawl; group x is requested at offset
# x * 20.
GROUP_START = 1
GROUP_END = 20

# Entry-point guard. It must compare __name__ with '__main__': worker
# processes may re-import this module, and without a working guard each of
# them would try to start a pool of its own.
if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()

liaxiang avatar Jun 24 '19 07:06 liaxiang

timestamp是时间戳,请求时应该是自动加上去的

zizxzy avatar Aug 31 '19 01:08 zizxzy