urllib2_usage.py
#!/usr/bin/python
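# Examples of fetching, measuring, and saving URLs with urllib2 (Python 2).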
import urllib2
import os
import re
import zlib
import gzip
from StringIO import StringIO

def get_url(url):
    # Fetch a URL and return its body, transparently inflating gzip/deflate
    # responses and decoding to unicode when the server advertises a charset.
    response = urllib2.urlopen(url)
    data = response.read()
    if response.info().get('Content-Encoding') == 'gzip':
        buff = StringIO(data)
        content = gzip.GzipFile(fileobj=buff).read()
    elif response.info().get('Content-Encoding') == 'deflate':
        # Raw deflate stream: negative wbits tells zlib to skip the header.
        content = zlib.decompress(data, -zlib.MAX_WBITS)
    else:
        content = data
    charset = re.search(r'charset=([\w-]+)', response.headers['content-type'])
    if charset:
        content = content.decode(charset.group(1))
    return content

def url_size(url):
    # Ask for the resource size with a HEAD request (no body is transferred).
    request = urllib2.Request(url)
    request.get_method = lambda: 'HEAD'
    response = urllib2.urlopen(request)
    return int(response.headers['content-length'])

def url_save(url, filepath, refer=None):
    # Download url to filepath in 256 KB chunks; skip the download when a
    # complete copy already exists on disk.
    headers = {}
    if refer:
        headers['Referer'] = refer
    request = urllib2.Request(url, headers=headers)
    # POST data could be attached the same way:
    # urllib2.Request(url, headers=headers,
    #                 data='person=jessinio&gender=male')
    response = urllib2.urlopen(request)
    file_size = int(response.headers['content-length'])
    assert file_size
    if os.path.exists(filepath):
        if file_size == os.path.getsize(filepath):
            print 'Skip %s: file already exists' % os.path.basename(filepath)
            return
        else:
            print 'Overwriting', os.path.basename(filepath), '...'
    with open(filepath, 'wb') as output:
        received = 0
        while True:
            buffer = response.read(1024 * 256)
            if not buffer:
                break
            received += len(buffer)
            output.write(buffer)
    assert received == file_size == os.path.getsize(filepath), \
        '%s == %s == %s' % (received, file_size, os.path.getsize(filepath))

def show_process(downloaded_blk_count, blk_size, total_size):
    # Progress hook for urllib.urlretrieve: rewrite a single line with the
    # percentage downloaded, finishing it with a newline at 100%.
    import sys
    per = 100.0 * downloaded_blk_count * blk_size / total_size
    if per > 100:
        per = 100
    sys.stdout.write('Downloading %.2f%%\r' % per)
    if per == 100:
        sys.stdout.write('\n')
    sys.stdout.flush()

def url_save2(url, local):
    # Use urllib.urlretrieve to download the file, reporting progress.
    import urllib
    urllib.urlretrieve(url, local, show_process)

if __name__ == '__main__':
    try:
        # print get_url('http://www.baidu.com')
        print url_size('http://www.baidu.com')
        # url_save2('http://www.baidu.com', 'baidu.html')
        url_save('http://www.baidu.com', 'baidu.html')
    except Exception:
        # import traceback
        # import sys
        # traceback.print_exc(file=sys.stdout)
        print 'Can\'t download file'