#!/usr/bin/env python
# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Retrieve web resources over http."""
import copy
import datetime
import httplib
import logging
import random
import ssl
import StringIO

import httparchive
import platformsettings
import script_injector

# PIL isn't always available, but we still want to be able to run without
# the image scrambling functionality in this case.
try:
  import Image
except ImportError:
  Image = None

TIMER = platformsettings.timer

class HttpClientException(Exception):
  """Base class for all exceptions in httpclient."""
  pass

def _InjectScripts(response, injector):
  """Injects script generated by |injector| immediately after <head> or <html>.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
    injector: function which generates a JavaScript string based on the
        recording time (e.g. "Math.random = function(){...}")
  Returns:
    an ArchivedHttpResponse
  """
  if type(response) == tuple:
    logging.warn('tuple response: %s', response)
  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('text/html'):
    text_chunks = response.get_data_as_chunks()
    text_chunks, just_injected = script_injector.InjectScript(
        text_chunks, 'text/html', injector(response.request_time))
    if just_injected:
      response = copy.deepcopy(response)
      response.set_data_from_chunks(text_chunks)
  return response
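
# Illustrative sketch (not from the source): for an injector that returns
# "Math.random = function(){return 0.5;}", script_injector.InjectScript is
# expected to rewrite an HTML body such as
#   <html><head><title>t</title>...
# into
#   <html><head><script>Math.random = function(){return 0.5;}</script>
#   <title>t</title>...
# and to leave non-HTML responses untouched.
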
def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """
  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      # str.decode()/encode() return new strings (str is immutable), so the
      # results must be assigned.
      image_data = response.response_data[0]
      image_data = image_data.decode(encoding='base64')
      im = Image.open(StringIO.StringIO(image_data))

      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue()
      output_image_data = output_image_data.encode(encoding='base64')

      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # If anything goes wrong (e.g. undecodable or truncated image data),
      # fall through and return the response unmodified.
      pass

  return response

class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses
      The delay for the first body item should be recorded by the caller.
    """
    chunks = []
    delays = []
    if not self.chunked:
      chunks.append(self.read())
      delays.append(0)
    else:
      start = TIMER()
      try:
        while True:
          line = self.fp.readline()
          chunk_size = self._read_chunk_size(line)
          if chunk_size is None:
            raise httplib.IncompleteRead(''.join(chunks))
          if chunk_size == 0:
            break
          delays.append(TIMER() - start)
          chunks.append(self._safe_read(chunk_size))
          self._safe_read(2)  # skip the CRLF at the end of the chunk
          start = TIMER()

        # Ignore any trailers.
        while True:
          line = self.fp.readline()
          if not line or line == '\r\n':
            break
      finally:
        self.close()
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)
    except ValueError:
      return None
    return chunk_size
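
  # Illustrative examples (not from the source): the chunk-size line is a
  # hexadecimal count, optionally followed by chunk-extensions after ';':
  #   _read_chunk_size('1a2b\r\n')       => 6699
  #   _read_chunk_size('4;ext=val\r\n')  => 4
  #   _read_chunk_size('bogus\r\n')      => None
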
class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  response_class = DetailedHTTPResponse


class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
  pass


class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  response_class = DetailedHTTPSResponse

  def __init__(self, host, port):
    # https://www.python.org/dev/peps/pep-0476/#opting-out
    if hasattr(ssl, '_create_unverified_context'):
      httplib.HTTPSConnection.__init__(
          self, host=host, port=port, context=ssl._create_unverified_context())
    else:
      httplib.HTTPSConnection.__init__(self, host=host, port=port)

class RealHttpFetch(object):

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP. If this is
          not None, RealHttpFetch resolves the host name to an IP before
          making the request.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success or None if the header
      is not in the expected format. header_name is in lowercase.
    """
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None
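
  # Illustrative examples (not from the source):
  #   _GetHeaderNameValue('Content-Length: 314')  => ('content-length', '314')
  #   _GetHeaderNameValue('malformed header')     => None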

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and converts it
    to a list of (header_name, header_value) tuples.

    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header:
      1. a normal header consists of two parts separated by a colon:
         "header_name:header_value..."
      2. a continuation line is a string starting with whitespace:
         "[whitespace]continued_header_value..."
    If a header is malformed, or an unexpected continuation line is seen,
    it is ignored.

    Avoid using response.getheaders() directly because it cannot properly
    handle multiple headers with the same name. Instead, parse
    response.msg.headers with this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples that looks like:
        [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value  # pylint: disable=unpacking-non-sequence
      all_headers.append((name, value))
    return all_headers
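
  # Illustrative example (not from the source): a continuation line is folded
  # into the header that precedes it, so
  #   ['Content-Type: text/html', 'X-Custom: part1', '\tpart2']
  # becomes
  #   [('content-type', 'text/html'), ('x-custom', 'part1\n part2')]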

  @staticmethod
  def _get_request_host_port(request):
    host_parts = request.host.split(':')
    host = host_parts[0]
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host, port
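
  # Illustrative examples (not from the source):
  #   request.host == 'www.example.com:8080'  => ('www.example.com', 8080)
  #   request.host == 'www.example.com'       => ('www.example.com', None)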

  @staticmethod
  def _get_system_proxy(is_ssl):
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for the host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if an HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    if self._real_dns_lookup:
      connection_ip = self._real_dns_lookup(connection_host)
      if not connection_ip:
        logging.critical(
            'Unable to find IP for host name: %s', connection_host)
        return None
      connection_host = connection_ip

    if is_ssl:
      connection = DetailedHTTPSConnection(connection_host, connection_port)
      if system_proxy:
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_host, connection_port)
    return connection
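
  # Note (descriptive, not from the source): when a system proxy is in use,
  # the socket is opened to the proxy, and HTTPS requests are then tunneled
  # to the original request host via set_tunnel() (an HTTP CONNECT).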

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    retries = 3
    while True:
      try:
        request_time = datetime.datetime.utcnow()
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays,
            request_time)
        return archived_http_response
      except Exception, e:
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, repr(e))
          continue
        logging.critical('Could not fetch %s: %s', request, repr(e))
        return None

class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, injector):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      injector: script injector to inject scripts into all pages
    """
    self.http_archive = http_archive
    # Do not resolve the host name to an IP when recording, to avoid an SSL3
    # handshake failure.
    # See https://github.com/chromium/web-page-replay/issues/73 for details.
    self.real_http_fetch = RealHttpFetch(real_dns_lookup=None)
    self.injector = injector

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse
    """
    # If the request is already in the archive, return the archived response.
    if request in self.http_archive:
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    else:
      response = self.real_http_fetch(request)
      if response is None:
        return None
      self.http_archive[request] = response
    if self.injector:
      response = _InjectScripts(response, self.injector)
    logging.debug('Recorded: %s', request)
    return response

class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, injector,
               use_diff_on_unknown_requests=False,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      injector: script injector to inject scripts into all pages
      use_diff_on_unknown_requests: If True, log unknown requests
          with a diff to requests that look similar.
      use_closest_match: If True, in replay mode, serve the closest match
          in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses before serving them.
    """
    self.http_archive = http_archive
    self.injector = injector
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      an instance of ArchivedHttpResponse (if found) or None
    """
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)
    response = self.http_archive.get(request)

    if self.use_closest_match and not response:
      closest_request = self.http_archive.find_closest_request(
          request, use_path=True)
      if closest_request:
        response = self.http_archive.get(closest_request)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, closest_request)

    if not response:
      reason = str(request)
      if self.use_diff_on_unknown_requests:
        diff = self.http_archive.diff(request)
        if diff:
          reason += (
              "\nNearest request diff "
              "('-' for archived request, '+' for current request):\n%s" % diff)
      logging.warning('Could not replay: %s', reason)
    else:
      if self.injector:
        response = _InjectScripts(response, self.injector)
      if self.scramble_images:
        response = _ScrambleImages(response)

    return response

class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               injector, use_diff_on_unknown_requests,
               use_record_mode, use_closest_match, scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      injector: function to inject scripts into all pages; takes the
          recording time as a datetime.datetime object.
      use_diff_on_unknown_requests: If True, log unknown requests
          with a diff to requests that look similar.
      use_record_mode: If True, start the server in record mode.
      use_closest_match: If True, in replay mode, serve the closest match
          in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses during replay.
    """
    self.http_archive = http_archive
    self.record_fetch = RecordHttpArchiveFetch(http_archive, injector)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, injector,
        use_diff_on_unknown_requests, use_closest_match, scramble_images)
    if use_record_mode:
      self.SetRecordMode()
    else:
      self.SetReplayMode()

  def SetRecordMode(self):
    self.fetch = self.record_fetch
    self.is_record_mode = True

  def SetReplayMode(self):
    self.fetch = self.replay_fetch
    self.is_record_mode = False

  def __call__(self, *args, **kwargs):
    """Forward calls to the Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)
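
# Illustrative usage sketch (hypothetical names; assumes an HttpArchive
# instance and a DNS-lookup callable are available):
#
#   fetch = ControllableHttpArchiveFetch(
#       archive, real_dns_lookup=my_dns_lookup, injector=None,
#       use_diff_on_unknown_requests=False, use_record_mode=True,
#       use_closest_match=False, scramble_images=False)
#   response = fetch(archived_request)  # records via RealHttpFetch
#   fetch.SetReplayMode()
#   response = fetch(archived_request)  # now served from the archive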