#!/usr/bin/env python
# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Retrieve web resources over http."""

import copy
import datetime
import httplib
import logging
import random
import ssl
import StringIO

import httparchive
import platformsettings
import script_injector

# PIL isn't always available, but we still want to be able to run without
# the image scrambling functionality in this case.
try:
  import Image
except ImportError:
  Image = None

TIMER = platformsettings.timer


class HttpClientException(Exception):
  """Base class for all exceptions in httpclient."""
  pass


def _InjectScripts(response, injector):
  """Injects the script generated by |injector| right after <head> or <html>.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
    injector: a function which generates a JavaScript string
      based on the recording time (e.g. "Math.random = function(){...}")
  Returns:
    an ArchivedHttpResponse
  """
  if isinstance(response, tuple):
    logging.warning('tuple response: %s', response)
  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('text/html'):
    text_chunks = response.get_data_as_chunks()
    text_chunks, just_injected = script_injector.InjectScript(
        text_chunks, 'text/html', injector(response.request_time))
    if just_injected:
      response = copy.deepcopy(response)
      response.set_data_from_chunks(text_chunks)
  return response
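

# A minimal sketch (not part of the original module) of what an |injector|
# callable looks like: it receives the recording time as a datetime.datetime
# and returns a JavaScript string to inject. The name and script body are
# illustrative assumptions.
#
#   def _ExampleInjector(request_time):
#     # Freeze Date.now() at the recorded timestamp for deterministic replay.
#     epoch_ms = int((request_time -
#                     datetime.datetime(1970, 1, 1)).total_seconds() * 1000)
#     return 'Date.now = function() { return %d; };' % epoch_ms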


def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """
  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      # str.decode() returns a new string, so the result must be kept.
      image_data = response.response_data[0].decode(encoding='base64')
      im = Image.open(StringIO.StringIO(image_data))

      # Shuffle the pixels to scramble the image while keeping its dimensions.
      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue().encode(encoding='base64')

      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # Malformed or unsupported images are passed through unmodified.
      pass

  return response


class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses
      The delay for the first body item should be recorded by the caller.
    """
    chunks = []
    delays = []
    if not self.chunked:
      chunks.append(self.read())
      delays.append(0)
    else:
      start = TIMER()
      try:
        while True:
          line = self.fp.readline()
          chunk_size = self._read_chunk_size(line)
          if chunk_size is None:
            raise httplib.IncompleteRead(''.join(chunks))
          if chunk_size == 0:
            break
          delays.append(TIMER() - start)
          chunks.append(self._safe_read(chunk_size))
          self._safe_read(2)  # skip the CRLF at the end of the chunk
          start = TIMER()

        # Ignore any trailers.
        while True:
          line = self.fp.readline()
          if not line or line == '\r\n':
            break
      finally:
        self.close()
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)
    except ValueError:
      return None
    return chunk_size
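
  # Example chunk-size lines and parse results (illustrative; int() tolerates
  # the trailing CRLF because it strips surrounding whitespace):
  #   _read_chunk_size('1a\r\n')        -> 26
  #   _read_chunk_size('1a;ext=v\r\n')  -> 26 (chunk-extension stripped)
  #   _read_chunk_size('bogus\r\n')     -> None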


class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  response_class = DetailedHTTPResponse


class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
  pass


class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  response_class = DetailedHTTPSResponse

  def __init__(self, host, port):
    # Opt out of certificate verification; see
    # https://www.python.org/dev/peps/pep-0476/#opting-out
    if hasattr(ssl, '_create_unverified_context'):
      httplib.HTTPSConnection.__init__(
          self, host=host, port=port, context=ssl._create_unverified_context())
    else:
      httplib.HTTPSConnection.__init__(self, host=host, port=port)


class RealHttpFetch(object):

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP, or None.
        If given, RealHttpFetch resolves the host name to the IP before
        making the request.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success, or None if the header
      is not in the expected format. header_name is in lowercase.
    """
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None
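
  # Illustrative results:
  #   _GetHeaderNameValue('Content-Length: 314')  -> ('content-length', '314')
  #   _GetHeaderNameValue('no colon here')        -> None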

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and converts it
    to a list of (header_name, header_value) tuples.
    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header:
      1. A normal header consists of two parts separated by a colon:
         "header_name:header_value..."
      2. A continuation line is a string starting with whitespace:
         "[whitespace]continued_header_value..."
    If a header is not in good shape, or an unexpected continuation line is
    seen, it will be ignored.

    Avoid using response.getheaders() directly, because it cannot handle
    multiple headers with the same name properly. Instead, parse
    response.msg.headers with this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples which looks like:
        [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value  # pylint: disable=unpacking-non-sequence
      all_headers.append((name, value))
    return all_headers
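
  # Illustrative behavior (header strings as they appear in msg.headers):
  #   ['Set-Cookie: a=1\r\n', 'Set-Cookie: b=2\r\n']
  #     -> [('set-cookie', 'a=1'), ('set-cookie', 'b=2')]  # duplicates kept
  #   ['X-Long: part1\r\n', ' part2\r\n']
  #     -> [('x-long', 'part1\n part2')]                   # folded with '\n '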

  @staticmethod
  def _get_request_host_port(request):
    host_parts = request.host.split(':')
    host = host_parts[0]
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host, port
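
  # Illustrative results:
  #   request.host == 'www.example.com:8080' -> ('www.example.com', 8080)
  #   request.host == 'www.example.com'      -> ('www.example.com', None)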

  @staticmethod
  def _get_system_proxy(is_ssl):
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for the host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if an HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    if self._real_dns_lookup:
      connection_ip = self._real_dns_lookup(connection_host)
      if not connection_ip:
        logging.critical(
            'Unable to find IP for host name: %s', connection_host)
        return None
      connection_host = connection_ip

    if is_ssl:
      connection = DetailedHTTPSConnection(connection_host, connection_port)
      if system_proxy:
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_host, connection_port)
    return connection

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    retries = 3
    while True:
      try:
        request_time = datetime.datetime.utcnow()
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays,
            request_time)
        return archived_http_response
      except Exception as e:
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, repr(e))
          continue
        logging.critical('Could not fetch %s: %s', request, repr(e))
        return None
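

# Illustrative usage (the variable names below are assumptions): an instance
# of RealHttpFetch is callable, taking an ArchivedHttpRequest and returning
# an ArchivedHttpResponse, or None once its retries are exhausted.
#
#   fetch = RealHttpFetch(real_dns_lookup=None)
#   archived_response = fetch(archived_request)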


class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, injector):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of HttpArchive
      injector: script injector to inject scripts in all pages
    """
    self.http_archive = http_archive
    # Do not resolve the host name to an IP when recording, to avoid an SSL3
    # handshake failure.
    # See https://github.com/chromium/web-page-replay/issues/73 for details.
    self.real_http_fetch = RealHttpFetch(real_dns_lookup=None)
    self.injector = injector

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse
    """
    # If the request is already in the archive, return the archived response.
    if request in self.http_archive:
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    else:
      response = self.real_http_fetch(request)
      if response is None:
        return None
      self.http_archive[request] = response
    if self.injector:
      response = _InjectScripts(response, self.injector)
    logging.debug('Recorded: %s', request)
    return response


class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, injector,
               use_diff_on_unknown_requests=False,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      injector: script injector to inject scripts in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_closest_match: If True, in replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses before serving them.
    """
    self.http_archive = http_archive
    self.injector = injector
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)

    response = self.http_archive.get(request)

    if self.use_closest_match and not response:
      closest_request = self.http_archive.find_closest_request(
          request, use_path=True)
      if closest_request:
        response = self.http_archive.get(closest_request)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, closest_request)

    if not response:
      reason = str(request)
      if self.use_diff_on_unknown_requests:
        diff = self.http_archive.diff(request)
        if diff:
          reason += (
              "\nNearest request diff "
              "('-' for archived request, '+' for current request):\n%s" % diff)
      logging.warning('Could not replay: %s', reason)
    else:
      if self.injector:
        response = _InjectScripts(response, self.injector)
      if self.scramble_images:
        response = _ScrambleImages(response)

    return response


class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               injector, use_diff_on_unknown_requests,
               use_record_mode, use_closest_match, scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      injector: function to inject scripts in all pages.
        Takes the recording time as a datetime.datetime object.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start the server in record mode.
      use_closest_match: If True, in replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses when replaying.
    """
    self.http_archive = http_archive
    self.record_fetch = RecordHttpArchiveFetch(http_archive, injector)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, injector,
        use_diff_on_unknown_requests, use_closest_match, scramble_images)
    if use_record_mode:
      self.SetRecordMode()
    else:
      self.SetReplayMode()

  def SetRecordMode(self):
    self.fetch = self.record_fetch
    self.is_record_mode = True

  def SetReplayMode(self):
    self.fetch = self.replay_fetch
    self.is_record_mode = False

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)
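

# Usage sketch (illustrative; the variable names and argument values below
# are assumptions, not part of this module):
#
#   fetch = ControllableHttpArchiveFetch(
#       http_archive, real_dns_lookup=None, injector=None,
#       use_diff_on_unknown_requests=False, use_record_mode=True,
#       use_closest_match=False, scramble_images=False)
#   response = fetch(request)  # records via RealHttpFetch
#   fetch.SetReplayMode()
#   response = fetch(request)  # now served from the archive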