Source code for page_exporter.utils

from django.utils import six
import os
import logging
import subprocess
from tempfile import NamedTemporaryFile
import json
from mimetypes import guess_type, guess_all_extensions
from django.core.exceptions import ValidationError
from django.core.urlresolvers import reverse
from django.core.validators import URLValidator
from page_exporter.config import conf


# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class UnsupportedImageFormat(Exception):
    """Raised when an image cannot be saved in the requested format."""
class CaptureError(Exception):
    """Raised when capturing or post-processing a page fails."""
def phantomjs_command_kwargs():
    """Build the keyword arguments passed to ``subprocess.Popen``.

    stdout and stderr are captured through pipes and decoded as text
    (``universal_newlines``). When ``conf.PHANTOMJS_CMD`` is set, the child
    process additionally receives a copy of the current environment whose
    ``PATH`` is extended with the directory of the PhantomJS executable.

    :returns: dict of ``Popen`` keyword arguments.
    """
    kwargs = {
        'stdout': subprocess.PIPE,
        'stderr': subprocess.PIPE,
        'universal_newlines': True,
    }
    phantom_js_cmd = conf.PHANTOMJS_CMD
    if phantom_js_cmd:
        # PATH entries must be directories: use the setting as-is only when
        # it is one, otherwise use the directory that contains the command.
        if os.path.isdir(phantom_js_cmd):
            extra_dir = phantom_js_cmd
        else:
            extra_dir = os.path.dirname(phantom_js_cmd)
        entries = [os.getenv('PATH', ''), extra_dir]

        # Start from the parent's environment: passing ``env`` replaces the
        # child's whole environment, so a PATH-only dict would strip every
        # other variable the executable may rely on.
        env = dict(os.environ)
        # os.pathsep keeps the separator correct on every platform; empty
        # entries are skipped so no implicit "current directory" slips in.
        env['PATH'] = os.pathsep.join(entry for entry in entries if entry)
        kwargs['env'] = env
    return kwargs
def phantomjs_command():
    """Return the base argument list: PhantomJS command plus capture script.

    A ``conf.CAPTURE_SCRIPT`` value starting with ``./`` is resolved against
    the ``scripts`` directory located next to this module; any other value
    is used unchanged.

    :returns: ``[conf.PHANTOMJS_CMD, <capture script path>]``.
    :raises AssertionError: if the capture script does not exist.
    """
    cmd = [conf.PHANTOMJS_CMD]

    capture = conf.CAPTURE_SCRIPT
    if capture.startswith('./'):
        app_path = os.path.dirname(__file__)
        capture = os.path.join(app_path, 'scripts', capture)

    # Raised explicitly rather than through an ``assert`` statement so the
    # check still runs under ``python -O``; the exception type is unchanged.
    if not os.path.exists(capture):
        raise AssertionError('Cannot find %s' % capture)

    return cmd + [capture]
# Base command line (PhantomJS command followed by the capture script),
# computed once when this module is imported. phantomjs_command() raises if
# the capture script cannot be found, so the import itself fails in that case.
PHANTOMJS_CMD = phantomjs_command()
def page_capture(stream, url, method=None, width=None, height=None,
                 selector=None, data=None, waitfor=None, size=None,
                 crop=None, render='png', wait=None, cookie_name=None,
                 cookie_value=None, cookie_domain=None, page_status=None):
    """Capture a web page using ``phantomjs``.

    The capture script is run as a subprocess and writes its result to a
    file. When ``stream`` is a string it is used as that file's path;
    otherwise a temporary file is used, its content is copied (or
    post-processed) into ``stream``, and the temporary file is removed.

    :param stream: output filename, or a writable binary file-like object.
    :param url: address handed to the capture script.
    :param method, width, height, selector, waitfor, wait, cookie_name,
        cookie_value, cookie_domain, page_status: when truthy, each is
        forwarded to the capture script as ``--<name>=<value>``.
    :param data: when truthy, forwarded JSON-encoded as ``--data="<json>"``.
    :param size: ``"<width>x<height>"`` string; when valid, the captured
        image is resized to it.
    :param crop: forwarded to :func:`image_postprocess` together with
        ``size``.
    :param render: output format; passed to the capture script as
        ``--format`` and normalised with :func:`parse_render` for
        post-processing.
    :raises CaptureError: if PhantomJS exits with a non-zero status, reports
        a ``FATAL`` message, or the image cannot be post-processed.
    """
    if isinstance(stream, six.string_types):
        output = stream
    else:
        # Only the name is needed: the file is closed right away and kept on
        # disk (delete=False) for PhantomJS to write into.
        with NamedTemporaryFile('wb+', suffix='.%s' % render,
                                delete=False) as f:
            output = f.name
    try:
        cmd = PHANTOMJS_CMD + [url, output]

        # Extra command-line options, in a fixed order; falsy values are
        # omitted entirely.
        cmd += ['--format=%s' % render]
        options = (
            ('method', method),
            ('width', width),
            ('height', height),
            ('selector', selector),
            ('data', '"%s"' % json.dumps(data) if data else None),
            ('waitfor', waitfor),
            ('wait', wait),
            ('cookie_name', cookie_name),
            ('cookie_value', cookie_value),
            ('cookie_domain', cookie_domain),
            ('page_status', page_status),
        )
        cmd += ['--%s=%s' % (name, value) for name, value in options if value]
        logger.debug(cmd)

        # Run PhantomJS process
        proc = subprocess.Popen(cmd, **phantomjs_command_kwargs())
        stdout, stderr = proc.communicate()
        # Any non-zero status is a failure: a child killed by a signal
        # reports a *negative* return code, which ``> 0`` would let through.
        if proc.returncode != 0:
            # Fall back to stderr so the error is not empty when the process
            # died before printing anything on stdout.
            raise CaptureError(stdout or stderr)
        process_phantomjs_stdout(stdout)

        size = parse_size(size)
        render = parse_render(render)

        # pdf isn't an image, therefore we can't postprocess it (not even
        # when a size was requested). png only needs it for resizing.
        if render != 'pdf' and (size or render != 'png'):
            image_postprocess(output, stream, size, crop, render)
        elif stream != output:
            # From file to stream
            with open(output, 'rb') as out:
                stream.write(out.read())
            stream.flush()
    finally:
        # Remove the temporary file; a caller-supplied filename is kept.
        if stream != output:
            os.unlink(output)
def process_phantomjs_stdout(stdout):
    """Parse and log the capture script output.

    Each line is expected to look like ``LEVEL:message``. ``FATAL`` lines
    are logged and abort processing, ``ERROR`` lines are logged as errors,
    and everything else is logged at info level. A line without a colon is
    treated as an ``INFO`` message made of the whole line.

    :param stdout: text written by the capture script.
    :raises CaptureError: on the first ``FATAL`` line.
    """
    for line in stdout.splitlines():
        level, separator, msg = line.partition(':')
        if not separator:
            # No "LEVEL:" prefix: log the line itself (as a string, not as
            # the one-element list produced by splitting it).
            level, msg = 'INFO', line
        if level == 'FATAL':
            logger.fatal(msg)
            raise CaptureError(msg)
        elif level == 'ERROR':
            logger.error(msg)
        else:
            logger.info(msg)
def image_mimetype(render):
    """Return the MIME type matching the ``render`` format.

    ``render`` is normalised with :func:`parse_render` first.
    """
    fmt = parse_render(render)
    if fmt != 'bmp':
        mimetype, _encoding = guess_type('foo.%s' % fmt)
        return mimetype
    # Most web browsers don't support 'image/x-ms-bmp', so bmp is reported
    # with a fixed value instead of being looked up.
    return 'image/bmp'
def parse_url(request, url):
    """Turn the ``url`` parameter into an absolute URL.

    A value accepted by ``URLValidator`` is returned unchanged. Otherwise a
    value starting with ``/`` is prefixed with the request's scheme and
    host, and any other value is passed to ``reverse()`` and made absolute
    with ``request.build_absolute_uri``.
    """
    try:
        URLValidator()(url)
    except ValidationError:
        pass
    else:
        return url

    if not url.startswith('/'):
        return request.build_absolute_uri(reverse(url))

    host = request.get_host()
    scheme = 'https' if request.is_secure() else 'http'
    return '{scheme}://{host}{uri}'.format(scheme=scheme, host=host, uri=url)
def parse_render(render):
    """Normalise a requested output format to a canonical name.

    :param render: format name or file extension (case-insensitive), e.g.
        ``'jpg'``, ``'PNG'``, ``'tif'``; may be empty or ``None``.
    :returns: one of ``'jpeg'``, ``'png'``, ``'gif'``, ``'bmp'``, ``'tiff'``,
        ``'xbm'`` or ``'pdf'``; ``'png'`` when ``render`` is empty or not
        recognised.
    """
    # Canonical name -> MIME types whose known extensions map to it.
    # bmp lists both names because the one registered in the mimetypes
    # database differs between Python versions and system mime files.
    mime_types_by_format = (
        ('jpeg', ('image/jpeg',)),
        ('png', ('image/png',)),
        ('gif', ('image/gif',)),
        ('bmp', ('image/bmp', 'image/x-ms-bmp')),
        ('tiff', ('image/tiff',)),
        ('xbm', ('image/x-xbitmap',)),
        ('pdf', ('application/pdf',)),
    )
    if not render:
        return 'png'

    render = render.lower()
    # A canonical name is always accepted, whatever the local mimetypes
    # database happens to contain.
    if render in [name for name, _ in mime_types_by_format]:
        return render

    extension = '.%s' % render
    for name, mime_types in mime_types_by_format:
        for mime_type in mime_types:
            if extension in guess_all_extensions(mime_type):
                return name
    return 'png'
def parse_size(size_raw):
    """Parse a ``"<width>x<height>"`` string into a ``(width, height)`` tuple.

    :param size_raw: size string, e.g. ``'800x600'`` (the separator is
        case-insensitive); anything else yields ``None``.
    :returns: tuple of two positive integers, or ``None`` when ``size_raw``
        is not a string, does not have exactly two ``x``-separated fields,
        or either field is not a positive integer.
    """
    try:
        width_str, height_str = size_raw.lower().split('x')
    except (AttributeError, ValueError):
        # Not a string, or not exactly two fields.
        return None

    try:
        width, height = int(width_str), int(height_str)
    except ValueError:
        return None

    # Explicit comparison instead of ``assert`` so that zero and negative
    # dimensions are still rejected when Python runs with -O.
    if width <= 0 or height <= 0:
        return None
    return width, height
def image_postprocess(imagefile, output, size, crop, render):
    """Resize and crop a captured image, then save it to ``output``.

    :param imagefile: path (or file object) of the captured image.
    :param output: destination; either a filename or a file-like object.
    :param size: ``(width, height)`` tuple, or a falsy value to keep the
        captured dimensions.
    :param crop: ``'true'`` (any case) or ``True`` to scale the image to the
        requested width and cut it to the requested height instead of
        stretching it; only applies when the scaled image is taller than
        requested.
    :param render: format name used when saving the image.
    :raises UnsupportedImageFormat: if the image library does not know
        ``render``.
    :raises CaptureError: if resizing or saving fails with an I/O error.
    """
    try:
        from PIL import Image
    except ImportError:
        import Image

    # ``ANTIALIAS`` is the historical alias of the LANCZOS filter and is
    # missing from recent Pillow releases; prefer LANCZOS and fall back to
    # the old name for libraries that predate it.
    resample = getattr(Image, 'LANCZOS', None)
    if resample is None:
        resample = Image.ANTIALIAS

    img = Image.open(imagefile)
    size_crop = None
    img_resized = img

    # str() lets callers pass a real boolean as well as the 'true' string.
    if size and crop and str(crop).lower() == 'true':
        width_raw, height_raw = img.size
        width, height = size
        # Height the image gets when scaled to the requested width with its
        # aspect ratio preserved.
        height_better = int(height_raw * (float(width) / width_raw))
        if height < height_better:
            size_crop = (0, 0, width, height)

    try:
        if size_crop:
            size_better = width, height_better
            img_better = img.resize(size_better, resample)
            img_resized = img_better.crop(size_crop)
        elif size:
            img_resized = img.resize(size, resample)

        # If save with 'bmp' use default mode('RGBA'), it will raise:
        # "IOError: cannot write mode RGBA as BMP".
        # So, we need convert image mode
        # from 'RGBA' to 'RGB' for 'bmp' format.
        if render == 'bmp':
            img_resized = img_resized.convert('RGB')
        # Fix IOError: cannot write mode RGBA as XBM
        elif render == 'xbm':
            img_resized = img_resized.convert('1')

        # Works with either filename or file-like object
        img_resized.save(output, render)
    except KeyError:
        raise UnsupportedImageFormat
    except IOError as e:
        raise CaptureError(e)