# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import hashlib
import json
import os

import requests

# ==================== Documents digests

def _read_lines_with_prefix(document, position, prefix):
    """
    Starting from the given position, parses complete lines from the document
    (with the '\n' character at the end) that start with the given prefix.
    The parser stops at the first line that does not start with the given
    prefix or when there are no more '\n' characters in the document.

    @param document: a document to parse
    @param position: an offset in the document to start from
    @param prefix: a prefix (bytes) that every parsed line must start with

    @returns a pair (lines, position), where the first element is a list of
            parsed lines (with the '\n' character at the end) and the second
            element is a new offset in the document, pointing at the first
            character after the last parsed line

    """
    lines = []
    while document.startswith(prefix, position):
        position_next_line = document.find(b'\n', position + len(prefix))
        if position_next_line < 0:
            break
        position_next_line += 1  # to eat the '\n' character
        lines.append(document[position:position_next_line])
        position = position_next_line
    return lines, position


def _process_PJL_headers(doc, position, out):
    """
    Tries to find PJL headers in the given document and processes them as
    described in the _normalize_document(doc) function.

    @param doc: see the description of _normalize_document(doc)
    @param position: an offset in the document; defines the part of the
            document that is already processed; searching for headers starts
            from this position
    @param out: the already processed part of the document (from the
            beginning to the given position)

    @returns a new position and output; the position is set at the end of the
            last processed PJL header, or is a copy of the input position if
            no PJL headers have been found; the output is adjusted
            accordingly

    """
    PJL_MARKER = b'\x1B%-12345X'
    MARGIN = 2048  # max distance to the header
    position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    while position_pjl >= 0:
        out += doc[position:(position_pjl + len(PJL_MARKER))]
        position = position_pjl + len(PJL_MARKER)
        # parse the header and filter out problematic lines
        lines, position = _read_lines_with_prefix(doc, position, b'@PJL')
        for line in lines:
            if not (line.startswith(b'@PJL SET ')
                    or line.startswith(b'@PJL COMMENT')
                    or line.startswith(b'@PJL DMINFO')
                    or line.startswith(b'@PJL JOB NAME')
                    or line.startswith(b'@PJL JOBNAME')):
                out += line
        # try to find the next PJL header
        position_pjl = doc.find(PJL_MARKER, position, position + MARGIN)
    return position, out
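

# A minimal, illustrative sketch of _process_PJL_headers() in action; the
# sample bytes below are hypothetical, not a real printer capture, and this
# helper is not called anywhere.
def _demo_process_PJL_headers():
    doc = (b'\x1B%-12345X'  # PJL marker
           b'@PJL JOB NAME="x"\n'  # volatile line, filtered out
           b'@PJL ENTER LANGUAGE=PCL\n'  # kept
           b'data')
    position, out = _process_PJL_headers(doc, 0, b'')
    assert out == b'\x1B%-12345X@PJL ENTER LANGUAGE=PCL\n'
    assert doc[position:] == b'data'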


def _process_PS_Adobe_headers(doc, position, out):
    """
    Tries to find PS-Adobe headers in the given document and processes them
    as described in the _normalize_document(doc) function.

    @param doc: see the description of _normalize_document(doc)
    @param position: an offset in the document; defines the part of the
            document that is already processed; searching for headers starts
            from this position
    @param out: the already processed part of the document (from the
            beginning to the given position)

    @returns a new position and output; the position is set at the end of the
            last processed PS-Adobe header, or is a copy of the input
            position if no PS-Adobe headers have been found; the output is
            adjusted accordingly

    """
    PS_MARKER = b'%!PS-Adobe'
    MARGIN = 2048  # max distance to the header
    position_ps = doc.find(PS_MARKER, position, position + MARGIN)
    while position_ps >= 0:
        # add everything till the end of the first line of the header
        position_next_line = doc.find(b'\n', position_ps + len(PS_MARKER))
        if position_next_line < 0:
            break  # no more '\n', we finish the parsing here
        position_next_line += 1  # to eat the '\n' character
        out += doc[position:position_next_line]
        # parse the rest of the header and filter out problematic lines
        lines, position = _read_lines_with_prefix(doc, position_next_line,
                                                  b'%')
        for line in lines:
            if not (line.startswith(b'%%Title:')
                    or line.startswith(b'%%For:')):
                out += line
        # search for lines with '{setuserinfo}' or '/JobInfo <<'
        position_ps = doc.find(PS_MARKER, position, position + MARGIN)
        position_ui = doc.find(b'{setuserinfo}', position, position + MARGIN)
        position_ji = doc.find(b'/JobInfo <<', position, position + MARGIN)
        # if '/JobInfo <<' was found, move the offset to the end of the
        # section
        if position_ji >= 0:
            position_ji = doc.find(b'>>', position_ji)
        # if the beginning of the next header was found, make sure that the
        # detected sections do not belong to the next header
        if position_ps >= 0:
            if position_ji > position_ps:
                position_ji = -1
            if position_ui > position_ps:
                position_ui = -1
        # choose the farthest section
        position_end = max(position_ji, position_ui)
        if position_end >= 0:
            # find the first '\n' after the farthest section
            position_end = doc.find(b'\n', position_end)
            if position_end < 0:
                break  # no more '\n', we finish the parsing here
            # split everything from here to the end of the section into lines
            lines = doc[position:position_end].split(b'\n')
            position = position_end + 1  # +1 is needed to eat the last '\n'
            # filter out problematic lines
            for line in lines:
                if not (line.find(b'{setuserinfo}') >= 0
                        or line.find(b'/UserID') >= 0
                        or line.find(b'/Time') >= 0
                        or line.find(b'/HostLoginName') >= 0
                        or line.find(b'/HostName') >= 0):
                    out += line + b'\n'
        # go to the next iteration, position_ps is already set
    return position, out
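

# A minimal, illustrative sketch of _process_PS_Adobe_headers(); the sample
# bytes below are hypothetical, not a real PostScript job, and this helper
# is not called anywhere.
def _demo_process_PS_Adobe_headers():
    doc = (b'%!PS-Adobe-3.0\n'
           b'%%Title: secret.pdf\n'  # volatile line, filtered out
           b'%%Pages: 1\n'  # kept
           b'showpage\n')
    position, out = _process_PS_Adobe_headers(doc, 0, b'')
    assert out == b'%!PS-Adobe-3.0\n%%Pages: 1\n'
    assert doc[position:] == b'showpage\n'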


def _normalize_LIDIL(doc):
    """
    Tries to process the given document as described in the
    _normalize_document(doc) function, assuming that the document is in the
    LIDIL format. This format is used by some HP printers.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in the LIDIL format;
            otherwise, returns the result of the _normalize_document(doc)
            function

    """
    LIDIL_MARKER = b'\x24\x01\x00\x00\x07\x00\x00\x00'
    LIDIL_JOBID_1_OFF = 2348  # first job id, offset from the beginning
    LIDIL_JOBID_2_OFF = 2339  # second job id, offset from the end
    JOBID_SIZE = 4  # number of bytes used to store a job id
    # the document is in the LIDIL format <=> it starts with the marker
    if not doc.startswith(LIDIL_MARKER):
        return None
    # remove both job ids and exit
    nd = len(doc)
    if nd > LIDIL_JOBID_1_OFF + LIDIL_JOBID_2_OFF + 2 * JOBID_SIZE:
        doc = b''.join([
                doc[:LIDIL_JOBID_1_OFF],
                doc[(LIDIL_JOBID_1_OFF + JOBID_SIZE):(nd - LIDIL_JOBID_2_OFF)],
                doc[(nd - LIDIL_JOBID_2_OFF + JOBID_SIZE):]
        ])
    return doc


def _normalize_EJL(doc):
    """
    Tries to process the given document as described in the
    _normalize_document(doc) function, assuming that the document is in the
    EJL format.

    @param doc: see the description of _normalize_document(doc)

    @returns None if the given document is not in the EJL format; otherwise,
            returns the result of the _normalize_document(doc) function

    """
    # EJL - some Epson printers (like eplaser)
    EJL_MARKER = b'\x1B\x01@EJL \n'
    # the document is in the EJL format <=> it starts with the marker
    if not doc.startswith(EJL_MARKER):
        return None
    # copy the document to the output; filter lines parsed from the EJL
    # header
    out = EJL_MARKER
    lines, position = _read_lines_with_prefix(doc, len(EJL_MARKER), b'@EJL')
    for line in lines:
        if not (line.startswith(b'@EJL JI ID=')
                or line.startswith(b'@EJL JI USER=')):
            out += line
    # add the rest of the document and exit
    out += doc[position:]
    return out


def _normalize_document(doc):
    """
    The input document is a raw package sent to a printer. This function
    removes from it all variables that can change when the same content is
    printed. That includes, but is not limited to: user name, host name,
    job id, date, and time.

    @param doc: a raw document sent directly to a printer to be printed
            (bytes)

    @returns a copy of doc (bytes) with removed fragments that can vary
            between print jobs. The returned output is supposed to be
            identical for the same input content sent to the pipeline with
            the same PPD file.

    """
    # Try to parse the document as LIDIL or EJL and exit if successful.
    out = _normalize_LIDIL(doc)
    if out is not None:
        return out
    out = _normalize_EJL(doc)
    if out is not None:
        return out

    # Try to parse and process PJL and PS headers.
    position = 0
    out = b''
    position, out = _process_PJL_headers(doc, position, out)
    position, out = _process_PS_Adobe_headers(doc, position, out)

    # Go to the tail of the document, add the skipped content to the output.
    if position + 2048 < len(doc):
        position_tail = len(doc) - 2048
        out += doc[position:position_tail]
        position = position_tail

    # Try to find 'trailer << '.
    position_trailer = doc.find(b'trailer << ', position)
    if position_trailer >= 0:
        # If found, prune the line with it.
        position_end = doc.find(b'\n', position_trailer)
        if position_end >= 0:
            out += doc[position:position_trailer]
            position = position_end + 1  # +1 to omit '\n' from the trailer

    # Add the rest of the document to the output.
    out += doc[position:]

    return out


def calculate_digest(doc):
    """
    Calculates a digest of the given document.
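

# A minimal, illustrative sketch of the end-to-end property that
# calculate_digest() is built for: two jobs that differ only in a volatile
# PJL field should produce the same digest. The sample bytes are
# hypothetical, and this helper is not called anywhere.
def _demo_calculate_digest():
    doc_a = b'\x1B%-12345X@PJL JOB NAME="a"\n@PJL ENTER LANGUAGE=PCL\ndata'
    doc_b = b'\x1B%-12345X@PJL JOB NAME="b"\n@PJL ENTER LANGUAGE=PCL\ndata'
    assert calculate_digest(doc_a) == calculate_digest(doc_b)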

    @param doc: the document's content (bytes)

    @returns the calculated digest as a string of hexadecimal digits

    """
    # Prune the variable parts of the document.
    out = _normalize_document(doc)

    # Calculate the hash.
    return hashlib.md5(out).hexdigest()


def parse_digests_file(path_digests, denylist):
    """
    Parses digests and output sizes from a file.

    @param path_digests: a path to a file with digests
    @param denylist: a list of keys to omit

    @returns two dictionaries, both indexed by PPD filenames: the first one
            contains digests, the second one contains output sizes; returns
            empty dictionaries if the given file does not exist

    """
    digests = dict()
    sizes = dict()
    denylist = set(denylist)
    if os.path.isfile(path_digests):
        with open(path_digests, 'r') as file_digests:
            lines = file_digests.read().splitlines()
        for line in lines:
            cols = line.split()
            if len(cols) >= 2 and cols[0] not in denylist:
                digests[cols[0]] = cols[1]
                if len(cols) > 2 and len(cols[2]) > 0:
                    sizes[cols[0]] = int(cols[2])
    return digests, sizes


def save_digests_file(path_digests, digests, sizes, denylist):
    """
    Saves the given digests and output sizes to a file.

    @param path_digests: a path to the output file
    @param digests: a dictionary with digests (keys are names)
    @param sizes: a dictionary with output sizes (keys are names)
    @param denylist: a list of keys to ignore

    """
    digests_content = ''
    names = sorted(set(digests.keys()).difference(denylist))
    for name in names:
        digest = digests[name]
        assert name.find('\t') < 0 and name.find('\n') < 0
        assert digest.find('\t') < 0 and digest.find('\n') < 0
        digests_content += name + '\t' + digest
        if name in sizes:
            assert isinstance(sizes[name], int)
            digests_content += '\t' + str(sizes[name])
        digests_content += '\n'

    with open(path_digests, 'wb') as file_digests:
        file_digests.write(digests_content.encode("utf-8"))


def load_lines_from_file(path):
    """
    Loads strings stored in the given file as separate lines.

    This routine returns lines read from the given file. All leading and
    trailing whitespace characters in each line are removed. Lines consisting
    of whitespace characters only are skipped.

    @param path: a path to the input file

    @returns a list of non-empty strings

    """
    with open(path) as input_file:
        lines = input_file.readlines()

    output_list = []
    for entry in lines:
        entry = entry.strip()
        if entry != '':
            output_list.append(entry)

    return output_list
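

# A minimal, illustrative round trip through the two digests-file helpers
# above; the filename, digest, and size below are hypothetical, and this
# helper is not called anywhere.
def _demo_digests_file_round_trip(path):
    digests = {'printer.ppd': 'd41d8cd98f00b204e9800998ecf8427e'}
    sizes = {'printer.ppd': 12345}
    save_digests_file(path, digests, sizes, [])
    assert parse_digests_file(path, []) == (digests, sizes)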


# ===================== PPD files on the SCS server

def get_filenames_from_PPD_index(task_id):
    """
    Downloads an index file from the SCS server and extracts the names of
    PPD files from it.

    @param task_id: the number of the index file to process; this is an
            integer from the interval [0..20)

    @returns a list of PPD filenames (may contain duplicates)

    """
    # calculate the URL of the index file
    url_metadata = 'https://www.gstatic.com/chromeos_printing/metadata_v2/'
    url_ppd_index = url_metadata + ('index-%02d.json' % task_id)
    # download and parse the index file
    request = requests.get(url_ppd_index)
    entries = json.loads(request.content)
    # extract PPD filenames (the second element in each index entry)
    output = []
    for entry in entries:
        output.append(entry[1])
    # return a list of extracted filenames
    return output


def download_PPD_file(ppd_file):
    """
    Downloads a PPD file from the SCS server.

    @param ppd_file: a filename of the PPD file (neither a path nor a URL)

    @returns the content of the PPD file

    """
    url_ppds = 'https://www.gstatic.com/chromeos_printing/ppds/'
    request = requests.get(url_ppds + ppd_file)
    return request.content


# ==================== Local filesystem

def list_entries_from_directory(
        path,
        with_suffixes=None, nonempty_results=False,
        include_files=True, include_directories=True):
    """
    Returns all entries from the given directory. The results may be filtered
    by filename suffixes or entry types.

    @param path: a path to the directory to list entries from
    @param with_suffixes: if set, only entries with the given suffixes are
            returned; it must be a tuple
    @param nonempty_results: if True, an Exception is raised when there are
            no results
    @param include_files: if False, regular files and links are omitted
    @param include_directories: if False, directories are omitted

    @returns a list of entries meeting the given criteria

    @raises Exception if no matching entries were found and nonempty_results
            is set to True

    """
    # list all entries from the directory and filter them by the given
    # criteria
    list_of_files = []
    for filename in os.listdir(path):
        path_entry = os.path.join(path, filename)
        # check the type of the entry
        if os.path.isfile(path_entry):
            if not include_files:
                continue
        elif os.path.isdir(path_entry):
            if not include_directories:
                continue
        else:
            continue
        # check the suffix
        if with_suffixes is not None:
            if not filename.endswith(with_suffixes):
                continue
        list_of_files.append(filename)
    # raise an exception if no matching entries were found
    if nonempty_results and len(list_of_files) == 0:
        message = 'Directory %s does not contain any ' % path
        message += 'entries meeting the criteria'
        raise Exception(message)
    return list_of_files
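

# A minimal, illustrative sketch of list_entries_from_directory(): it lists
# regular files ending with '.ppd' in a hypothetical directory. This helper
# is not called anywhere.
def _demo_list_ppd_files(path):
    return list_entries_from_directory(
            path, with_suffixes=('.ppd',), nonempty_results=True,
            include_directories=False)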