#!/usr/bin/python3

# Copyright (c) 2009, Morten Kristensen (msk@nullpointer.dk)
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials
#       provided with the distribution.
#     * Neither the name of the Nullpointer.dk nor the names of its
#       contributors may be used to endorse or promote products
#       derived from this software without specific prior written
#       permission.
# 
# THIS SOFTWARE IS PROVIDED BY Morten Kristensen (msk@nullpointer.dk)
# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Morten
# Kristensen (msk@nullpointer.dk) BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

# usage: ./md5_evilize.py <mode> <file1> <file2> (<file3> <file4>)
# <mode> can be the following:
#   pack        Then <file1> is the 'good' file and <file2>
#               is the 'evil' file. Two packed files will be
#               created; the 'good' (<file3>) and the 'bad'
#               (<file4>).
#   extract     Then <file1> is the packed input-file and
#               <file2> the output-file.

# The MD5 hash-algorithm is collision prone since the following holds:

# let h1 = md5(x), h1' = md5(x+z), h2 = md5(y), h2' = md5(y+z)
# if h1 == h2
# then h1' == h2'

# This means that if you have a collision (two different strings that
# yield the same hash) then you can easily produce a new collision.
# I'll use this fact to create two files which both have the
# same md5-hash and size. The extract feature will then extract the
# contents of the packed files. If it is the "good" file then the
# "good" code will be extracted and the "bad" code otherwise. This is
# determined by the vector written to the packed files first. Beneath
# are the two vectors used (v1 and v2). They both yield the same
# md5-hash but they are different. The pointers beneath mark where
# they differ.

import sys

# 128-byte vector that pack() writes at the start of the "good" packed
# file; extract() returns the good payload when a file begins with it.
# It differs from v2 at byte offsets 19, 45, 59, 83, 109 and 123, where
# the most significant bit is flipped (e.g. 0x87 here vs 0x07 in v2).
# Only the first of those positions is marked with "<-" in this list.
v1 = [0xd1, 0x31, 0xdd, 0x02, 0xc5, 0xe6, 0xee, 0xc4, 0x69, 0x3d,
  0x9a, 0x06, 0x98, 0xaf, 0xf9, 0x5c, 0x2f, 0xca, 0xb5, 0x87, # <-
  0x12, 0x46, 0x7e, 0xab, 0x40, 0x04, 0x58, 0x3e, 0xb8, 0xfb, 0x7f,
  0x89, 0x55, 0xad, 0x34, 0x06, 0x09, 0xf4, 0xb3, 0x02, 0x83, 0xe4,
  0x88, 0x83, 0x25, 0x71, 0x41, 0x5a, 0x08, 0x51, 0x25, 0xe8, 0xf7,
  0xcd, 0xc9, 0x9f, 0xd9, 0x1d, 0xbd, 0xf2, 0x80, 0x37, 0x3c, 0x5b,
  0xd8, 0x82, 0x3e, 0x31, 0x56, 0x34, 0x8f, 0x5b, 0xae, 0x6d, 0xac,
  0xd4, 0x36, 0xc9, 0x19, 0xc6, 0xdd, 0x53, 0xe2, 0xb4, 0x87, 0xda,
  0x03, 0xfd, 0x02, 0x39, 0x63, 0x06, 0xd2, 0x48, 0xcd, 0xa0, 0xe9,
  0x9f, 0x33, 0x42, 0x0f, 0x57, 0x7e, 0xe8, 0xce, 0x54, 0xb6, 0x70,
  0x80, 0xa8, 0x0d, 0x1e, 0xc6, 0x98, 0x21, 0xbc, 0xb6, 0xa8, 0x83,
  0x93, 0x96, 0xf9, 0x65, 0x2b, 0x6f, 0xf7, 0x2a, 0x70]

# 128-byte vector that pack() writes at the start of the "evil" packed
# file; extract() returns the evil payload when a file begins with it.
# Each "<-" marks a line whose last byte differs from the byte at the
# same offset in v1 (offsets 19, 45, 59, 83, 109 and 123); in every case
# only the most significant bit of the byte is flipped.
v2 = [0xd1, 0x31, 0xdd, 0x02, 0xc5, 0xe6, 0xee, 0xc4, 0x69, 0x3d,
   0x9a, 0x06, 0x98, 0xaf, 0xf9, 0x5c, 0x2f, 0xca, 0xb5 , 0x07, # <-
   0x12, 0x46, 0x7e, 0xab, 0x40, 0x04, 0x58, 0x3e , 0xb8, 0xfb, 0x7f,
   0x89, 0x55, 0xad, 0x34, 0x06, 0x09, 0xf4, 0xb3, 0x02, 0x83, 0xe4,
   0x88, 0x83, 0x25, 0xf1, # <-
   0x41, 0x5a, 0x08, 0x51, 0x25, 0xe8, 0xf7, 0xcd, 0xc9, 0x9f , 0xd9,
   0x1d, 0xbd, 0x72, # <-
   0x80, 0x37, 0x3c, 0x5b, 0xd8, 0x82, 0x3e, 0x31, 0x56, 0x34, 0x8f,
   0x5b, 0xae, 0x6d, 0xac , 0xd4, 0x36, 0xc9, 0x19, 0xc6, 0xdd, 0x53,
   0xe2, 0x34, # <-
   0x87, 0xda, 0x03, 0xfd, 0x02, 0x39, 0x63, 0x06, 0xd2, 0x48, 0xcd,
   0xa0, 0xe9, 0x9f, 0x33, 0x42, 0x0f, 0x57, 0x7e , 0xe8, 0xce, 0x54,
   0xb6, 0x70, 0x80, 0x28, # <-
   0x0d, 0x1e, 0xc6, 0x98, 0x21, 0xbc, 0xb6, 0xa8, 0x83, 0x93, 0x96 ,
   0xf9, 0x65, 0xab, # <-
   0x6f, 0xf7, 0x2a, 0x70]

def to_bin(vector):
    """Return a new bytearray holding the integers of *vector* in order.

    Each element must be an integer in range(0, 256); anything else is
    rejected by the bytearray itself.
    """
    packed = bytearray()
    # extend() consumes the iterable element by element, just like
    # appending each value in turn.
    packed.extend(vector)
    return packed

def zero_pad(data, block=16):
    """Left-pad the string *data* with "0" characters up to *block* characters.

    Raises ValueError when *data* is already longer than *block*.
    Note that the limit is counted in characters of *data*, even though
    the error message speaks of bits.
    """
    missing = block - len(data)
    if missing < 0:
        raise ValueError("Data size cannot exceed %i bits in representation." % block)
    return "0" * missing + data

def pack(vector, good_data, evil_data, output):
    """Write one packed file to *output*.

    Layout, in write order:
      1. the bytes of *vector* (128 bytes for v1/v2),
      2. the length of *good_data* as 16 zero-padded hex characters,
      3. the length of *evil_data* in the same form,
      4. *good_data* followed by *evil_data*.

    *output* only needs a write() method accepting bytes-like objects.
    """
    write = output.write
    payloads = (good_data, evil_data)

    # leading vector decides which payload extract() will pick
    write(to_bin(vector))

    # fixed-width length fields, good first, then evil
    for payload in payloads:
        write(zero_pad(format(len(payload), "x")).encode())

    # the payloads themselves, back to back
    for payload in payloads:
        write(payload)

def extract(data):
    """Return the payload selected by the leading vector of packed *data*.

    *data* is the full content of a packed file: a 128-byte vector, two
    16-character hexadecimal length fields (good length, then evil
    length) and the good and evil payloads back to back.

    Returns the good payload when the vector equals v1 and the evil
    payload when it equals v2.

    Raises ValueError when *data* is shorter than 162 bytes, when a
    length field is not valid hexadecimal, when the declared lengths are
    negative or reach past the end of *data*, or when the vector matches
    neither v1 nor v2.
    """
    if len(data) < 128+16+16+1+1:
        raise ValueError("Data has to consist of at least 128+2x16+1+1 = 162 bytes")

    # extract the vector and the length of the good and bad file
    vec = data[0:128] # 128 bytes
    glen = int(data[128:144], 16) # 16 bytes
    elen = int(data[144:160], 16) # 16 bytes

    # int() accepts a leading sign, and nothing guarantees the fields
    # describe data that is really there; slicing would silently return
    # a wrong or truncated payload, so reject such headers explicitly.
    if glen < 0 or elen < 0 or 160 + glen + elen > len(data):
        raise ValueError("Declared payload lengths do not fit the packed data")

    # good file
    if vec == to_bin(v1):
        return data[160:160+glen]

    # evil file
    if vec == to_bin(v2):
        offset = 160+glen
        return data[offset:offset+elen]

    # Falling off the end would hand None to the caller, which then fails
    # with an unrelated TypeError when writing the result.
    raise ValueError("Data does not start with a known vector")

def usage():
    """Print the command-line help text to standard output."""
    help_lines = (
        "usage: %s <mode> <file1> <file2> (<file3> <file4>)" % sys.argv[0],
        "<mode> can be the following:",
        "  pack        Then <file1> is the 'good' file and <file2>",
        "              is the 'evil' file. Two packed files will be",
        "              created; the 'good' (<file3>) and the 'bad'",
        "              (<file4>).",
        "  extract     Then <file1> is the packed input-file and",
        "              <file2> the output-file.",
    )
    for text in help_lines:
        print(text)

if __name__ == "__main__":
    # Command-line entry point: dispatch on <mode> and the argument count.
    # sys.exit() is used instead of the bare exit() helper, which is only
    # installed by the site module and is meant for interactive sessions.
    args = len(sys.argv)
    if args not in (4, 6):
        usage()
        sys.exit(-1)

    mode = sys.argv[1]
    if mode == "pack" and args == 6:
        # Read both inputs completely before any output file is created,
        # so an output path that equals an input path cannot truncate
        # data that has not been read yet. The with-blocks close every
        # file even when an error occurs.
        with open(sys.argv[2], "rb") as good_in:
            gdata = good_in.read()
        with open(sys.argv[3], "rb") as evil_in:
            edata = evil_in.read()

        # pack data to output files; both get the same payloads and
        # differ only in the leading vector
        with open(sys.argv[4], "wb") as good_out:
            pack(v1, gdata, edata, good_out)
        with open(sys.argv[5], "wb") as evil_out:
            pack(v2, gdata, edata, evil_out)
    elif mode == "extract" and args == 4:
        with open(sys.argv[2], "rb") as f_in:
            packed = f_in.read()

        # Extract before opening the output file, so a failing extraction
        # does not leave behind (or truncate) the output file.
        payload = extract(packed)

        with open(sys.argv[3], "wb") as f_out:
            f_out.write(payload)
    else:
        usage()
        sys.exit(-1)

