#!/usr/bin/env python 

'''
Break repeating-key XOR
~~~~~~~~~~~~~~~~~~~~~~~~

a. Let KEYSIZE be the guessed length of the key; try values from 2 to
(say) 40.

b. Write a function to compute the edit distance/Hamming distance
between two strings. The Hamming distance is just the number of
differing bits. The distance between:

  this is a test

and:

  wokka wokka!!!

is 37.

c. For each KEYSIZE, take the FIRST KEYSIZE worth of bytes, and the
SECOND KEYSIZE worth of bytes, and find the edit distance between
them. Normalize this result by dividing by KEYSIZE.

d. The KEYSIZE with the smallest normalized edit distance is probably
the key. You could proceed perhaps with the smallest 2-3 KEYSIZE
values. Or take 4 KEYSIZE blocks instead of 2 and average the
distances.

e. Now that you probably know the KEYSIZE: break the ciphertext into
blocks of KEYSIZE length.

f. Now transpose the blocks: make a block that is the first byte of
every block, and a block that is the second byte of every block, and
so on.

g. Solve each block as if it was single-character XOR. You already
have code to do this.

h. For each block, the single-byte XOR key that produces the best
looking histogram is the repeating-key XOR key byte for that
block. Put them together and you have the key.
'''

import base64
import sys

# Return the number of different bits netween two 32 bits numbers
def dif(a, b, bits = 32):
    x = a ^ b
    return sum((x>>i & 1) for i in xrange(bits))    

# Compute the edit/Hamming distance between two strings (of equal lengths)
# Returns number of differing bits
def d_ham(s1, s2):
    return sum(dif(ord(s1[i]), ord(s2[i])) for i in xrange(len(s1)))

# Used below for single-char xor decoding (step f.)
def transpose(blocks):
    t = []
    for i in xrange(len(blocks[0])):
        t1 = []
        for j in xrange(len(blocks)):
            t1.append(blocks[j][i])
        t.append("".join(t1))
    return t

charset = "abcdefghijklmnopqrstuvwxyz "
charset_up = charset.upper()

# Simple scoring method based on character frequency
def get_score(text):
    # Score: number of valid characters
    s = 0
    for l in text:
        if (l in charset) or (l in charset_up):
            s += 1

    return (s*1.0)/len(text)

# Decrypt/encrypt function are the same
def rxor(text, key):
    dec = [chr(ord(text[i]) ^ ord(key[i % len(key)])) for i in range(0, len(text))]
    return "".join(dec)

if __name__ == "__main__":
    fname = sys.argv[1]
    f = open(fname, "rb")
    enc = f.read()
    f.close()

    # Find the most probable key sizes
    d = {}
    
    for keysize in range(2, 40):
        blocks = [enc[i:i+keysize] for i in range(0, len(enc), keysize)]
        dist = 0
        # Average Hamming distance over 5 blocks
        for i in range(0,5):
            for j in range(i+1, 5):
                dist += d_ham(blocks[i], blocks[j])*1.0/keysize
        d[keysize] = dist/10

    print "Most possible key sizes: "
    for w in sorted(d, key = lambda k: d[k])[:7]:
        print "Key size:", w, "Hamming distance:", d[w]    

    sys.exit()
    # Split into KEYSIZE blocks and crack individually
    for keysize in [16]:
        k = [0] * keysize

        blocks = [enc[i:i+keysize] for i in range(0, len(enc), keysize)]
        # Transpose the blocks and find the key for each transposed block
        # Leave the last incomplete block for easier calculations
        t_blocks = transpose(blocks[:len(blocks)-1])

        for i in xrange(keysize):
            b = t_blocks[i]

            d = {}
            for key in range(0, 255):
                dec = "".join([chr(ord(c)^key) for c in b])
                d[dec] = (get_score(dec), key, b)

            print "Possible keys for transposed block %d. keysize: %d" % (i, keysize)
         
            for w in sorted(d, key = lambda k: d[k][0], reverse=True)[:1]:
                print d[w][1], "score:", d[w][0]
                k[i] = d[w][1]

        k = "".join([chr(c) for c in k])
        print "Key:", k.encode("hex").upper()
        # print "Message:", rxor(enc, k)     
	o = open("crypt-dec.txt", "w")
	o.write(rxor(enc, k))
	o.close()