#!/usr/bin/env python ''' Break repeating-key XOR ~~~~~~~~~~~~~~~~~~~~~~~~ a. Let KEYSIZE be the guessed length of the key; try values from 2 to (say) 40. b. Write a function to compute the edit distance/Hamming distance between two strings. The Hamming distance is just the number of differing bits. The distance between: this is a test and: wokka wokka!!! is 37. c. For each KEYSIZE, take the FIRST KEYSIZE worth of bytes, and the SECOND KEYSIZE worth of bytes, and find the edit distance between them. Normalize this result by dividing by KEYSIZE. d. The KEYSIZE with the smallest normalized edit distance is probably the key. You could proceed perhaps with the smallest 2-3 KEYSIZE values. Or take 4 KEYSIZE blocks instead of 2 and average the distances. e. Now that you probably know the KEYSIZE: break the ciphertext into blocks of KEYSIZE length. f. Now transpose the blocks: make a block that is the first byte of every block, and a block that is the second byte of every block, and so on. g. Solve each block as if it was single-character XOR. You already have code to do this. h. For each block, the single-byte XOR key that produces the best looking histogram is the repeating-key XOR key byte for that block. Put them together and you have the key. ''' import base64 import sys # Return the number of different bits netween two 32 bits numbers def dif(a, b, bits = 32): x = a ^ b return sum((x>>i & 1) for i in xrange(bits)) # Compute the edit/Hamming distance between two strings (of equal lengths) # Returns number of differing bits def d_ham(s1, s2): return sum(dif(ord(s1[i]), ord(s2[i])) for i in xrange(len(s1))) # Used below for single-char xor decoding (step f.) def transpose(blocks): t = [] for i in xrange(len(blocks[0])): t1 = [] for j in xrange(len(blocks)): t1.append(blocks[j][i]) t.append("".join(t1)) return t charset = "abcdefghijklmnopqrstuvwxyz " charset_up = charset.upper() # Simple scoring method based on character frequency def get_score(text): # Score: number of valid characters s = 0 for l in text: if (l in charset) or (l in charset_up): s += 1 return (s*1.0)/len(text) # Decrypt/encrypt function are the same def rxor(text, key): dec = [chr(ord(text[i]) ^ ord(key[i % len(key)])) for i in range(0, len(text))] return "".join(dec) if __name__ == "__main__": fname = sys.argv[1] f = open(fname, "rb") enc = f.read() f.close() # Find the most probable key sizes d = {} for keysize in range(2, 40): blocks = [enc[i:i+keysize] for i in range(0, len(enc), keysize)] dist = 0 # Average Hamming distance over 5 blocks for i in range(0,5): for j in range(i+1, 5): dist += d_ham(blocks[i], blocks[j])*1.0/keysize d[keysize] = dist/10 print "Most possible key sizes: " for w in sorted(d, key = lambda k: d[k])[:7]: print "Key size:", w, "Hamming distance:", d[w] sys.exit() # Split into KEYSIZE blocks and crack individually for keysize in [16]: k = [0] * keysize blocks = [enc[i:i+keysize] for i in range(0, len(enc), keysize)] # Transpose the blocks and find the key for each transposed block # Leave the last incomplete block for easier calculations t_blocks = transpose(blocks[:len(blocks)-1]) for i in xrange(keysize): b = t_blocks[i] d = {} for key in range(0, 255): dec = "".join([chr(ord(c)^key) for c in b]) d[dec] = (get_score(dec), key, b) print "Possible keys for transposed block %d. keysize: %d" % (i, keysize) for w in sorted(d, key = lambda k: d[k][0], reverse=True)[:1]: print d[w][1], "score:", d[w][0] k[i] = d[w][1] k = "".join([chr(c) for c in k]) print "Key:", k.encode("hex").upper() # print "Message:", rxor(enc, k) o = open("crypt-dec.txt", "w") o.write(rxor(enc, k)) o.close()