import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

prob_LZ78_seq(string, char_to_index)

# You may find this function useful
def chars_to_dict(chars):
    """
    Build a symbol-to-index lookup for an alphabet.

    - chars: iterable of symbols (e.g. a str or a set); duplicates are ignored

    Returns a dict that maps each distinct symbol to its rank (0, 1, 2, ...)
    in sorted order.
    """
    alphabet = sorted(set(chars))
    return dict(zip(alphabet, range(len(alphabet))))

def prob_LZ78_seq(string, char_to_index):
    """
    Compute the LZ78 sequential probability assignment along a sequence.

    The sequence is parsed incrementally into LZ78 phrases while the parsing
    tree is grown. The tree starts as a root whose children are one leaf per
    alphabet symbol. Standing at a node, the probability of the next symbol is
    (number of leaves below the corresponding child) / (number of leaves below
    the current node). When a leaf is reached the current phrase ends: that
    leaf is replaced by an internal node with one fresh leaf per symbol, and
    parsing restarts from the root.

    - string: str, a sequence to be parsed
    - char_to_index: dict, a dictionary which maps a character to its index
      (indices are expected to be 0, 1, ..., len(char_to_index)-1)

    Returns an array of shape (len(string)+1, len(char_to_index)) whose i-th
    row is the conditional distribution of the (i+1)-th symbol given the first
    i symbols; the last row is the prediction that follows the whole string.

    Raises KeyError if string contains a symbol missing from char_to_index.
    """
    alphabet_size = len(char_to_index)
    dist_array = np.zeros((len(string)+1, alphabet_size))  # i-th row is the sequential distribution

    # Flat tree storage indexed by node id:
    #   children[v]   -> list of v's child ids, or None while v is still a leaf
    #   leaf_count[v] -> number of leaves in the subtree rooted at v
    root = 0
    children = [None]
    leaf_count = [1]

    def grow(path):
        # Turn the leaf at the end of `path` into an internal node with one
        # new leaf per symbol. Every node on the path (the expanded node and
        # all its ancestors) thereby gains alphabet_size - 1 leaves.
        first_new = len(children)
        children[path[-1]] = list(range(first_new, first_new + alphabet_size))
        children.extend([None] * alphabet_size)
        leaf_count.extend([1] * alphabet_size)
        for visited in path:
            leaf_count[visited] += alphabet_size - 1

    def distribution(node):
        # Share of the leaves below `node` that lies under each of its children.
        below = np.array([leaf_count[child] for child in children[node]], dtype=float)
        return below / leaf_count[node]

    grow([root])

    node = root
    path = [root]  # nodes visited so far within the current phrase
    for i, symbol in enumerate(string):
        dist_array[i] = distribution(node)
        node = children[node][char_to_index[symbol]]
        path.append(node)
        if children[node] is None:
            # Reached a leaf: the phrase is complete, extend the tree and
            # start the next phrase from the root.
            grow(path)
            node = root
            path = [root]

    # Prediction for the symbol that would follow the full string.
    dist_array[len(string)] = distribution(node)

    return dist_array

# Example sequences: A and B contain only the symbols 0/1, C contains 0/1/2,
# and D contains the letters A, C, G, T.
A = '10111111'
B = '0010110111'
C = '00121212102101210'
D = 'AGTTTTCGTAACGTT'

# Sequential distributions for each example. Every call below is a bare
# expression, so its value is only displayed when run interactively
# (e.g. as the last statement of a notebook cell).
prob_LZ78_seq(A, chars_to_dict('01'))

prob_LZ78_seq(B, chars_to_dict('01'))

# Same sequence B, but over the alphabet {0, 1, 2}; the symbol 2 never
# occurs in B.
prob_LZ78_seq(B, chars_to_dict('012'))

prob_LZ78_seq(C, chars_to_dict('012'))

prob_LZ78_seq(D, chars_to_dict('AGTC'))

log2_prob_LZ78(string, char_to_index)

def log2_prob_LZ78(string, char_to_index):
    """
    Return the base-2 log-probability of a sequence under prob_LZ78_seq.

    - string: str, a sequence to be parsed
    - char_to_index: dict, a mapping from character (symbol) to the index

    The probability of the whole sequence is the product of the conditional
    probabilities of its symbols, i.e. of entry [i, index of string[i]] of the
    array returned by prob_LZ78_seq; this function returns the sum of their
    base-2 logarithms as a float (0.0 for the empty string).

    Raises KeyError if string contains a symbol missing from char_to_index.
    """
    dist_array = prob_LZ78_seq(string, char_to_index)

    # Column of the symbol that actually occurred at each position. The
    # explicit integer dtype keeps the fancy indexing valid for an empty string.
    symbol_cols = np.array([char_to_index[symbol] for symbol in string], dtype=int)
    symbol_probs = dist_array[np.arange(len(string)), symbol_cols]

    log2prob = float(np.sum(np.log2(symbol_probs)))

    return log2prob

# Each line builds a (sequence, log2-probability) tuple. These are bare
# expressions, so the tuple is only displayed when run interactively.
A, log2_prob_LZ78(A, chars_to_dict('01'))

B, log2_prob_LZ78(B, chars_to_dict('01'))

# Same sequence B, evaluated over the larger alphabet {0, 1, 2}.
B, log2_prob_LZ78(B, chars_to_dict('012'))

C, log2_prob_LZ78(C, chars_to_dict('012'))

D, log2_prob_LZ78(D, chars_to_dict('AGTC'))

>>> draw_one_sample_from([0.8, 0.2])
0

def draw_one_sample_from(pmf):
    """
    Draw one random index according to a probability mass function.

    - pmf: sequence of non-negative probabilities that sum to 1

    Returns a Python int i in range(len(pmf)), chosen with probability pmf[i].

    Raises ValueError (from numpy) if pmf is empty, has negative entries or
    does not sum to 1.
    """
    # int() turns the NumPy integer into a plain Python int.
    return int(np.random.choice(len(pmf), p=pmf))

def generate_mc(n, P, pinit=None):
    '''
    Sample a path of a finite-state Markov chain and its log-probability.

    - n: length of output sequence
    - P: transition matrix; P[i][j] is the probability of moving from state i
      to state j, so every row must sum to 1
    - pinit: initial distribution over the states. When None, the uniform
      distribution is used.
      NOTE(review): the uniform default is an assumption; confirm it is the
      intended one (the stationary distribution is the other natural choice).

    Returns (x, log2_prob): x is an integer array of n states in
    range(len(P)), and log2_prob is the base-2 log-probability of that path
    under (pinit, P) as a float (0.0 when n == 0).
    '''
    P = np.asarray(P, dtype=float)
    num_states = P.shape[0]
    if pinit is None:
        pinit = np.full(num_states, 1.0 / num_states)
    else:
        pinit = np.asarray(pinit, dtype=float)

    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    x = np.zeros(n, dtype=int)
    log2_prob = 0.0
    if n == 0:
        return x, log2_prob

    x[0] = np.random.choice(num_states, p=pinit)
    for t in range(1, n):
        # The next state is drawn from the row of P selected by the current state.
        x[t] = np.random.choice(num_states, p=P[x[t - 1]])

    # log2 P(x) = log2 pinit[x_0] + sum_t log2 P[x_{t-1}, x_t]
    transition_probs = P[x[:-1], x[1:]]
    log2_prob = float(np.log2(pinit[x[0]]) + np.sum(np.log2(transition_probs)))

    return x, log2_prob

# Experiment grid. NOTE(review): nlist presumably holds the sequence lengths
# and plist the Markov-chain parameter values to sweep -- confirm against the
# problem statement.
nlist = [10, 50, 100, 500, 1000, 5000, 10000]
# The stop value 0.51 (rather than 0.5) keeps 0.50 itself on the grid, since
# np.arange excludes its stop value.
plist = np.arange(0.02, 0.51, 0.02)

### FILL IN THE CODE ###

# Plot the mean redundancy graph
### FILL IN THE CODE ###

'''
**************************
**************************
*** ARITHMETIC ENCODER ***
**************************
**************************
'''
def arithmetic_enc_offline(string, char_to_index, pmfs, STATE_SIZE=32):
    """
    Arithmetic-encode a sequence with integer interval registers.

    - string: sequence of symbols to encode
    - char_to_index: dict mapping each symbol to its column in the pmfs
    - pmfs: indexable collection where pmfs[i] holds the weights used to code
      the i-th symbol; each one is normalised by its own sum, so the weights
      do not have to add up to 1
    - STATE_SIZE: bit width of the `low` / `high` interval registers

    Returns the code word as a str of '0' and '1' characters.

    The asserts require a non-empty interval after every symbol; a symbol
    whose weight is zero makes `low < high` fail.
    """
    ZERO = 0
    ONE = (1 << STATE_SIZE) - 1
    ONE_FOURTH = 1 << (STATE_SIZE - 2)
    HALF = ONE_FOURTH * 2
    THREE_FOURTH = HALF + ONE_FOURTH
    # Mask of STATE_SIZE ones, i.e., 111...111.
    MASK = ONE
    # Mask of the top bit at width STATE_SIZE, i.e., 100...000.
    TOP_MASK = HALF
    # Mask of the second highest bit at width STATE_SIZE, i.e., 010...000.
    SECOND_MASK = TOP_MASK >> 1

    # Current coding interval [low, high], both ends inclusive.
    low, high = ZERO, ONE
    # NOTE(review): low_float / high_float are assigned here and never read
    # again in this function.
    low_float, high_float = 0.0, 1.0
    encoded_bits = ''
    # Bits whose value is not yet known; they are emitted, inverted, right
    # after the next decided bit (see output_bit_plus_pending).
    num_pending_bits = 0
    
    for i, symbol in enumerate(string):
        # Cumulative weights with a leading 0, so the symbol with index k owns
        # the slice [curr_cmf[k], curr_cmf[k+1]) of the total curr_cmf[-1].
        curr_cmf = np.hstack(([0], np.cumsum(pmfs[i])))
        curr_idx = char_to_index[symbol]
        
        p_prev_symb = high - low + 1 # range

        # Narrow the interval to the symbol's slice. `high` must be computed
        # first because it still needs the old value of `low`.
        high = int(low + p_prev_symb*curr_cmf[curr_idx+1] / curr_cmf[-1]) - 1
        low  = int(low + p_prev_symb*curr_cmf[curr_idx]   / curr_cmf[-1])
        
        assert ZERO <= low 
        assert low  < high 
        assert high <= ONE
        
        # Renormalise: shift out bits until neither case below applies.
        while 1:
            if (low ^ high) & TOP_MASK == 0:  # if MSBs are equal
                
                # The shared MSB is decided: emit it (plus any pending bits),
                # then shift it out, feeding a 0 into low and a 1 into high.
                bit = 0 if high & TOP_MASK == 0 else 1
                encoded_bits += output_bit_plus_pending(bit, num_pending_bits)
                num_pending_bits = 0
                low  =  (low  << 1) & MASK
                high = ((high << 1) & MASK) | 1
                
                # NOTE(review): bin_in_bits is not defined in the visible
                # source; it is only evaluated if this assertion fails.
                assert ZERO <= low < high <= ONE, (ZERO<=low, high<=ONE, bin_in_bits(low,STATE_SIZE), bin_in_bits(high,STATE_SIZE))
                
            elif (low & SECOND_MASK) != 0 and (high & SECOND_MASK) == 0:  # if low=01... and high=10...                
                # The MSBs differ here (previous branch), so the interval
                # straddles HALF: remember one undecided bit and drop the
                # second MSB from both registers.
                num_pending_bits += 1
                low  = ( (low  ^ SECOND_MASK) << 1)       & MASK         # flip the second MSB and shift in 0 left
                high = (((high ^ SECOND_MASK) << 1) | 1) & MASK   # flip the second MSB and shift in 1 left
                
                assert ZERO <= low < high <= ONE, (ZERO<=low, high<=ONE, bin_in_bits(low,STATE_SIZE), bin_in_bits(high,STATE_SIZE))
            else:
                break
                
    # Terminate: adapted from David MacKay's code
    # Pick the side of HALF that holds the larger part of [low, high];
    # w is the width of that part.
    if (HALF-low) > (high-HALF):
        w = HALF-low
        bit = 0
    else:
        w = high-HALF
        bit = 1
    
    encoded_bits += output_bit_plus_pending(bit, num_pending_bits)
    num_pending_bits = 0
    # Append the complementary bit once per doubling needed for w to reach HALF.
    while w < HALF:
        encoded_bits += str(1-bit)
        w *= 2

    return encoded_bits

def output_bit_plus_pending(bit, num_pending_bits):
    """
    Format a decided bit together with the pending bits that follow it.

    - bit: 0 or 1, the bit that has just been decided
    - num_pending_bits: how many pending bits to append

    Returns a str made of `bit` followed by `num_pending_bits` copies of the
    opposite bit.
    """
    assert bit in {0, 1}
    flipped = str(1 - bit)
    pieces = [str(bit)] + [flipped] * num_pending_bits
    return ''.join(pieces)

def compress_with_LZ_prob(string, char_to_index):
    """
    Arithmetic-encode `string` using the distributions from prob_LZ78_seq.

    - string: sequence of symbols to compress
    - char_to_index: dict mapping each symbol to its index

    Returns whatever arithmetic_enc_offline returns for these inputs.
    """
    return arithmetic_enc_offline(
        string,
        char_to_index,
        prob_LZ78_seq(string, char_to_index),
    )

# NOTE(review): n is presumably the length of each sampled sequence and N the
# number of repetitions averaged per parameter value -- confirm against the
# problem statement.
n = 10000
N = 50
# np.arange excludes its stop value, so this grid nominally ends at 0.48,
# whereas the earlier plist (stop 0.51) includes 0.50. Floating-point rounding
# in np.arange can affect the endpoint -- TODO confirm the intended grid.
plist = np.arange(0.02, 0.5, 0.02)
# One accumulator per entry of plist; divided by N after the loop.
compress_ratio_list = np.zeros(len(plist))
for i, p in enumerate(plist):
    
    ### FILL IN THE CODE ###
    
    for _ in range(N):
        
        ### FILL IN THE CODE ###
        
# Turn the accumulated values into per-parameter averages over the N runs.
compress_ratio_list /= N

# Plot the compression ratios
### FILL IN THE CODE ###

def compress(filename, char_to_index=None):
    """
    Compress a UTF-8 text file with compress_with_LZ_prob and print the ratio.

    - filename: path of the text file to read
    - char_to_index: dict mapping each symbol to its index; when None it is
      built from the distinct characters of the file via chars_to_dict

    The printed ratio compares the number of encoded bits with a fixed-length
    code that spends ceil(log2(alphabet size)) bits on every character. That
    baseline is zero for an empty file or a one-symbol alphabet, in which case
    the ratio is not finite.

    Returns None; the result is only printed.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(filename, 'r', encoding="utf8") as text_file:
        string = text_file.read()
    if char_to_index is None:
        char_to_index = chars_to_dict(string)
        
    encoded_bits = compress_with_LZ_prob(string, char_to_index)
    compress_ratio = len(encoded_bits) / (len(string) * np.ceil(np.log2(len(char_to_index))))
    
    print("The compression ratio is {}%.".format(compress_ratio*100))

# `%time` is an IPython magic, so these statements only run inside
# IPython/Jupyter. Here it times the compression of the genome sample over
# the four-letter alphabet A, C, G, T.
%time compress('data/CElegans_sample.txt', chars_to_dict('AGTC'))

# English alphabet: the space character plus the 26 lowercase letters.
en_abc = {' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 
          'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}
# Accented letters added to en_abc for the French text.
fr_letters = {'é', 
              'à', 'è', 'ù',
              'â', 'ê', 'î', 'ô', 'û',
              'ç',
              'ë', 'ï', 'ü'}

# filename_list and alphabet_list are paired by position in the zip() below.
filename_list = ['data/sample_the_little_prince_en.txt',
                 'data/sample_the_little_prince_fr.txt']
alphabet_list = [en_abc, en_abc|fr_letters]

for filename, alphabet in zip(filename_list, alphabet_list):
    # end='' keeps the output printed by compress() on the same line.
    print("For {}:".format(filename), end = '')
    %time compress(filename, chars_to_dict(alphabet))

ECE 225B: Universal Probability and Applications @UCSD

Homework 1: Lempel-Ziv probability assignment and universal compression

Problem 1. (LZ probability assignment)

Problem 1(a).

Solution to #1(a).

Examples

Problem 1(b).

Solution to #1(b).

Examples

Problem 2. (Minimax mean universality)

Solution to #2.

Problem 3. (Universal compression using arithmetic encoder)

Arithmetic encoder implementation

(a) Binary symmetric Markov chain

(b) Genome sequence

(c) The Little Prince: Which one is the more efficient language, English or French?