ECE 225B: Universal Probability and Applications @UCSD¶

http://circuit.ucsd.edu/~yhk/ece225b-spr18/

Homework 2: Mixture probability and context tree weighting¶

  • Due: May 22, 2018, 12:00 AM

  • Late homework will not be accepted.

  • The homework will be collected through Gradescope. Please follow the submission instructions on Piazza.

  • Please read the following problems carefully, write your code as clearly as possible, and add suitable comments.

  • NAME:

  • PID:

In this homework, we will implement mixture probability assignments such as the Krichevsky-Trofimov (KT) and Laplace mixtures. Then, we will implement context tree weighting (CTW) using the KT probability assignment as a building block.

[1] Willems, Frans M. J., Yuri M. Shtarkov, and Tjalling J. Tjalkens. "The context-tree weighting method: basic properties." IEEE Transactions on Information Theory 41.3 (1995): 653-664.

[2] Willems, Frans M. J., Tjalling J. Tjalkens, and Tanya Ignatenko. "Context-tree weighting and maximizing: Processing betas." Proceedings of the Information Theory and Applications (ITA) Workshop, 2006.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# You may find this function useful
def chars_to_dict(chars):
    """Map each distinct character to an index, e.g., '01' -> {'0': 0, '1': 1}."""
    chars = sorted(set(chars))
    char_to_index = {x: i for i, x in enumerate(chars)}
    return char_to_index

Problem 1. (Mixture of i.i.d. probabilities)¶

In this problem, we implement a sequential probability assignment based on the mixture of i.i.d. multinomial distributions with respect to the prior $\text{Dirichlet}(\alpha,\ldots,\alpha)$. Recall that the sequential probability assignment for this mixture simply reduces to the add-$\alpha$ estimator. For example, if $\alpha=1/2$, it becomes the KT mixture, and if $\alpha=1$, it becomes the Laplace mixture.

Write a function

prob_mixture_iid_seq(string, char_to_index, alpha=1/2)

which takes string, char_to_index, and alpha as inputs, and outputs the sequential distribution dist_array. Here, alpha is a float, the smoothing parameter. The output dist_array is an array with len(string)+1 rows, whose row $t$ is the conditional distribution of the next symbol given the first $t$ symbols, for $t=0,1,\ldots,n$.

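For reference, here is a minimal sketch of one possible implementation of the add-$\alpha$ update (the helper name _add_alpha_sketch is ours; your fill-in below may of course be organized differently):

In [ ]:
# A minimal sketch, not the official solution: maintain running symbol counts
# and emit q(a | x^t) = (N_t(a) + alpha) / (t + m * alpha) at each step.
def _add_alpha_sketch(string, char_to_index, alpha=1/2):
    m = len(char_to_index)
    counts = np.zeros(m)
    dist_array = np.zeros((len(string) + 1, m))
    for t, ch in enumerate(string):
        dist_array[t] = (counts + alpha) / (t + m * alpha)
        counts[char_to_index[ch]] += 1
    # The last row conditions on the entire string.
    dist_array[-1] = (counts + alpha) / (len(string) + m * alpha)
    return dist_array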
In [777]:
def prob_mixture_iid_seq(string, char_to_index, alpha=1/2):
    ### FILL IN THE CODE ###

    return dist_array
In [778]:
A = '10111111'
B = '0010110111'
C = '00121212102101210'
D = 'AGTTTTCGTAACGTT'
In [769]:
prob_mixture_iid_seq(A, chars_to_dict('01'), alpha=1/2)
Out[769]:
array([[0.5       , 0.5       ],
       [0.25      , 0.75      ],
       [0.5       , 0.5       ],
       [0.375     , 0.625     ],
       [0.3       , 0.7       ],
       [0.25      , 0.75      ],
       [0.21428571, 0.78571429],
       [0.1875    , 0.8125    ],
       [0.16666667, 0.83333333]])
In [770]:
prob_mixture_iid_seq(B, chars_to_dict('01'), alpha=1)
Out[770]:
array([[0.5       , 0.5       ],
       [0.66666667, 0.33333333],
       [0.75      , 0.25      ],
       [0.6       , 0.4       ],
       [0.66666667, 0.33333333],
       [0.57142857, 0.42857143],
       [0.5       , 0.5       ],
       [0.55555556, 0.44444444],
       [0.5       , 0.5       ],
       [0.45454545, 0.54545455],
       [0.41666667, 0.58333333]])
In [759]:
prob_mixture_iid_seq(B, chars_to_dict('012'), alpha=1/2)
Out[759]:
array([[0.33333333, 0.33333333, 0.33333333],
       [0.6       , 0.2       , 0.2       ],
       [0.71428571, 0.14285714, 0.14285714],
       [0.55555556, 0.33333333, 0.11111111],
       [0.63636364, 0.27272727, 0.09090909],
       [0.53846154, 0.38461538, 0.07692308],
       [0.46666667, 0.46666667, 0.06666667],
       [0.52941176, 0.41176471, 0.05882353],
       [0.47368421, 0.47368421, 0.05263158],
       [0.42857143, 0.52380952, 0.04761905],
       [0.39130435, 0.56521739, 0.04347826]])
In [760]:
prob_mixture_iid_seq(C, chars_to_dict('012'))
Out[760]:
array([[0.33333333, 0.33333333, 0.33333333],
       [0.6       , 0.2       , 0.2       ],
       [0.71428571, 0.14285714, 0.14285714],
       [0.55555556, 0.33333333, 0.11111111],
       [0.45454545, 0.27272727, 0.27272727],
       [0.38461538, 0.38461538, 0.23076923],
       [0.33333333, 0.33333333, 0.33333333],
       [0.29411765, 0.41176471, 0.29411765],
       [0.26315789, 0.36842105, 0.36842105],
       [0.23809524, 0.42857143, 0.33333333],
       [0.30434783, 0.39130435, 0.30434783],
       [0.28      , 0.36      , 0.36      ],
       [0.25925926, 0.40740741, 0.33333333],
       [0.31034483, 0.37931034, 0.31034483],
       [0.29032258, 0.41935484, 0.29032258],
       [0.27272727, 0.39393939, 0.33333333],
       [0.25714286, 0.42857143, 0.31428571],
       [0.2972973 , 0.40540541, 0.2972973 ]])
In [771]:
prob_mixture_iid_seq(D, chars_to_dict('AGTC'))
Out[771]:
array([[0.25      , 0.25      , 0.25      , 0.25      ],
       [0.5       , 0.16666667, 0.16666667, 0.16666667],
       [0.375     , 0.125     , 0.375     , 0.125     ],
       [0.3       , 0.1       , 0.3       , 0.3       ],
       [0.25      , 0.08333333, 0.25      , 0.41666667],
       [0.21428571, 0.07142857, 0.21428571, 0.5       ],
       [0.1875    , 0.0625    , 0.1875    , 0.5625    ],
       [0.16666667, 0.16666667, 0.16666667, 0.5       ],
       [0.15      , 0.15      , 0.25      , 0.45      ],
       [0.13636364, 0.13636364, 0.22727273, 0.5       ],
       [0.20833333, 0.125     , 0.20833333, 0.45833333],
       [0.26923077, 0.11538462, 0.19230769, 0.42307692],
       [0.25      , 0.17857143, 0.17857143, 0.39285714],
       [0.23333333, 0.16666667, 0.23333333, 0.36666667],
       [0.21875   , 0.15625   , 0.21875   , 0.40625   ],
       [0.20588235, 0.14705882, 0.20588235, 0.44117647]])

Problem 2. (Mixture of Markov probabilities)¶

Now we implement a sequential probability assignment based on the mixture of $k$-th order Markov probabilities. More precisely, as we discussed in class, given an initial state $s_1=x_{-k+1}^0$, we assign the following probability to a given string $x_1^n$: $$ q_{k,\alpha}(x^n|s_1) = \prod_{s\in \mathcal{X}^k}q_{\alpha}^s(x^n|s_1), $$ where $q_{\alpha}^s(x^n|s_1)$ is the add-$\alpha$ probability assignment for the subsequence of $x^n$ whose state is $s\in\mathcal{X}^k$. In words, given a Markov order $k$, we decompose the given sequence into subsequences according to the Markov states of length $k$, and assign a simple mixture probability to each subsequence. For simplicity, let us take $s_1$ to be $k$ consecutive 0's, where the alphabet is $\{0,1,\ldots,M-1\}$, and find the probability $q_{k,\alpha}(x^n|s_1)$.

Write a function

prob_mixture_markov_seq(string, char_to_index, alpha=1/2, k=0)

which takes string, char_to_index, alpha, and k as inputs, and outputs the sequential distribution dist_array of the add-$\alpha$ probability of the input string $x^n$ with respect to the $k$-th order Markov model. Here, k is an int, the Markov order. (The stub below also accepts an optional init_state, which defaults to the all-zero initial state.)

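One way to realize this decomposition is to keep a separate add-$\alpha$ count vector per length-$k$ context. A minimal sketch follows (the helper name _markov_mixture_sketch and the dictionary-of-contexts layout are our choices, not the required ones):

In [ ]:
# A sketch, not the official solution: one add-alpha estimator per context.
def _markov_mixture_sketch(string, char_to_index, alpha=1/2, k=0):
    m = len(char_to_index)
    x = [char_to_index[ch] for ch in string]
    counts = {}              # context tuple -> vector of symbol counts
    state = (0,) * k         # initial state s_1: k consecutive 0's
    dist_array = np.zeros((len(x) + 1, m))
    for t in range(len(x) + 1):
        c = counts.setdefault(state, np.zeros(m))
        dist_array[t] = (c + alpha) / (c.sum() + m * alpha)
        if t < len(x):
            c[x[t]] += 1
            if k > 0:
                state = (state + (x[t],))[-k:]   # slide the Markov window
    return dist_array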
In [790]:
def prob_mixture_markov_seq(string, char_to_index, alpha=1/2, k=0, init_state=None):
    ### FILL IN THE CODE ###
    
    return dist_array
In [791]:
A, prob_mixture_markov_seq(A, chars_to_dict('012'), alpha=1/2, k=1)
Out[791]:
('10111111', array([[0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.2       , 0.6       , 0.2       ],
        [0.6       , 0.2       , 0.2       ],
        [0.42857143, 0.42857143, 0.14285714],
        [0.33333333, 0.55555556, 0.11111111],
        [0.27272727, 0.63636364, 0.09090909],
        [0.23076923, 0.69230769, 0.07692308],
        [0.2       , 0.73333333, 0.06666667]]))
In [793]:
A, prob_mixture_markov_seq(A, chars_to_dict('012'), alpha=1/2, k=2)
Out[793]:
('10111111', array([[0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.6       , 0.2       , 0.2       ],
        [0.33333333, 0.33333333, 0.33333333],
        [0.2       , 0.6       , 0.2       ],
        [0.14285714, 0.71428571, 0.14285714],
        [0.11111111, 0.77777778, 0.11111111],
        [0.09090909, 0.81818182, 0.09090909]]))
In [794]:
B, prob_mixture_markov_seq(B, chars_to_dict('01'), alpha=1/2, k=1)
Out[794]:
('0010110111', array([[0.5       , 0.5       ],
        [0.75      , 0.25      ],
        [0.83333333, 0.16666667],
        [0.5       , 0.5       ],
        [0.625     , 0.375     ],
        [0.75      , 0.25      ],
        [0.5       , 0.5       ],
        [0.5       , 0.5       ],
        [0.625     , 0.375     ],
        [0.5       , 0.5       ],
        [0.41666667, 0.58333333]]))
In [795]:
B, prob_mixture_markov_seq(B, chars_to_dict('012'), alpha=1/2, k=2)
Out[795]:
('0010110111', array([[0.33333333, 0.33333333, 0.33333333],
        [0.6       , 0.2       , 0.2       ],
        [0.71428571, 0.14285714, 0.14285714],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.6       , 0.2       , 0.2       ],
        [0.33333333, 0.33333333, 0.33333333],
        [0.2       , 0.6       , 0.2       ],
        [0.42857143, 0.42857143, 0.14285714],
        [0.6       , 0.2       , 0.2       ],
        [0.42857143, 0.42857143, 0.14285714]]))
In [796]:
C, prob_mixture_markov_seq(C, chars_to_dict('012'), alpha=1/2, k=3)
Out[796]:
('00121212102101210', array([[0.33333333, 0.33333333, 0.33333333],
        [0.6       , 0.2       , 0.2       ],
        [0.71428571, 0.14285714, 0.14285714],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.2       , 0.2       , 0.6       ],
        [0.2       , 0.6       , 0.2       ],
        [0.14285714, 0.14285714, 0.71428571],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.2       , 0.2       , 0.6       ],
        [0.33333333, 0.33333333, 0.33333333],
        [0.2       , 0.6       , 0.2       ],
        [0.33333333, 0.11111111, 0.55555556],
        [0.14285714, 0.42857143, 0.42857143]]))
In [52]:
C, prob_mixture_markov_seq(C, chars_to_dict('012'), alpha=1/2, k=5)
Out[52]:
('00121212102101210', array([[0.33333333, 0.33333333, 0.33333333],
        [0.6       , 0.2       , 0.2       ],
        [0.71428571, 0.14285714, 0.14285714],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.2       , 0.2       , 0.6       ],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333],
        [0.33333333, 0.33333333, 0.33333333]]))

Problem 3. (Context tree weighting)¶

We implement the context tree weighting (CTW) algorithm in this problem. Recall that, for a maximum context depth $D$, $$ q^s_{\text{CTW}}(x^n)= \begin{cases} \displaystyle\frac{1}{2}q^s_{\text{KT}}(x^n) +\frac{1}{2}\prod_{y\in \mathcal{X}}q^{ys}_{\text{CTW}}(x^n) &\text{if }|s|<D\\ \displaystyle q^s_{\text{KT}}(x^n) &\text{if }|s|=D \end{cases} $$

Write a function

prob_ctw_seq(string, char_to_index, max_depth=5)

that outputs the sequential distribution dist_array produced by CTW with maximum context depth max_depth.

NOTE: How to implement conditional probability efficiently?¶

Note that a naive implementation of the sequential probability assignment \begin{align} q^{\lambda}_{\text{CTW}}(\alpha|x^n) = \frac{q^{\lambda}_{\text{CTW}}(x^n\alpha)}{q^{\lambda}_{\text{CTW}}(x^n)} \end{align} for each $\alpha\in\mathcal{X}$ using the above recursion would be very slow, since the whole recursion would have to be recomputed for every symbol.

The following summary of Section 2 of [2] allows for a much faster implementation.

For a given maximum depth $D$, let us assume that we are given a suffix $s_1=x_{-D+1}^{0}$, and omit the conditioning for simplicity. That is, we write $q^{\lambda}_{\text{CTW}}(\alpha|x^n)$ for $q^{\lambda}_{\text{CTW}}(\alpha|x_{-D+1}^0x_1^n)$.

Algorithm¶

For each $n=0,1,\ldots,N$, we would like to find $q_{\text{CTW}}^{\lambda}(\cdot|x^n)$, where $\lambda$ denotes the empty string. Suppose $|s|<D$. From the recursion formula, we can write

\begin{align} q_{\text{CTW}}^s(\alpha|x^n) &=\frac{q_{\text{CTW}}^s(x^n\alpha)}{q_{\text{CTW}}^s(x^n)}\\ &=\frac{q^s_{\text{KT}}(x^n\alpha)+\prod_{y\in \mathcal{X}}q^{ys}_{\text{CTW}}(x^n\alpha)}{q^s_{\text{KT}}(x^n)+\prod_{y\in \mathcal{X}}q^{ys}_{\text{CTW}}(x^n)}\\ &=\frac{\beta^s(x^n)\,q_{\text{KT}}^s(\alpha|x^n)+q_{\text{CTW}}^{x_{n-|s|}s}(\alpha|x^n)}{\beta^s(x^n)+1}, && (\because q^{ys}_{\text{CTW}}(x^n\alpha)=q^{ys}_{\text{CTW}}(x^n) \text{ if } y\neq x_{n-|s|}) \end{align}

where $$ \beta^s(x^n)=\frac{q_{\text{KT}}^s(x^n)}{\prod_{y\in\mathcal{X}}q_{\text{CTW}}^{ys}(x^n)}. $$

Hence, $q_{\text{CTW}}^{\lambda}(\cdot|x^n)$ can be found recursively as follows: $$ q_{\text{CTW}}^s(\cdot|x^n)= \begin{cases} \displaystyle\frac{\beta^s(x^n)\,q_{\text{KT}}^s(\cdot|x^n)+q_{\text{CTW}}^{x_{n-|s|}s}(\cdot|x^n)}{\beta^s(x^n)+1} & \text{if }|s|<D\\ q_{\text{KT}}^s(\cdot|x^n) & \text{if }|s|=D. \end{cases} $$

After obtaining the conditional distribution, if $n\le N-1$, the ratio $\beta^s(\cdot)$ needs to be updated with the new symbol $x_{n+1}$, but only for the nodes $s$ on the context path of $x_{n+1}$ (i.e., the suffixes $\lambda, x_n, x_{n-1}x_n,\ldots$ of length less than $D$); for every other node $\beta^s$ is unchanged: $$ \beta^s(x^nx_{n+1}) = \beta^s(x^n)\, \frac{q_{\text{KT}}^s(x_{n+1}|x^n)}{q_{\text{CTW}}^{x_{n-|s|}s}(x_{n+1}|x^n)}. $$

Implementation¶

To implement this, use any tree data structure that stores, at each node $s$: the depth (i.e., the length of the suffix $s$), the counts of the symbols observed under the corresponding suffix, and the ratio $\beta^s$.

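To make the recursion concrete, here is a hedged sketch of one possible implementation (the names CTWNode, _kt_cond, and _ctw_sketch are ours): each node stores its symbol counts and its ratio $\beta^s$, and each step walks the context path once, mixes the conditional distributions bottom-up, and then updates the betas and counts along the same path.

In [ ]:
# A sketch, not the official solution; names and layout are illustrative.
class CTWNode:
    def __init__(self, num_abc):
        self.counts = np.zeros(num_abc)  # symbol counts observed under suffix s
        self.beta = 1.0                  # beta^s = q_KT^s / prod_y q_CTW^{ys}
        self.children = {}               # y -> node of the longer suffix ys

def _kt_cond(node, num_abc, alpha=1/2):
    # Add-alpha (KT for alpha = 1/2) conditional distribution at one node.
    return (node.counts + alpha) / (node.counts.sum() + num_abc * alpha)

def _ctw_sketch(string, char_to_index, max_depth=5):
    num_abc = len(char_to_index)
    x = [char_to_index[ch] for ch in string]
    dist_array = np.zeros((len(x) + 1, num_abc))
    root = CTWNode(num_abc)
    for n in range(len(x) + 1):
        # Nodes on the context path lambda, x_n, x_{n-1}x_n, ..., down to
        # depth D; symbols before x_1 are 0's (the all-zero initial state).
        path = [root]
        for d in range(max_depth):
            y = x[n - 1 - d] if n - 1 - d >= 0 else 0
            path.append(path[-1].children.setdefault(y, CTWNode(num_abc)))
        # Mix bottom-up: q_CTW = q_KT at depth D, then the beta recursion.
        q_kt = [_kt_cond(node, num_abc) for node in path]
        q_ctw = [None] * (max_depth + 1)
        q_ctw[max_depth] = q_kt[max_depth]
        for d in range(max_depth - 1, -1, -1):
            b = path[d].beta
            q_ctw[d] = (b * q_kt[d] + q_ctw[d + 1]) / (b + 1)
        dist_array[n] = q_ctw[0]
        if n < len(x):
            a = x[n]
            for d in range(max_depth):   # update betas and counts on the path
                path[d].beta *= q_kt[d][a] / q_ctw[d + 1][a]
                path[d].counts[a] += 1
            path[max_depth].counts[a] += 1
    return dist_array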
In [880]:
def prob_ctw_seq(string, char_to_index, max_depth=5):
    num_abc = len(char_to_index)
    dist_array = np.zeros((len(string)+1, num_abc))
    ### FILL IN THE CODE ###
    
    return dist_array
In [320]:
prob_mixture_iid_seq(A, chars_to_dict('01'), alpha=1/2)
Out[320]:
array([[0.5       , 0.5       ],
       [0.25      , 0.75      ],
       [0.5       , 0.5       ],
       [0.375     , 0.625     ],
       [0.3       , 0.7       ],
       [0.25      , 0.75      ],
       [0.21428571, 0.78571429],
       [0.1875    , 0.8125    ],
       [0.16666667, 0.83333333]])
In [323]:
prob_ctw_seq(A, chars_to_dict('01'), max_depth=0)
Out[323]:
array([[0.5       , 0.5       ],
       [0.25      , 0.75      ],
       [0.5       , 0.5       ],
       [0.375     , 0.625     ],
       [0.3       , 0.7       ],
       [0.25      , 0.75      ],
       [0.21428571, 0.78571429],
       [0.1875    , 0.8125    ],
       [0.16666667, 0.83333333]])
In [324]:
prob_ctw_seq(A, chars_to_dict('01'), max_depth=1)
Out[324]:
array([[0.5       , 0.5       ],
       [0.375     , 0.625     ],
       [0.33333333, 0.66666667],
       [0.65625   , 0.34375   ],
       [0.40909091, 0.59090909],
       [0.30769231, 0.69230769],
       [0.25      , 0.75      ],
       [0.21180556, 0.78819444],
       [0.18428781, 0.81571219]])
In [800]:
prob_ctw_seq(A, chars_to_dict('01'), max_depth=2)
Out[800]:
array([[0.5       , 0.5       ],
       [0.375     , 0.625     ],
       [0.41666667, 0.58333333],
       [0.64285714, 0.35714286],
       [0.4       , 0.6       ],
       [0.27604167, 0.72395833],
       [0.21942446, 0.78057554],
       [0.18317972, 0.81682028],
       [0.15726375, 0.84273625]])
In [801]:
prob_ctw_seq(A, chars_to_dict('01'), max_depth=5)
Out[801]:
array([[0.5       , 0.5       ],
       [0.375     , 0.625     ],
       [0.41666667, 0.58333333],
       [0.59821429, 0.40178571],
       [0.41111111, 0.58888889],
       [0.30896226, 0.69103774],
       [0.23805461, 0.76194539],
       [0.19204927, 0.80795073],
       [0.16016979, 0.83983021]])
In [802]:
prob_ctw_seq(B, chars_to_dict('01'), max_depth=2)
Out[802]:
array([[0.5       , 0.5       ],
       [0.75      , 0.25      ],
       [0.83333333, 0.16666667],
       [0.5625    , 0.4375    ],
       [0.63888889, 0.36111111],
       [0.67307692, 0.32692308],
       [0.5       , 0.5       ],
       [0.47794118, 0.52205882],
       [0.53169014, 0.46830986],
       [0.54135338, 0.45864662],
       [0.42418033, 0.57581967]])
In [803]:
prob_ctw_seq(B, chars_to_dict('01'), max_depth=5)
Out[803]:
array([[0.5       , 0.5       ],
       [0.75      , 0.25      ],
       [0.83333333, 0.16666667],
       [0.5625    , 0.4375    ],
       [0.63888889, 0.36111111],
       [0.63942308, 0.36057692],
       [0.5       , 0.5       ],
       [0.5       , 0.5       ],
       [0.5       , 0.5       ],
       [0.55833333, 0.44166667],
       [0.43207547, 0.56792453]])
In [804]:
prob_ctw_seq(B, chars_to_dict('012'), max_depth=5)
Out[804]:
array([[0.33333333, 0.33333333, 0.33333333],
       [0.6       , 0.2       , 0.2       ],
       [0.71428571, 0.14285714, 0.14285714],
       [0.44444444, 0.33333333, 0.22222222],
       [0.56439394, 0.29545455, 0.14015152],
       [0.53629191, 0.32061144, 0.14309665],
       [0.43818019, 0.43818019, 0.12363961],
       [0.48503079, 0.42460566, 0.09036355],
       [0.46390453, 0.45094078, 0.08515469],
       [0.45137422, 0.47190338, 0.0767224 ],
       [0.38970028, 0.54998224, 0.06031749]])
In [871]:
prob_ctw_seq(D, chars_to_dict('AGTC'), max_depth=1)
Out[871]:
array([[0.25      , 0.25      , 0.25      , 0.25      ],
       [0.5       , 0.16666667, 0.16666667, 0.16666667],
       [0.3125    , 0.1875    , 0.3125    , 0.1875    ],
       [0.26666667, 0.2       , 0.26666667, 0.26666667],
       [0.19791667, 0.13541667, 0.19791667, 0.46875   ],
       [0.1547619 , 0.10714286, 0.1547619 , 0.58333333],
       [0.125     , 0.08928571, 0.125     , 0.66071429],
       [0.23333333, 0.23333333, 0.23333333, 0.3       ],
       [0.16428571, 0.16428571, 0.17857143, 0.49285714],
       [0.09025033, 0.23517787, 0.10210804, 0.57246377],
       [0.34215328, 0.125     , 0.34215328, 0.19069343],
       [0.47230769, 0.10184615, 0.28707692, 0.13876923],
       [0.17799597, 0.16828514, 0.45630125, 0.19741764],
       [0.13076377, 0.12721684, 0.13076377, 0.61125562],
       [0.21442819, 0.21243351, 0.07613032, 0.49700798],
       [0.18797954, 0.18644501, 0.06624041, 0.55933504]])

Problem 4. (Minimax mean universality of mixtures)¶

In this problem, we will verify universality and compare the speed of convergence of the mixture strategies. As in Homework #1, we would like to verify that $$ \lim_{n\to\infty}\frac{1}{n}\mathbb{E}_{X^n\sim \mathsf{P}}\left[\log_2\frac{p(X^n)}{q(X^n)}\right]=0 $$ for every distribution $\mathsf{P}$ in the class.

To verify this, we draw $N$ independent sequences $X^n(i)\sim \mathsf{P}$ for $i=1,\ldots,N$ to obtain $$ \frac{1}{N}\sum_{i=1}^N \log_2\frac{p(X^n(i))}{q(X^n(i))}\approx \mathbb{E}_{X^n\sim \mathsf{P}}\left[\log_2\frac{p(X^n)}{q(X^n)}\right]. $$ We will consider the following two simple probability model classes.

a) $\mathcal{P_a}$={a Bernoulli($p$) i.i.d. process $X_i\sim \text{Bern}(p)$: $p\in[0,1]$}

b) $\mathcal{P_b}$={a binary symmetric Markov chain $X^n$ of transition matrix $$ P=\begin{bmatrix} 1-p & p\\ p & 1-p \end{bmatrix}, $$ starting from the initial distribution $\pi=[1/2,1/2]$: $p\in[0,1]$}

Problem 4(a).¶

For the class of Bernoulli processes $\mathcal{P_a}$, find the approximate mean minimax redundancy $$ \frac{1}{n}\max_{p\in \{0.02,0.04,\ldots,0.5\}}\frac{1}{N}\sum_{i=1}^N \log_2\frac{p(X_{(i)}^n)}{q(X_{(i)}^n)} $$ for the KT and Laplace mixtures of $k$-th order Markov probabilities ($k\in\{0,1,2\}$), the LZ probability assignment, and CTW with max_depth $\in\{1,2\}$, for $n\in \{10, 50, 100, 500, 1000, 5000\}$, with $N=50$ fixed.

Plot $\log_{10} n$ versus $\log_{10}$ of the approximate mean minimax redundancy for each probability assignment, all in the same plot. There should be 9 curves in the plot. Which probability assignment gives the fastest convergence rate?

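As a hedged starting point, the Monte Carlo estimate for a single $(n,p)$ pair might look like the sketch below; it assumes the prob_mixture_iid_seq from Problem 1, and the helper name _redundancy_bern is ours. Repeat it over the $p$ grid (taking the maximum) and over the probability assignments to produce the curves.

In [ ]:
# A sketch for one (n, p) pair and the i.i.d. add-alpha mixture.
def _redundancy_bern(n, p, N=50, alpha=1/2):
    char_to_index = chars_to_dict('01')
    total = 0.0
    for _ in range(N):
        bits = np.random.binomial(1, p, size=n)
        string = ''.join(str(b) for b in bits)
        dist_array = prob_mixture_iid_seq(string, char_to_index, alpha=alpha)
        # log2 q(X^n): sum of the realized log conditional probabilities.
        log_q = sum(np.log2(dist_array[t][b]) for t, b in enumerate(bits))
        k = bits.sum()
        log_p = k * np.log2(p) + (n - k) * np.log2(1 - p)  # true Bern(p) log-prob
        total += log_p - log_q
    return total / (N * n)   # per-symbol average redundancy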
In [20]:
### FILL IN THE CODE ###

Problem 4(b).¶

For each $n\in \{500, 1000, 5000\}$, find the approximate mean minimax redundancy of the add-$\beta$ mixture for each $\beta\in\{0.1,0.2,\ldots, 2.9,3\}$, and plot the values against $\beta$. There should be 3 curves in the same plot. Which $\beta$ achieves the minimum mean minimax redundancy?

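Reusing the hypothetical _redundancy_bern helper sketched in Problem 4(a), and assuming the i.i.d. ($k=0$) add-$\beta$ mixture is intended, the sweep might be organized as follows:

In [ ]:
# A sketch of the beta sweep; assumes the _redundancy_bern helper from 4(a).
betas = np.arange(0.1, 3.01, 0.1)
p_grid = np.arange(0.02, 0.51, 0.02)
for n in [500, 1000, 5000]:
    red = [max(_redundancy_bern(n, p, N=50, alpha=b) for p in p_grid)
           for b in betas]
    plt.plot(betas, red, label='n = {}'.format(n))
plt.xlabel(r'$\beta$')
plt.ylabel('approximate mean minimax redundancy')
plt.legend();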
In [ ]:
### FILL IN THE CODE ###

Problem 4(c).¶

For the class of binary symmetric Markov processes $\mathcal{P_b}$, repeat the experiment of Problem 4(a).

In [ ]:
### FILL IN THE CODE ###

Problem 5. (Universal compression with CTW)¶

In this problem, we compress the same data as in Homework #1.

  • For each dataset, compare the compression ratio with the number from Homework #1. Does CTW compress better than LZ?
  • For max_depth, use 5 or 10. Considering time complexity, what would be a good choice in practice?
In [733]:
from arithmetic_enc_public import *
In [734]:
def compress_with_CTW(filename, char_to_index=None, max_depth=5, verbose=False):
    # Read the file and, if no alphabet is given, build one from its contents.
    with open(filename, 'r', encoding="utf8") as file:
        string = file.read()
    if char_to_index is None:
        char_to_index = chars_to_dict(string)

    dist_array = prob_ctw_seq(string, char_to_index, max_depth)
    encoded_bits = arithmetic_enc_offline(string, char_to_index, dist_array)

    # Compute the ratio unconditionally so the return value is always defined.
    compress_ratio = len(encoded_bits) / (len(string) * np.log2(len(char_to_index)))
    if verbose:
        print("The compression ratio is {}%.".format(compress_ratio * 100))

    return compress_ratio

Problem 5(a). Genome sequence¶

In [ ]:
%time compress_with_CTW('data/CElegans_sample.txt', chars_to_dict('AGTC'), max_depth=5, verbose=True);

Problem 5(b). The Little Prince¶

In [747]:
nums = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'} 
en_abc = {' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 
          'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}
fr_letters = {'é', 
              'à', 'è', 'ù',
              'â', 'ê', 'î', 'ô', 'û',
              'ç',
              'ë', 'ï', 'ü'}
In [ ]:
filename_list = ['data/sample_the_little_prince_en.txt',
                 'data/sample_the_little_prince_fr.txt']
alphabet_list = [en_abc, en_abc|fr_letters]
In [ ]:
for max_depth in [5,10]:
    print("# For max_depth={}".format(max_depth))
    for filename, alphabet in zip(filename_list, alphabet_list):
        print("\tFilename '{}': ".format(filename), end = '')
        %time compress_with_CTW(filename, chars_to_dict(alphabet),\
                                max_depth=max_depth, verbose=True);