In this homework, we will reuse the functions from the last two homeworks. (Tunstall coding, binary Huffman coding)
using PyPlot
using NBInclude
nbinclude("Basic Functions.ipynb");
nbinclude("Binary Huffman Coding.ipynb");
nbinclude("Tunstall Coding.ipynb");
src_pmf = [0.9, 0.1]
avg_lens = zeros(1000)
for k = 2:1000
phrases, pmf = Tunstall(src_pmf, k)
code = binaryHuffman(pmf)
avg_len_codewords = avgLength(code,pmf)
avg_len_phrases = avgLength(phrases, pmf)
avg_lens[k] = avg_len_codewords/avg_len_phrases
end
plot(3:1000, avg_lens[3:1000], marker = "None", label = "Compression ratio")
axhline(entropy(src_pmf), color = "g", linestyle = "--", label = "Entropy")
xlabel("num_phrases")
ylabel("compression ratio")
legend(loc = "upper right", fancybox = "true")
As one can check, the compression ratio seems to converge the entropy, though it seems to fluctuate.
avg_lens[end] - entropy(src_pmf)
Suppose a Bernoulli source, say $\mathcal{X} = \{0,1\}$. How can we evaluate all possible trees? Here we assume a binary tree. Let $d$ be the number of phrases. we can enumerate all possibilites by induction.
The inputs are pmf = $[p, 1-p]$ and num_phrases = (the number of phrases).
The outputs are all possible pmf of size num_phrases, and corresponding phrases.
Let pmf1 = [9, 1].
k=1, [[9,1]]
k=2, we want [[81,9,1], [9,9,1]], 1 submatrix of size (2, 3).
k=3, we want [[729, 81, 9, 1], [81, 81, 9, 1], [81, 9, 9, 1]] + [[81, 9, 9, 1], [9, 81, 9, 1], [9, 9, 9, 1]], 2 submatrices of size (3, 4).
k=4, 6 submatrices of size (4, 5).
k=5, 24 submatrices of size (5, 6).
For general k, (k-1)! submatrices of size (k, k+1) is needed.
Suppose we have pmf_list from the previous iteration, which is a list of pmfs. What we want to do is to pick each pmf in the list, enumerate all extension, and add it. Note that there exists a duplicate, but for our purpose this construction is enough.
# pmf_list is the list of all possible distributions of a given number of phrases.
# phrase_list is the list of all possible distributions of a given number of phrases.
# We will keep updating pmf_list and phrase_list through the iterations.
# We save newly extended pmfs and phrases in pmf_list_new, phrases_list_new.
for p1 in [0.9, 0.7]
pmf1 = [p1, 1-p1]
symbol1 = ["A", "B"]
phrases_list = [symbol1]
pmf_list = [pmf1]
max_num_phrases = 10
for k = 3:max_num_phrases
pmf_list_new = Array{Float64,1}[]
phrases_list_new = Array{String,1}[]
for j = 1:length(pmf_list) # over all possible distributions
for i = 1:length(pmf_list[j]) # at each symbol s, extend it to s"A" and s"B"
pmf_new = copy(pmf_list[j])
phrases_new = copy(phrases_list[j])
splice!(pmf_new, i, pmf_new[i]*pmf1)
splice!(phrases_new, i, map(x -> "$(phrases_new[i])$x", symbol1))
push!(pmf_list_new, pmf_new)
push!(phrases_list_new, phrases_new)
# to check whether it is generating a correct pmf
# print("kji: ", k, j, i, " ")
# print("newly generated ", pmf_new, " ")
# print("newly generated phrases ", phrases_new, "\n")
end
end
pmf_list = copy(pmf_list_new)
phrases_list = copy(phrases_list_new)
# if you want to check the generated pmfs
# print(pmf_list, "\n")
# Below is to compare the performance of Tunstall+Huffman with the optimal tree.
avg_lengths = zeros(length(pmf_list))
for l in 1:length(pmf_list)
phrases, pmf = phrases_list[l], pmf_list[l]
avg_lengths[l] = avgLength(binaryHuffman(pmf), pmf)/avgLength(phrases, pmf)
end
phrases, pmf = Tunstall(pmf1, k)
avg_length_Tunstall = avgLength(binaryHuffman(pmf), pmf)/avgLength(phrases, pmf)
min_avg_length, min_index = findmin(avg_lengths)
print("------------------------------\n")
print("For the number of phrases = ", k, ", \n")
print("pmf by Tunstall: ", pmf, "\n")
print("Tunstall+Huffman average length: ", avg_length_Tunstall, "\n")
print("The optimal average length: \t ", min_avg_length, "\n")
print("Phrases by Tunstall ", phrases, "\n")
print("Phrases by the optimal tree ", phrases_list[min_index], "\n")
if abs(avg_length_Tunstall-minimum(avg_lengths)) < 1e-7
print("===> Tunstall+Huffman is optimal!\n")
else
print("===> There is a better tree when the number of phrases = ", k, "!\n")
break
end
end
print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n")
end
In this homework, we implement the sliding-window LZ77 encoding scheme. (Lempel and Ziv, 1977)
function matchPositionLength(window, text)
if length(window) == 0
return (0, text[1])
end
merged = "$window$text"
indices = find(x -> x==text[1], [window...])
if length(indices) > 0
matched_len_list = zeros(length(indices))
for i in 1:length(indices)
idx = indices[i]
len_temp = 1
while len_temp+1 <= length(text)
if merged[idx+len_temp] == text[len_temp+1]
len_temp += 1
else
# print(merged[idx:idx+len_temp-1]) # print the matched sequence; for debugging
break
end
end
matched_len_list[i] = len_temp
end
# print(matched_len_list) # for debugging
matched_len = maximum(matched_len_list) # find the length of the longest match
j = maximum(find(x -> x == matched_len, matched_len_list)) # find the largest index of the longest match
return (1, length(window)-indices[j], Int(matched_len))
else # if there is no match
return (0, text[1])
end
end
matchPositionLength("MY ", "MY MY WHAT A HAT IS THAT")
matchPositionLength("AAAA","AAABAA")
matchPositionLength("AA", "CAA")
(0,\texttt{‘T’}), (1,4,1), (1,2,1), (1,1,1), (1,5,4), (0,\texttt{‘I’}), (0,\texttt{‘S’}),\ (1,2,1),\ (1,4,1),\ (1,7,3)$
Simply, we can encode a length $L$ with $2\log L$ bits.
function lengthEncoding1(l)
enc = ["$c$c" for c in bin(l)]
enc[end] = (enc[end] == "00" ? "01" : "10");
return join(enc)
end
For example,
lengthEncoding1(5)
Similarly, there is another scheme to encode the length of the binary representation. We can simply add that number of 0's and 1 in front of the binary representation! This will also needs $2\log L$ bits.
function lengthEncoding2(l)
s = bin(l)
return join([join(['0' for i in 1:length(s)]), '1', s])
end
For example,
lengthEncoding2(5)
There is, however, a more clever way using about $\log L + 2\log\log L$ bits.
# Using log(l)+2log(log(l))+O(1) bits
function lengthEncoding3(l)
s = bin(l)
return join([lengthEncoding2(length(s)), s])
end
For example,
lengthEncoding3(5)
From now on, let's stick with the simplest scheme.
lengthEncoding = lengthEncoding1
function parseStringLZ77(input_text, window_size)
parsing = Array{Any}(0) # same as []
pointer = 1
while pointer <= length(input_text)
if pointer > window_size
window = input_text[pointer-window_size:pointer-1]
else
window = input_text[1:pointer-1]
end
code = matchPositionLength(window, input_text[pointer:end])
pointer += (code[1]==1 ? code[3] : 1) # code[3] is the matched length.
append!(parsing, [code])
end
return parsing
end
parseStringLZ77("AABBABAABABB", 6)
parseStringLZ77("MY MY WHAT A HAT IS THAT", 16)
# A function maps a character to 7-bit ascii
char2ascii(c) = bin(convert(Int, c), 7)
We can implement the final encoding function using parseStringLZ77.
function encodeLZ77a(input_text, window_size)
parsing = parseStringLZ77(input_text, window_size)
binary_code = []
for code in parsing
if code[1] == 1
pos, len = code[2:3]
temp = join([string(1), bin(pos, Int(ceil(log2(window_size)))), lengthEncoding(len)])
else
temp = join([string(0), char2ascii(code[2])])
end
append!(binary_code, [temp])
# print(code, " ", temp, "\n") # for debugging
end
return join(binary_code)
end
Of course, we can implement it in one shot as follows:
function encodeLZ77b(input_text, window_size)
binary_code = [] # initialize the window
pointer = 1;
while pointer <= length(input_text)
if pointer > window_size
window = input_text[pointer-window_size:pointer-1]
else
window = input_text[1:pointer-1]
end
temp = matchPositionLength(window, input_text[pointer:end])
if temp[1] == 1
flag, pos, len = temp
output = join([string(flag), bin(pos, Int(ceil(log2(window_size)))), lengthEncoding(len)])
pointer += len
else
output = join([string(0), char2ascii(input_text[pointer])])
pointer += 1
end
append!(binary_code, [output])
# print(pointer, " ", output,"\n") # for debugging
end
return join(binary_code)
end
encodeLZ77a("MY MY WHAT A HAT IS THAT", 16)
encodeLZ77b("MY MY WHAT A HAT IS THAT", 16)
f = open("sample_text.txt")
str = readstring(f)
close(f)
isascii(str)
print("We are using 7-bits ascii.\n-----------------------------------\n")
print("The input text has ", length(str), " characters.\n")
for window_size in [256, 512, 1024, 2048, 4096]
code = encodeLZ77a(str, window_size)
print("For a window size of ", window_size, ", the compressed length is ", length(code), " bits and the compression ratio is ", length(code)/length(str), " bits per symbol, or ", length(code)/length(str)/7*100, "%.\n")
end
We see that the compression ratio decreases with increase in window size till 2048, after which it increases. So, choosing an appropriate window size involves a tradeoff.