using PyPlot
using NBInclude
nbinclude("Tunstall Coding.ipynb");
nbinclude("Binary Huffman Coding.ipynb");
nbinclude("Non-binary Huffman Coding.ipynb");
nbinclude("Basic Functions.ipynb");
src_pmf = [0.9, 0.1]
# src_pmf = [0.5, 0.3, 0.1, 0.1]
avg_lens = zeros(500)
for k = 2:500
phrases, pmf = Tunstall(src_pmf, k)
avg_len_codewords = log2(length(phrases))
avg_len_phrases = avgLength(phrases, pmf)
avg_lens[k] = avg_len_codewords/avg_len_phrases
end
# [pmf phrases]
entropy(src_pmf)
plot(avg_lens)
avg_lens[end]
phrases, pmf = Tunstall(src_pmf, 6)
[pmf phrases]
phrases, pmf = Tunstall(src_pmf, 10)
[pmf phrases]
entropy(src_pmf)
# Average length of codewords
avg_len_codewords = log2(length(phrases))
# Average length of source phrase
avg_len_phrases = avgLength(phrases, pmf)
# Average # code symbols per average # source symbols
avg_len_codewords/avg_len_phrases
# Average # code symbols per sourse symbol
sum(pmf.*avg_len_codewords./map(length, phrases))
src_pmf = [0.9, 0.1]
phrases, pmf = Tunstall(src_pmf, 5)
code = daryHuffman(pmf, 2)
[pmf phrases code]
avg_len_codewords = avgLength(code, pmf) # Average length of codewords
avg_len_phrases = avgLength(phrases, pmf) # Average length of source phrase
avg_len_codewords/avg_len_phrases # Average # code symbols per average # source symbols
sum(pmf.*map(length, code)./map(length, phrases)) # Average # code symbols per sourse symbol
src_pmf = [0.9, 0.1]
num_phrases = 1000
avg_lengths = zeros(2,num_phrases)
for k = 2:num_phrases
phrases, pmf = Tunstall(src_pmf, k)
code = daryHuffman(pmf, 2)
avg_len_codewords = avgLength(code, pmf) # Average length of codewords
avg_len_phrases = avgLength(phrases, pmf) # Average length of source phrase
avg_lengths[1,k] = avg_len_codewords/avg_len_phrases # Average # code symbols per average # source symbols
avg_lengths[2,k] = sum(pmf.*map(length, code)./map(length, phrases)) # Average # code symbols per sourse symbol
end
plot(2:num_phrases, avg_lengths[1, 2:end])
plot(2:num_phrases, avg_lengths[2, 2:end])
axhline(entropy(src_pmf), color = "r", linestyle = "--")
# Does it converge?
avg_lengths[2, end] - avg_lengths[2, end-1]
Suppose a Bernoulli source, say $\mathcal{X} = \{0,1\}$. How can we evaluate all possible trees? Here we assume a binary tree. Let $d$ be the number of phrases. we can enumerate all possibilites by induction.
What is the input? pmf = $[p, 1-p]$, num_phrases = number of phrases
What is the output? all possible pmf of size num_phrases, and corresponding phrases
Let pmf1 = [9, 1].
k=1, [[9,1]]
k=2, we want [[81,9,1], [9,9,1]], 1 submatrix of size (2, 3).
k=3, we want [[729, 81, 9, 1], [81, 81, 9, 1], [81, 9, 9, 1]] + [[81, 9, 9, 1], [9, 81, 9, 1], [9, 9, 9, 1]], 2 submatrices of size (3, 4).
k=4, 6 submatrices of size (4, 5).
k=5, 24 submatrices of size (5, 6).
For general k, (k-1)! submatrices of size (k, k+1) is needed.
Suppose we have pmf_list from the previous iteration, which is a list of pmfs. What we want to do is to pick each pmf in the list, enumerate all extension, and add it.
# Suppose we keep updating pmf_list through the iterations.
# We save newly extended pmfs in pmf_list_new.
pmf1 = [0.9, 0.1]
symbol1 = ["A", "B"]
phrases_list = [symbol1]
pmf_list = [pmf1]
num_phrases = 10
for k = 3:num_phrases
pmf_list_new = Array{Float64,1}[]
phrases_list_new = Array{String,1}[]
for j = 1:length(pmf_list)
for i = 1:length(pmf_list[j])
pmf_new = copy(pmf_list[j])
phrases_new = copy(phrases_list[j])
splice!(pmf_new, i, pmf_new[i]*pmf1)
splice!(phrases_new, i, map(x -> "$(phrases_new[i])$x", symbol1))
# to check whether it generate a correct pmf
# print("kji: ", k, j, i, " ")
# print("newly generated ", pmf_new, " ")
# print("newly generated phrases ", phrases_new, "\n")
push!(pmf_list_new, pmf_new)
push!(phrases_list_new, phrases_new)
end
end
pmf_list = copy(pmf_list_new)
phrases_list = copy(phrases_list_new)
# if you want to check the pmfs generated
# print(pmf_list, "\n")
# Below is to compare the performance of Tunstall+Huffman with the optimal tree.
avg_lengths = zeros(length(pmf_list))
for l in 1:length(pmf_list)
phrases, pmf = phrases_list[l], pmf_list[l]
avg_lengths[l] = avgLength(binaryHuffman(pmf), pmf)/avgLength(phrases, pmf)
end
phrases, pmf = Tunstall(pmf1, k)
avg_length_Tunstall = avgLength(binaryHuffman(pmf), pmf)/avgLength(phrases, pmf)
print("------------------------------\n")
print("For the number of phrases = ", k, ", \n")
print("pmf by Tunstall: ", pmf, "\n")
print("Tunstall+Huffman average length: ", avg_length_Tunstall, "\n")
print("The optimum average length: \t ", minimum(avg_lengths), "\n")
if abs(avg_length_Tunstall-minimum(avg_lengths)) < 1e-7
print("Tunstall+Huffman is optimal!\n")
else
print("There is a better tree when the number of phrases = ", k, "!\n")
break
end
end