Homework #4 Programming Assignment Solution¶

using PyPlot

Problem 1. Capacity Computation¶

In this homework, we explore how we can compute the capactiy of a given discrete channel. We will consider three methods including

Exhaustive search by grid sampling of probability vectors
Exhaustive search by random sampling of probability vectors
Csiszar-Tusnady & Blahut-Arimoto algorithm

Basic functions¶

We define a function for finding the mutual information of given input distribution and channel.

function entropy(pmf, base)
    pmf = pmf/sum(pmf)
    pmf = pmf[~isinf(log(pmf))]
    return sum(pmf.*log(1./pmf)/log(base))
end
entropy(pmf) = entropy(pmf, 2)

entropy (generic function with 2 methods)

function mutualInfo(px, channel)
    m = size(channel, 1)
    channel_cond_entropy = [entropy(channel[i, :]) for i in 1:m] # [H(Y|X=x), x=1,...,m]
    return entropy(px*channel) - dot(px, channel_cond_entropy)
end
mutualInfo(px, channel, channel_cond_entropy) = entropy(px*channel) - dot(px, channel_cond_entropy)

mutualInfo (generic function with 2 methods)

We define a function for finding the capacity of a given channel and a searching space.

function computeCapacity(channel, prob_simplex)
    # size(channel,1) and size(prob_simplex,2) should match.
    m = size(channel, 1)
    channel_cond_entropy = [entropy(channel[i, :]) for i in 1:m] # [H(Y|X=x), x=1,...,m]
    simplex_size = size(prob_simplex, 1)
    mi_list = zeros(simplex_size, 1)
    for i in 1:simplex_size
        mi_list[i] = mutualInfo(prob_simplex[i, :]', channel, channel_cond_entropy)
    end
    capacity, maxidx = findmax(mi_list)
    (capacity, prob_simplex[maxidx, :])
end

computeCapacity (generic function with 1 method)

(1) Exhaustive search by grid sampling¶

We first enumerate all nonnegative integer solutions for $x_1+\ldots+x_n = k$. Note that there are $\binom{n+k-1}{k}$ many solutions. (Why?)

This enumeration can be implemented in a recursive manner: For each $x_1=0,\ldots,k$, we find all nonnegative integer solutions for $x_2+\ldots+x_n = k-x_1$.

function enumCombiWithRep(k, n)
    # Enumerate all nonnegative solutions of the equation x1+...+xn = k
    # Note that there are binomial(n+k-1,k) ways.
    if n == 2
        return [0:k k:-1:0]
    end
    list_solutions = zeros(1, n)
    for x1 = 0:k
        temp = enumCombiWithRep(k-x1, n-1)
        list_solutions = [list_solutions; [x1*ones(size(temp,1),1) temp]]
    end
    return list_solutions[2:end, :]
end

enumCombiWithRep (generic function with 1 method)

Using this function, we can easily generate grid samples probability simplex of a prespecified step size.

genProbSimplex(m, step_size=0.01) = step_size*enumCombiWithRep(Int(1/step_size), m)

genProbSimplex (generic function with 2 methods)

Note that the size of grid space grows really fast. For example,

m_list, step_size_list = 3:8, [0.1 0.05 0.01 0.005 0.001]
size_space_array = zeros(length(m_list), length(step_size_list))
for i in 1:length(m_list)
    for j in 1:length(step_size_list)
        k, n = Int(1/step_size_list[j]), m_list[i]
        size_space_array[i,j] = binomial(n+k-1, k)
    end
end
plot(1./step_size_list', log10(size_space_array)', marker = "*");
xlabel("1/step_size")
ylabel("log10(size of the search space)")

PyObject <matplotlib.text.Text object at 0x7f03a9539390>

Example¶

As a sanity check, we examine the following example:

channel = 
[0.5 0.3 0.2;
    0.2 0.5 0.3;
    0.3 0.2 0.5]

3×3 Array{Float64,2}:
 0.5  0.3  0.2
 0.2  0.5  0.3
 0.3  0.2  0.5

@time prob_simplex = genProbSimplex(size(channel, 1), 0.01);
@time computeCapacity(channel, prob_simplex)

  0.477537 seconds (384.74 k allocations: 25.068 MB, 1.56% gc time)
  0.970441 seconds (820.13 k allocations: 35.114 MB, 0.86% gc time)

We can take a finer step size for this simple case.

@time prob_simplex = genProbSimplex(size(channel, 1), 0.001);
@time computeCapacity(channel, prob_simplex)

  2.968185 seconds (20.76 k allocations: 7.534 GB, 16.18% gc time)
  0.610596 seconds (10.03 M allocations: 707.834 MB, 12.69% gc time)

(0.09948710250997461,[0.333,0.334,0.333])

Problem 1(a)¶

channel =
[0.5 0.3 0.2 0;
    0.3 0.5 0 0.2;
    0.5 0 0.3 0.2;
    0.2 0 0.5 0.3]

4×4 Array{Float64,2}:
 0.5  0.3  0.2  0.0
 0.3  0.5  0.0  0.2
 0.5  0.0  0.3  0.2
 0.2  0.0  0.5  0.3

@time prob_simplex = genProbSimplex(size(channel, 1), 0.01);
@time computeCapacity(channel, prob_simplex)

  0.201599 seconds (75.95 k allocations: 653.255 MB, 16.97% gc time)
  0.226809 seconds (3.54 M allocations: 249.612 MB, 12.54% gc time)

(0.5145247027726656,[0.0,0.5,0.0,0.5])

@time prob_simplex = genProbSimplex(size(channel, 1), 0.005);
@time computeCapacity(channel, prob_simplex)

  5.037945 seconds (302.99 k allocations: 9.546 GB, 36.51% gc time)
  1.905510 seconds (27.47 M allocations: 1.893 GB, 17.47% gc time)

(0.5145247027726656,[0.0,0.5,0.0,0.5])

Problem 1(b)¶

channel =
[0.4 0.4 0.2;
    0.3 0.3 0.4;
    0.2 0.5 0.3;
    0.2 0.2 0.6]

4×3 Array{Float64,2}:
 0.4  0.4  0.2
 0.3  0.3  0.4
 0.2  0.5  0.3
 0.2  0.2  0.6

@time prob_simplex = genProbSimplex(size(channel, 1), 0.01);
@time computeCapacity(channel, prob_simplex)

  0.141757 seconds (75.07 k allocations: 653.201 MB, 21.19% gc time)
  0.216439 seconds (3.54 M allocations: 249.612 MB, 13.49% gc time)

(0.12461920291555595,[0.52,0.0,0.0,0.48])

@time prob_simplex = genProbSimplex(size(channel, 1), 0.005);
@time computeCapacity(channel, prob_simplex)

  4.953095 seconds (302.99 k allocations: 9.546 GB, 37.69% gc time)
  1.796214 seconds (27.47 M allocations: 1.893 GB, 17.90% gc time)

(0.12462842663416063,[0.515,0.0,0.0,0.485])

Problem 1(c)¶

For the random experiment, let us define a function which outputs a random channel as specified in the problem. For the sake of generality, we take $m$, the input alphabet size, as an input.

function genRandChannel(m)
    channel = 1/2*eye(m)
    for i in 1:m
        idx = rand(1:m-1)
        idx = idx>=i ? idx+1 : idx
        channel[i, idx] = 1/2
    end
    return channel
end

genRandChannel (generic function with 1 method)

m = 6
@time prob_simplex = genProbSimplex(m, 0.02);
niter = 100
@time capacity_list = [computeCapacity(genRandChannel(m), prob_simplex)[1] for i in 1:niter];

  8.334563 seconds (4.17 M allocations: 16.750 GB, 34.89% gc time)
533.917146 seconds (6.96 G allocations: 530.982 GB, 11.56% gc time)

As you can see, even with the crude grid sampling of step size 0.02, this takes almost 10 mins.

For this problem, we can find the lower and upper bound. (We will check this shortly.)

lower_bound, upper_bound = (log2(m-1)/2, log2(m/2))

(1.160964047443681,1.584962500721156)

Caxis = lower_bound-0.1:1e-4:upper_bound+0.1
empcdf = zeros(length(Caxis))
for i in 1:length(Caxis)
    empcdf[i] = sum(capacity_list.<=Caxis[i])
end
empcdf /= length(capacity_list);
plot(Caxis, empcdf)
xlabel("Capacity")
ylabel("Probability")
title("CDF of capacities")
(minimum(capacity_list), maximum(capacity_list))

(1.1609640474436813,1.5843814577244943)

Digression: What are the best and the worst possible channels?¶

The upper bound is achieved when the channel is a so-called noisy typewriter channel.

channel_best = 
0.5*[1 1 0 0 0 0; 
    0 1 1 0 0 0;
    0 0 1 1 0 0;
    0 0 0 1 1 0;
    0 0 0 0 1 1;
    1 0 0 0 0 1]

6×6 Array{Float64,2}:
 0.5  0.5  0.0  0.0  0.0  0.0
 0.0  0.5  0.5  0.0  0.0  0.0
 0.0  0.0  0.5  0.5  0.0  0.0
 0.0  0.0  0.0  0.5  0.5  0.0
 0.0  0.0  0.0  0.0  0.5  0.5
 0.5  0.0  0.0  0.0  0.0  0.5

Note that $H(Y|X)=1$ and $H(Y)\le\log 6$. However, for this simple case, the equality holds when $p(x)=1/6[1,1,1,1,1,1]$, $1/3[1,0,1,0,1,0]$, or $1/3[0,1,0,1,0,1]$. Therefore, $$C=H(X)-H(Y|X)=\log 6-1=\log 3.$$

The lower bound is achieved when the five of the alphabets are mapped to the sixth one. For example,

channel_worst = 
0.5*[1 0 0 0 0 1; 
    0 1 0 0 0 1;
    0 0 1 0 0 1;
    0 0 0 1 0 1;
    0 0 0 0 1 1;
    0 0 0 0 1 1]

6×6 Array{Float64,2}:
 0.5  0.0  0.0  0.0  0.0  0.5
 0.0  0.5  0.0  0.0  0.0  0.5
 0.0  0.0  0.5  0.0  0.0  0.5
 0.0  0.0  0.0  0.5  0.0  0.5
 0.0  0.0  0.0  0.0  0.5  0.5
 0.0  0.0  0.0  0.0  0.5  0.5

Since $H(Y|X)=1$ is fixed, we need to maximize $H(Y)$ by choosing the optimal input distribution $p(x)$. Let $p(x)=(p_1,\ldots,p_6)$. Then, $p(y) = 1/2(p_1,p_2,p_3,p_4,p_5+p_6,1)$, and thus $$ \begin{align*} H(Y) &= H\left(\frac{p_1}{2},\frac{p_2}{2},\frac{p_3}{2},\frac{p_4}{2},\frac{p_5+p_6}{2}, \frac{1}{2}\right)\\ &= 1+\frac{1}{2} H(p_1,p_2,p_3,p_4,p_5+p_6)\\ &\le 1+\frac{1}{2}\log 5. \end{align*} $$ The equality can be achieved when $p(x)=(1/5,1/5,1/5,1/5,1/10,1/10)$. Hence, $$C=H(X)-H(Y|X)=1+\frac{1}{2}\log 5-1=\frac{1}{2}\log 5.$$

In general for any $m$, by the same reasoning, the upper and the lower bound are given as: $$ \frac{1}{2}\log(m-1) \le C\le \log \frac{m}{2}. $$

For $m=6$,

lower_bound, upper_bound = (log2(m-1)/2, log2(m/2))

(1.160964047443681,1.584962500721156)

(2) Exhaustive search by random sampling¶

Since the grid searching is time-inefficient, we can try a random sampling as an alternative. How can we generate uniformly random vectors over the $d$-dimensional probability simplex?

To generate a random probability vector $\vec{p}=(p_1,\ldots,p_d)$, we can follow the following simple procedure:

Take uniform i.i.d. random variables $U_1^{d-1} = (U_1,\ldots,U_{d-1})$.
Take an order statistic $(U_{(1)},U_{(2)},\ldots,U_{(d-1)})$ of $U_1^{d-1}$, i.e., sort the vector $U_1^{d-1}$.
Take $p=(U_{(1)}-U_{(0)},U_{(2)}-U_{(1)},\ldots,U_{(d)}-U_{(d-1)})$ where $U_{(0)}=0, U_{(d)}=1$.

One can easily check that this gives a uniform sampling over the probability simplex.

function genRandProbSimplex(m, num_samples) 
    prob_simplex = zeros(num_samples, m)
    for i in 1:num_samples
        u = sort(vec([0 rand(m-1)... 1]))
        prob_simplex[i, :] = [u[i]-u[i-1] for i in 2:m+1]'
    end
    return prob_simplex
end

genRandProbSimplex (generic function with 1 method)

Problem 1(c) revisited¶

m = 6
@time prob_simplex = genRandProbSimplex(m, 10^5);
niter = 100
@time capacity_list = [computeCapacity(genRandChannel(m), prob_simplex)[1] for i in 1:niter];

  1.981151 seconds (6.57 M allocations: 254.709 MB, 2.60% gc time)
 15.192880 seconds (199.99 M allocations: 15.275 GB, 12.79% gc time)

Caxis = lower_bound-0.1:1e-4:upper_bound+0.1
empcdf = zeros(length(Caxis))
for i in 1:length(Caxis)
    empcdf[i] = sum(capacity_list.<=Caxis[i])
end
empcdf /= length(capacity_list);
plot(Caxis, empcdf)
xlabel("Capacity")
ylabel("Probability")
title("CDF of capacities")
(minimum(capacity_list), maximum(capacity_list))

(1.4349326229523038,1.5848743494208768)

As you can see, this method is much faster than grid sampling. However, since the sampled probability simplex is crudely obtained, it might not contain the capacity achieving distributions.

We can speed up the computation considerably using the Blahut-Arimoto algorithm (referred to by some as the Csiszar-Tusnady algorithm). Moreover, this algorithm allows us to control the accuracy to which the capacity is computed.

(3) Csiszar-Tusnady & Blahut-Arimoto Algorithm¶

Csiszar-Tusnady algorithm & Blahut-Arimoto algorithm formulates the capacity computation problem as double maximization of the concave function, and does alternating maximization.

Suppose we have a channel $p(x|y)$. $$ \begin{align*} C&=\max_{r(x)} I(X;Y)\\ &=\max_{q(x|y)}\max_{r(x)}\sum_x\sum_y r(x)p(y|x)\log\frac{q(x|y)}{r(x)}. \end{align*} $$ First, for a given $r(x)$, the optimal $q(x|y)$ is given by $$ q^*(x|y) \propto r(x)p(y|x). $$ Secondly, for a given $q(x|y)$, the optimal $r(x)$ is given by $$ r^*(x) \propto \prod_y q(x|y)^{p(y|x)}. $$ To translate these into an algorithm, we perform the above two steps alternatively. We can use the following for the stopping criterion: $$ C - I^{(n+1)}(X;Y) \le \max_x \log \frac{r^{(n+1)}(x)}{r^{(n)}(x)} $$

For more details, we refer you to the following references:

Section 8, Chapter 10, Elements of Information Theory, Cover and Thomas
http://www.rle.mit.edu/rgallager/documents/notes4.pdf

function computeCapacityCTBA(channel, th)
    m = size(channel, 1)
    r = genRandProbSimplex(m, 1)  # initialize r(x)
    q = zeros(1, size(channel, 2))
    err = 1;
    th = min(0.5, th);
    while err > th
        rold = r
        q = r.*channel'  # find the optimal q while fixing r
        q ./= sum(q,2)  # normalize
        r = prod(q.^channel', 1)  # find the optimal r while fixing q
        r ./= sum(r)  # normalize
        err = maximum(log2(r./rold))  # update error
    end
    return (mutualInfo(r, channel), r)
end

computeCapacityCTBA (generic function with 1 method)

For example,

channel =
[0.4 0.4 0.2;
    0.3 0.3 0.4;
    0.2 0.5 0.3;
    0.2 0.2 0.6]

4×3 Array{Float64,2}:
 0.4  0.4  0.2
 0.3  0.3  0.4
 0.2  0.5  0.3
 0.2  0.2  0.6

computeCapacityCTBA(channel, 1e-6)

(0.12462793754178048,
[0.515225 9.70584e-111 0.000490485 0.484285])

Note that although the input distribution is still far from the capacity achieving distribution, the capacity is well approximated up to 1e-6.

Problem 1(c) revisited¶

Since this algorithm is much faster and more accurate than exhaustive search or random sampling, we can do more iterations to obtain a descent cdf.

m = 6
@time prob_simplex = genRandProbSimplex(m, 10^5);
niter = 10000
@time capacity_list = [computeCapacityCTBA(genRandChannel(m), 1e-5)[1] for i in 1:niter];

  1.591747 seconds (6.30 M allocations: 242.607 MB, 2.73% gc time)
 33.673401 seconds (944.97 M allocations: 30.823 GB, 13.49% gc time)

Caxis = lower_bound-0.1:1e-4:upper_bound+0.1
empcdf = zeros(length(Caxis))
for i in 1:length(Caxis)
    empcdf[i] = sum(capacity_list.<=Caxis[i])
end
empcdf /= length(capacity_list);
plot(Caxis, empcdf)
xlabel("Capacity")
ylabel("Probability")
title("CDF of capacities")
(minimum(capacity_list), maximum(capacity_list))

(1.16096404743209,1.5849625007211565)