In this homework, we will implement mixture probability assignments such as Krichevski-Trofimov (KT) and Laplace mixtures. Then, we will implement context tree weighting (CTW) using the KT probability assignment as a building block.
[1] F. M. J. Willems, Y. M. Shtarkov, and T. J. Tjalkens, "The context-tree weighting method: basic properties," IEEE Transactions on Information Theory, vol. 41, no. 3, pp. 653-664, 1995.
[2] F. M. J. Willems, T. J. Tjalkens, and T. Ignatenko, "Context-tree weighting and maximizing: processing betas," in Proceedings of the Information Theory and Applications Workshop, 2006.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# You may find this function useful
def chars_to_dict(chars):
chars = sorted(list(set(chars)))
char_to_index = {x: i for i, x in enumerate(chars)}
return char_to_index
# CODES FROM HW1
def draw_one_sample_from(pmf):
pmf = np.array(pmf)
pmf = pmf / sum(pmf)
return np.random.choice(range(len(pmf)), p=pmf)
def generate_mc(n, P, pinit=None):
'''
- n: length of output sequence
- P: transition matrix
- pinit: initial distribution
'''
    x = np.zeros(n, dtype=int)  # np.int was removed from NumPy; use the built-in int
log2_prob = 0
if pinit is None:
x[0] = np.random.randint(P.shape[0])
p0 = 1/P.shape[0]
else:
x[0] = draw_one_sample_from(pinit)
p0 = pinit[x[0]]
log2_prob += np.log2(p0)
for i in range((int(n)-1)):
x[i+1] = draw_one_sample_from(P[x[i], :])
log2_prob += np.log2(P[x[i],x[i+1]])
return x, log2_prob
class Node(object):
def __init__(self):
self.data = None
self.children = []
def add_child(self):
self.children.append(Node())
def add_children(self, num_child):
for _ in range(num_child):
self.add_child()
def prob_LZ78_seq(string, char_to_index):
"""
- string: str, a sequence to be parsed
- char_to_index: dict, a dictionary which maps a character to its index
"""
num_abc = len(char_to_index)
dist_array = np.zeros((len(string)+1, num_abc)) # i-th row is the sequential distribution
root = Node()
root.add_children(num_abc)
root.data = np.ones(num_abc)
node = root
for i in range(len(string)):
dist_array[i] = node.data
new_symbol_index = char_to_index[string[i]]
next_node = node.children[new_symbol_index]
node.data[new_symbol_index] += num_abc-1
if next_node.children: # if next_node is not a leaf
node = next_node
else: # if next_node is a leaf
next_node.add_children(num_abc)
next_node.data = np.ones(num_abc)
node = root
dist_array[-1] = node.data # q_{LZ}(x_{n+1}|x^n)
dist_array /= dist_array.sum(axis=1)[:, np.newaxis] # normalization
return dist_array
def process_abc(abc):
if type(abc) is dict:
char_to_index = abc
elif type(abc) in [str, list, set]:
char_to_index = chars_to_dict(abc)
else:
raise ValueError('Input a valid alphabet')
return char_to_index
In this problem, we implement a sequential probability assignment based on the mixture of i.i.d. multinomial distributions with respect to the prior $\text{Dirichlet}(\alpha,\ldots,\alpha)$. Recall that the sequential probability assignment for this mixture simply reduces to the add-$\alpha$ estimator. For example, if $\alpha=1/2$, it becomes the KT mixture, and if $\alpha=1$, it becomes the Laplace mixture.
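For concreteness, writing $M=|\mathcal{X}|$ and letting $N(a|x^n)$ denote the number of occurrences of symbol $a$ in $x^n$, the add-$\alpha$ estimator assigns $$ q_{\alpha}(a|x^n)=\frac{N(a|x^n)+\alpha}{n+M\alpha}, \qquad a\in\mathcal{X}, $$ which is exactly what normalizing the $\alpha$-initialized counts in the code below computes. For $\alpha=1/2$ and a binary alphabet this is the familiar KT rule $\big(N(a|x^n)+\tfrac{1}{2}\big)/(n+1)$.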
Write a function
prob_mixture_iid_seq(string, char_to_index, alpha=1/2)
which takes string, char_to_index, and alpha as inputs, and outputs the sequential distribution dist_array.
Here, alpha is a float variable, the smoothing parameter.
def prob_mixture_iid_seq(string, abc, alpha=1/2):
char_to_index = process_abc(abc)
x = [char_to_index[char] for char in string]
cnts = np.zeros(len(abc)) + alpha
dist_array = np.zeros((len(string)+1, len(abc)))
for i in range(len(string)):
dist_array[i] = cnts
cnts[x[i]] += 1
dist_array[-1] = cnts
dist_array /= dist_array.sum(axis=1)[:, np.newaxis] # normalization
return dist_array
A = '10111111'
B = '0010110111'
C = '00121212102101210'
D = 'AGTTTTCGTAACGTT'
prob_mixture_iid_seq(A, '01', alpha=1/2)
prob_mixture_iid_seq(B, '01', alpha=1)
prob_mixture_iid_seq(B, '012', alpha=1/2)
prob_mixture_iid_seq(C, '012')
prob_mixture_iid_seq(D, 'AGTC')
Now we implement a sequential probability assignment based on the mixture of $k$-th order Markov probabilities. More precisely, as we discussed in class, given an initial state $s_1=x_{-k+1}^0$, we assign the following probability to a given string $x_1^n$: $$ q_{k,\alpha}(x^n|s_1) = \prod_{s\in \mathcal{X}^k}q_{\alpha}^s(x^n|s_1), $$ where $q_{\alpha}^s(x^n|s_1)$ is the add-$\alpha$ probability assignment for the substring of $x^n$ having state $s\in\mathcal{X}^k$. In words, given a Markov order $k$, we decompose the given sequence into subsequences according to the Markov states of length $k$, and assign a simple mixture probability to each subsequence. For simplicity, let us take $s_1$ to be $k$ consecutive 0's, where the alphabet is $\{0,1,\ldots,M-1\}$, and find the probability $q_{k,\alpha}(x^n|s_1)$.
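To make the decomposition concrete, here is a small illustration (not required for the solution) of how a string splits into per-state subsequences for $k=1$ with the assumed all-zeros initial state; $q_{k,\alpha}(x^n|s_1)$ is then the product of the add-$\alpha$ probabilities of these subsequences. The helper name `split_by_state` is ours, not part of the assignment.

```python
# Illustration only: group the symbols of a string by their length-k context,
# assuming the all-zeros initial state (the convention used below).
from collections import defaultdict

def split_by_state(string, k, init_state=None):
    state = '0' * k if init_state is None else init_state
    subseqs = defaultdict(str)
    for ch in string:
        subseqs[state] += ch       # ch is emitted from context `state`
        state = state[1:] + ch     # slide the length-k context window
    return dict(subseqs)

print(split_by_state('0010110111', k=1))  # -> {'0': '00111', '1': '01011'}
```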
Write a function
prob_mixture_markov_seq(string, char_to_index, alpha=1/2, k=0)
which takes string, char_to_index, alpha, k as inputs, and outputs the sequential distribution dist_array of the add-$\alpha$ probability of the input string $x^n$ with respect to the $k$-th order Markov model.
Here, k is an int representing the Markov order.
def prob_mixture_markov_seq(string, abc, alpha=1/2, k=0, init_state=None):
if k == 0:
return prob_mixture_iid_seq(string, abc, alpha=alpha)
char_to_index = process_abc(abc)
index_to_char = {v: key for key,v in char_to_index.items()}
dist_array = np.zeros((len(string)+1, len(abc)))
counter = {} # key=state, value=counts
suffix = index_to_char[0]*k if init_state is None else init_state
for i in range(len(string)+1):
counter[suffix] = counter.get(suffix, np.zeros(len(abc)) + alpha)
dist_array[i] = counter[suffix]
if i < len(string):
counter[suffix][char_to_index[string[i]]] += 1
suffix = suffix[1:] + string[i]
dist_array /= dist_array.sum(axis=1)[:, np.newaxis] # normalization
return dist_array
A, prob_mixture_markov_seq(A, '012', alpha=1/2, k=0)
A, prob_mixture_markov_seq(A, '012', alpha=1/2, k=1)
A, prob_mixture_markov_seq(A, '012', alpha=1/2, k=2)
B, prob_mixture_markov_seq(B, '01', alpha=1/2, k=1)
B, prob_mixture_markov_seq(B, '012', alpha=1/2, k=2)
C, prob_mixture_markov_seq(C, '012', alpha=1/2, k=3)
C, prob_mixture_markov_seq(C, '012', alpha=1/2, k=5)
In this problem, we implement the context tree weighting (CTW) algorithm. Recall that, for a maximum context depth $D$, $$ q^s_{\text{CTW}}(x^n)= \begin{cases} \displaystyle\frac{1}{2}q^s_{\text{KT}}(x^n) +\frac{1}{2}\prod_{y\in \mathcal{X}}q^{ys}_{\text{CTW}}(x^n) &\text{if }|s|<D\\ \displaystyle q^s_{\text{KT}}(x^n) &\text{if }|s|=D \end{cases} $$
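Before turning to the sequential implementation below, the recursion can be evaluated directly (and naively) by enumerating all nodes up to depth $D$. The sketch below is exponential in the depth and meant only for cross-checking on short strings; it assumes `process_abc` and `np` from the cells above and the all-zeros initial context used by the fast implementation further below. The name `ctw_block_prob` is ours.

```python
# A deliberately naive evaluation of the CTW block-probability recursion above.
# Assumes `process_abc` and `np` from the cells above; initial context = all zeros.
def ctw_block_prob(string, abc, alpha=1/2, max_depth=1):
    char_to_index = process_abc(abc)
    M = len(char_to_index)
    x = [char_to_index[c] for c in string]
    padded = [0] * max_depth + x
    contexts = [tuple(padded[i:i + max_depth]) for i in range(len(x))]  # context of x_i, most recent symbol last

    def q_kt(s):
        # add-alpha probability of the symbols emitted from contexts having suffix s
        counts = np.zeros(M) + alpha
        prob = 1.0
        for ctx, xi in zip(contexts, x):
            if ctx[len(ctx) - len(s):] == s:
                prob *= counts[xi] / counts.sum()
                counts[xi] += 1
        return prob

    def q_ctw(s):
        if len(s) == max_depth:
            return q_kt(s)
        return 0.5 * q_kt(s) + 0.5 * np.prod([q_ctw((y,) + s) for y in range(M)])

    return q_ctw(())  # root = empty context lambda
```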
Write a function
prob_ctw_seq(string, chars)
that outputs a sequential distribution dist_array produced by CTW.
Note that a naive implementation of the sequential probability assignment \begin{align} q^{\lambda}_{\text{CTW}}(\alpha|x^n) = \frac{q^{\lambda}_{\text{CTW}}(x^n\alpha)}{q^{\lambda}_{\text{CTW}}(x^n)} \end{align} for each $\alpha\in\mathcal{X}$ using the above recursion would be very slow, since it recomputes the block probabilities from scratch at every step.
The following is a summary of Section 2 of [2], which allows a much faster implementation.
For a given maximum depth $D$, let us assume that we are given an initial context $s_1=x_{-D+1}^{0}$, and omit the conditioning for simplicity. That is, we write $q^{\lambda}_{\text{CTW}}(\alpha|x^n)=q^{\lambda}_{\text{CTW}}(\alpha|x_{-D+1}^0x_1^n)$.
For each $n=0,1,\ldots,N$, we would like to find $q_{\text{CTW}}^{\lambda}(\cdot|x^n)$, where $\lambda$ denotes the empty string. Suppose $|s|<D$. From the recursion formula, we can write \begin{align} q_{\text{CTW}}^s(\alpha|x^n) &=\frac{q_{\text{CTW}}^s(x^n\alpha)}{q_{\text{CTW}}^s(x^n)}\\ &=\frac{q^s_{\text{KT}}(x^n\alpha)+\prod_{y\in \mathcal{X}}q^{ys}_{\text{CTW}}(x^n\alpha)}{q^s_{\text{KT}}(x^n)+\prod_{y\in \mathcal{X}}q^{ys}_{\text{CTW}}(x^n)}\\ &=\frac{\beta^s(x^n)\,q_{\text{KT}}^s(\alpha|x^n)+q_{\text{CTW}}^{x_{n-|s|}s}(\alpha|x^n)}{\beta^s(x^n)+1}, && (\because q^{ys}_{\text{CTW}}(x^n\alpha)=q^{ys}_{\text{CTW}}(x^n) \text{ if } y\neq x_{n-|s|}) \end{align} where $\beta^s(x^n)$ is defined as the ratio $$ \beta^s(x^n)=\frac{q_{\text{KT}}^s(x^n)}{\prod_{y\in\mathcal{X}}q_{\text{CTW}}^{ys}(x^n)}. $$ Hence, $q_{\text{CTW}}^{\lambda}(\cdot|x^n)$ can be found recursively as follows: $$ q_{\text{CTW}}^s(\cdot|x^n)= \begin{cases} \displaystyle\frac{\beta^s(x^n)\,q_{\text{KT}}^s(\cdot|x^n)+q_{\text{CTW}}^{x_{n-|s|}s}(\cdot|x^n)}{\beta^s(x^n)+1} & \text{if }|s|<D\\ q_{\text{KT}}^s(\cdot|x^n) & \text{if }|s|=D. \end{cases} $$ After obtaining the conditional distribution, the ratio $\beta^s(\cdot)$ for each node $s$ on the current context path needs to be updated with the new symbol $x_{n+1}$ if $n\le N-1$, as follows: \begin{align} \beta^s(x^nx_{n+1}) = \beta^s(x^n)\, \frac{q_{\text{KT}}^s(x_{n+1}|x^n)}{q_{\text{CTW}}^{x_{n-|s|}s}(x_{n+1}|x^n)}, &&\text{if }|s|<D. \end{align}
To implement this, use a tree data structure that stores, for each node $s$, the symbol counts and the ratio $\beta^s$.
We extend the existing ```Node``` class to make the CTW implementation easier. Each node stores its symbol counter and $\log_2\beta^s$ (keeping $\beta^s$ in the log domain avoids numerical underflow for long sequences).
class NodeCTW(Node):
def __init__(self, num_abc, depth=0, alpha=1/2):
Node.__init__(self)
self.depth = depth
self.alpha = alpha
self.num_abc = num_abc
self.data = {'counter': np.zeros(num_abc)+alpha, 'log2beta': 0}
def add_child(self):
self.children.append(NodeCTW(self.num_abc, depth=self.depth+1, alpha=self.alpha))
def add_children(self):
Node.add_children(self, self.num_abc)
def prob_ctw_seq(string, abc, alpha=1/2, max_depth=5, init_state=None):
"""
- string: str, a sequence to be parsed
    - abc: the alphabet (a str/list/set of characters, or a char-to-index dict)
    - alpha: float, smoothing parameter of the per-node add-alpha (KT) estimator
    - max_depth: int, maximum context depth D
    - init_state: list of symbol indices used as the initial context (all zeros if None)
"""
char_to_index = process_abc(abc)
abc_size = len(char_to_index)
x = [char_to_index[char] for char in string] # convert a string into a list of indices
dist_array = np.zeros((len(x)+1, abc_size)) # i-th row is the sequential distribution
root = NodeCTW(abc_size, depth=0, alpha=alpha)
suffix = [0] * max_depth if init_state is None else init_state # initial conditioning
for i in range(len(x)+1):
new_symbol = x[i] if i < len(x) else None
dist_array[i] = prob_ctw_seq_recursion(root, suffix, new_symbol=new_symbol)
        if max_depth > 0 and new_symbol is not None:
            suffix = suffix[1:] + [new_symbol]  # slide the context window
return dist_array
def prob_ctw_seq_recursion(node, suffix, new_symbol=None):
# suffix has the length of max_depth
assert node.depth <= len(suffix)
    qKT_node_cond = node.data['counter']/node.data['counter'].sum()  # q_KT^s(.|x^n), computed before updating the counts
    if new_symbol is not None:
        node.data['counter'][new_symbol] += 1  # update the KT counts with the new symbol x_{n+1}
if node.depth == len(suffix):
return qKT_node_cond
else:
if not node.children:
node.add_children()
# 1) Find the next symbol conditional distribution
        next_suffix_symbol = suffix[-node.depth-1]  # y = x_{n-|s|}, which selects the child node ys
qCTW_child_node_dist = prob_ctw_seq_recursion(node.children[next_suffix_symbol], suffix, new_symbol=new_symbol)
beta = 2**node.data['log2beta']
qCTW_node_cond = (beta*qKT_node_cond + qCTW_child_node_dist)/(beta+1)
# 2) Update the beta if there is new_symbol
if new_symbol is not None:
node.data['log2beta'] += np.log2(qKT_node_cond[new_symbol]/qCTW_child_node_dist[new_symbol])
return qCTW_node_cond
prob_mixture_iid_seq(A, '01', alpha=1/2)
prob_ctw_seq(A, '01', max_depth=0)
prob_ctw_seq(A, '01', max_depth=1)
prob_ctw_seq(A, '01', max_depth=2)
prob_ctw_seq(A, '01', max_depth=5)
prob_ctw_seq(B, '01', max_depth=2)
prob_ctw_seq(B, '01', max_depth=5)
prob_ctw_seq(B, '012', max_depth=5)
prob_ctw_seq(D, 'AGTC', max_depth=1)
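As a cross-check (assuming the `ctw_block_prob` sketch above), the sequential conditionals returned by `prob_ctw_seq` should multiply back to the block probability, up to floating-point error:

```python
idx = [int(c) for c in B]
dist = prob_ctw_seq(B, '01', max_depth=2)
seq_prob = np.prod([dist[i, idx[i]] for i in range(len(B))])
print(np.isclose(seq_prob, ctw_block_prob(B, '01', max_depth=2)))  # expected: True
```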
In this problem, we will verify universality and compare the speed of convergence of the mixture strategies. As in Homework #1, we would like to verify that $$ \lim_{n\to\infty}\mathbb{E}_{X^n\sim \mathsf{P}}\left[\log_2\frac{p(X^n)}{q(X^n)}\right]=0 $$ for some distribution $\mathsf{P}$.
To verify this, we draw $N$ independent sequences $X^n(i)\sim \mathsf{P}$ for $i=1,\ldots,N$ to obtain $$ \frac{1}{N}\sum_{i=1}^N \log_2\frac{p(X^n(i))}{q(X^n(i))}\approx \mathbb{E}_{X^n\sim \mathsf{P}}\left[\log_2\frac{p(X^n)}{q(X^n)}\right]. $$ We will consider the following two simple probability model classes.
a) $\mathcal{P_a}$={a Bernoulli($p$) i.i.d. process $X_i\sim \text{Bern}(p)$: $p\in[0,1]$}
b) $\mathcal{P_b}$={a binary symmetric Markov chain $X^n$ of transition matrix $$ P=\begin{bmatrix} 1-p & p\\ p & 1-p \end{bmatrix}, $$ starting from the initial distribution $\pi=[1/2,1/2]$: $p\in[0,1]$}
For the class of Bernoulli processes $\mathcal{P_a}$, find the approximate mean minimax redundancy $$ \frac{1}{n}\max_{p\in \{0.02,0.04,\ldots,0.5\}}\frac{1}{N}\sum_{i=1}^N \log_2\frac{p(X^n{(i)})}{q(X^n{(i)})} $$ for the KT and Laplace mixtures of $k$-th order Markov probabilities ($k\in\{0,1,2\}$), the LZ probability assignment, and CTW with max_depth$\in\{1,2\}$, for $n\in \{10, 50, 100, 500, 1000, 5000\}$, while fixing $N=50$.
Plot $\log_{10} n$ vs. $\log_{10}$ of the approximate mean minimax redundancy curve for each probability assignment in the same plot. There should be 9 curves in the same plot. Which probability assignment gives the fastest convergence rate?
For the class of binary symmetric Markov processes $\mathcal{P_b}$, repeat the same experiment as in Problem 4(a).
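Before running the full experiment, here is a minimal single-point sketch of the Monte Carlo estimate above (one fixed $p$ and $n$, KT i.i.d. mixture only), reusing `generate_mc` and `prob_mixture_iid_seq` from the cells above; the pipeline below generalizes this over $p$, $n$, and the different probability assignments.

```python
# Single-(p, n) sketch of the Monte Carlo redundancy estimate; illustration only.
p, n, N = 0.3, 1000, 50
P = np.array([[1 - p, p], [1 - p, p]])            # identical rows => i.i.d. Bern(p)
est = 0.0
for _ in range(N):
    x, log2_p_true = generate_mc(n, P, pinit=[1 - p, p])
    dist = prob_mixture_iid_seq(''.join(str(xi) for xi in x), '01', alpha=1/2)
    log2_q = sum(np.log2(dist[i, x[i]]) for i in range(n))
    est += (log2_p_true - log2_q) / N
print('empirical per-symbol redundancy at p={}, n={}: {:.4f}'.format(p, n, est / n))
```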
plt.rcParams['figure.figsize'] = (15, 6)
def log2_prob(string, char_to_index, prob_assign_seq_func):
"""
- string: str, a sequence to be parsed
- char_to_index: dict, a dictionary which maps a character to its index
    - prob_assign_seq_func: a function that takes string and char_to_index as inputs and returns dist_array
"""
dist_array = prob_assign_seq_func(string, char_to_index)
log2prob = np.sum([np.log2(dist_array[i, char_to_index[string[i]]]) for i in range(len(string))])
return log2prob
def log2_prob_mixture_markov(string, char_to_index, alpha=1/2, k=1):
return log2_prob(string, char_to_index, lambda x, y: prob_mixture_markov_seq(x, y, alpha=alpha, k=k))
def log2_prob_CTW(string, char_to_index, max_depth=5):
return log2_prob(string, char_to_index, lambda x, y: prob_ctw_seq(x, y, max_depth=max_depth))
def log2_prob_LZ78(string, char_to_index):
return log2_prob(string, char_to_index, prob_LZ78_seq)
def mean_redundancy(log2_prob_assign_func, nlist, plist, N=50, model="iid"):
M = 2 # let's focus on the binary alphabet
red_array = np.zeros((len(plist), len(nlist)))
for i, p in enumerate(plist):
print('model={}, {}-th p={}'.format(model,i+1,p))
if model == "iid":
P = np.array([[1-p, p], [1-p, p]])
pinit = [1-p, p]
elif model == "bsmc":
P = np.array([[1-p, p], [p, 1-p]])
pinit = None
else:
raise ValueError("Input a valid model class either 'iid' or 'bsmc'")
for j, n in enumerate(nlist):
redundancy = 0
for __ in range(N):
x, log2prob = generate_mc(n, P, pinit)
xstr = ''.join([str(xi) for xi in x])
log2prob_assigned = log2_prob_assign_func(xstr, chars_to_dict([str(k) for k in range(M)]))
redundancy += log2prob - log2prob_assigned
red_array[i,j] = redundancy/N/n
return red_array
%%time
nlist = [10, 50, 100, 500, 1000, 5000]
plist = np.arange(0.02, 0.51, 0.02)
model_list = ["iid", "bsmc"]
prob_assignment_name_list = ["Markov(k=0), KT",
"Markov(k=1), KT",
"Markov(k=2), KT",
"Markov(k=0), Laplace",
"Markov(k=1), Laplace",
"Markov(k=2), Laplace",
"Lempel-Ziv",
"CTW(depth=1)",
"CTW(depth=2)"]
log2_prob_assignment_func_list = [lambda x,y: log2_prob_mixture_markov(x,y,k=0,alpha=1/2),
lambda x,y: log2_prob_mixture_markov(x,y,k=1,alpha=1/2),
lambda x,y: log2_prob_mixture_markov(x,y,k=2,alpha=1/2),
lambda x,y: log2_prob_mixture_markov(x,y,k=0,alpha=1),
lambda x,y: log2_prob_mixture_markov(x,y,k=1,alpha=1),
lambda x,y: log2_prob_mixture_markov(x,y,k=2,alpha=1),
log2_prob_LZ78,
lambda x,y: log2_prob_CTW(x,y,max_depth=1),
lambda x,y: log2_prob_CTW(x,y,max_depth=2)]
red_array_dict = {}
for model in model_list:
red_array_dict[model] = [np.zeros((len(plist), len(nlist))) for __ in range(len(log2_prob_assignment_func_list))]
for i, log2_prob_assignment in enumerate(log2_prob_assignment_func_list):
print(model, prob_assignment_name_list[i])
red_array_dict[model][i] = mean_redundancy(log2_prob_assignment, nlist, plist, N=5, model=model)
import itertools
for i, model in enumerate(model_list):
plt.subplot(1,2,i+1)
markers = itertools.cycle((',', '+', '.', 'x', '*'))
for j in range(len(red_array_dict[model])):
plt.loglog(nlist, red_array_dict[model][j].max(axis=0),
marker=next(markers),
label=prob_assignment_name_list[j])
plt.legend(loc=3)
plt.title('Worst expected redundancy for '+model)
    plt.xlabel(r'$\log_{10}$(length of sequence $n$)')
    plt.ylabel(r'$\log_{10}$(expected redundancy [bits])')
Note that the slope of each curve represents the exponent of the convergence rate of the weighting scheme. Except for LZ, which converges noticeably more slowly, all weighting schemes have similar rates.
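To make the slope remark concrete, one can fit a line to each curve in log-log scale; a minimal sketch for the i.i.d. panel, assuming `nlist`, `prob_assignment_name_list`, and `red_array_dict` from the cells above. For the mixture/CTW schemes the fitted slope should come out close to $-1$ (per-symbol redundancy of order $(\log n)/n$), while the LZ curve is noticeably flatter.

```python
# Rough read-off of the convergence exponent from the worst-case redundancy curves.
for j, name in enumerate(prob_assignment_name_list):
    y = red_array_dict['iid'][j].max(axis=0)                 # worst case over p, as a function of n
    slope = np.polyfit(np.log10(nlist), np.log10(y), 1)[0]   # slope in log-log scale
    print('{:>22s}: slope ~ {:.2f}'.format(name, slope))
```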
For each $n\in \{500, 1000, 5000\}$, find the approximate mean minimax redundancy of the add-$\alpha$ mixture for $\alpha\in\{0.1,0.2,\ldots, 2.9,3\}$, and plot the values with respect to $\alpha$. There should be 3 curves in the same plot. Which $\alpha$ achieves the minimum mean minimax redundancy?
%%time
nlist = [500, 1000, 5000]
plist = np.arange(0.02, 0.51, 0.04)
alpha_list = list(np.arange(0.1, 3.01, 0.1))  # alpha in {0.1, 0.2, ..., 3.0}
red_array_dict = {}
for model in model_list:
red_array_dict[model] = np.zeros((len(alpha_list), len(nlist)))
for i, alpha in enumerate(alpha_list):
        if model == "iid":
log2_prob_assignment_func = lambda x,y: log2_prob_mixture_markov(x,y,k=0,alpha=alpha)
else:
log2_prob_assignment_func = lambda x,y: log2_prob_mixture_markov(x,y,k=1,alpha=alpha)
red_array_dict[model][i, :] = mean_redundancy(log2_prob_assignment_func,
nlist, plist, N=10, model=model).max(axis=0)
for i, model in enumerate(model_list):
plt.subplot(1,2,i+1)
plt.title('Worst expected redundancy for '+model)
plt.xlabel(r'$\alpha$')
    plt.ylabel(r'$\log_{10}$(expected redundancy [bits])')
markers = itertools.cycle((',', '+', '.', 'x', '*'))
for j in range(len(nlist)):
plt.plot(alpha_list, np.log10(red_array_dict[model][:, j]),
marker=next(markers),
label=nlist[j])
plt.legend(loc=3)
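The minimizing $\alpha$ can also be read off numerically from the computed array (assuming `red_array_dict`, `alpha_list`, and `nlist` from the cells above):

```python
# For each model and each n, pick the alpha with the smallest worst-case redundancy.
for model in model_list:
    best = {n: round(alpha_list[int(np.argmin(red_array_dict[model][:, j]))], 1)
            for j, n in enumerate(nlist)}
    print(model, best)
```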
Note that the right plot has been obtained with the Markov mixture of order $k=1$.
In this problem, we compress the same data as in Homework #1, now using the CTW probability assignment with arithmetic coding.
For max_depth, try 5 or 10. Considering the time complexity, what would be a good choice in practice?
from arithmetic_enc_public import *
def compress_with_CTW(filename, char_to_index=None, max_depth=5, verbose=False):
file = open(filename, 'r', encoding="utf8")
string = file.read()
file.close()
if char_to_index is None:
char_to_index = chars_to_dict(string)
dist_array = prob_ctw_seq(string, char_to_index, max_depth=max_depth)
encoded_bits = arithmetic_enc_offline(string, char_to_index, dist_array)
    compress_ratio = len(encoded_bits) / (len(string) * np.log2(len(char_to_index)))  # relative to log2|X| bits per symbol
    if verbose:
        print("The compression ratio is {}%.".format(compress_ratio*100))
    return compress_ratio
filename = 'data/CElegans_sample.txt'
alphabet = 'AGTC'
max_depth_list = range(20)
compress_ratio_list = np.zeros(len(max_depth_list))
for i, max_depth in enumerate(max_depth_list):
print("# For max_depth={}".format(max_depth))
print("\tFilename '{}': ".format(filename), end = '')
%time compress_ratio_list[i] = compress_with_CTW(filename, chars_to_dict(alphabet),\
max_depth=max_depth, verbose=True);
plt.plot(max_depth_list, compress_ratio_list, marker='*')
plt.xlabel('max_depth')
plt.ylabel('compression ratio')
The minimum compression ratio can be (almost) achieved with max_depth = 5.
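The same observation can be read off numerically (assuming `max_depth_list` and `compress_ratio_list` from the cells above):

```python
best_depth = max_depth_list[int(np.argmin(compress_ratio_list))]
print('best max_depth: {}, compression ratio: {:.4f}'.format(best_depth, compress_ratio_list.min()))
```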
nums = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
en_abc = {' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}
fr_letters = {'é',
'à', 'è', 'ù',
'â', 'ê', 'î', 'ô', 'û',
'ç',
'ë', 'ï', 'ü'}
filename_list = ['data/sample_the_little_prince_en.txt',
'data/sample_the_little_prince_fr.txt']
alphabet_list = [en_abc, en_abc|fr_letters]
max_depth_list = range(10)
compress_ratio_dict = {filename: np.zeros(len(max_depth_list)) for filename in filename_list}
for i, max_depth in enumerate(max_depth_list):
print("# For max_depth={}".format(max_depth))
for filename, alphabet in zip(filename_list, alphabet_list):
print("\tFilename '{}': ".format(filename), end = '')
%time compress_ratio_dict[filename][i] = compress_with_CTW(filename, chars_to_dict(alphabet),\
max_depth=max_depth, verbose=True);
for filename in filename_list:
plt.plot(max_depth_list, compress_ratio_dict[filename], label=filename, marker='*')
plt.legend(loc=1)
As with the genome sequence, the minimum compression ratio is (almost) achieved with max_depth = 5. Note that CTW works much better than the LZ probability assignment. Interestingly, with a large enough maximum depth, the compression ratios of the two texts become almost the same!