\documentclass{article}
\title{CMPE 598 - Lecture 4}
\date{Feb 27, 2018}
\author{Scribe : H\"{u}seyin Bilge Ya\u{g}c{\i}}
\usepackage{geometry}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{physics}
\usepackage{float}
\usepackage{tikz}
\usetikzlibrary{angles,quotes}
\usepackage{amsthm}
\geometry{a4paper}
\pagenumbering{arabic}
\begin{document}
\maketitle
%Recall section
\section{Recall from last week}
In the last lecture, we proved that for promise problems, QFAs are economical in the number of states compared to DFAs. Up to now, we have depended on exactness of QFAs, but exactness will be relaxed in the upcoming lectures.
\paragraph{Today's topic}
What about general language recognition, without promises? In this setting, any string can appear and the machine must respond correctly. We will try to find whether QFAs have advantages - state economy or functionality - compared to DFAs in general language recognition, or not.
%Distinguishability
\section{Distinguishability and Myhill-Nerode Theorem}
\theoremstyle{definition}
\newtheorem*{dist}{Distinguishability}
\begin{dist}
Let $x$ and $y$ be strings, and let $L$ be any language. We say that $x$ and $y$ are distinguishable by $L$ if some string $z$ exists such that exactly one of the strings $xz$ and $yz$ is a member of $L$. If $x$ and $y$ are not distinguishable by any $z$, i.e. if for every $z$, we have $xz$ $\in$ $L$ iff $yz$ $\in$ $L$; we denote this with $x$ $\equiv_L$ $y$.
\end{dist}
Indistinguishability ($\equiv_L$) is an equivalence relation, and the set of all strings is partitioned into equivalence classes by it.
\theoremstyle{definition}
\newtheorem*{defin}{Index of a language}
\begin{defin}
Let $L$ be a language. We define \textbf{index} of $L$ as the maximum number of elements in any set that are pairwise distinguishable by $L$.
\end{defin}
\paragraph{Examples}
\begin{itemize}
\item Let $\Sigma_1 = \{0,1\}$, $ L_1 = \{ l |$ $l$ ends with 1\}. For $x$ = 1, $y$ = 0, $z$ = $\epsilon$ ; $xz$ $\in$ $L_1$, but $yz$ $\not\in$ $L_1$; thus $x$ and $y$ are distinguishable. In this context, $w$ = 110001 and $u$ = 11 are indistinguishable and $w \equiv_{L_1} u$. Here, the strings with the same ending symbol belong to the same equivalence class, i.e. $L_1$ has an index of 2.
\item Let $\Sigma_2 = \Sigma_1, L_2 = \{ l |$ $l$ contains equal amounts of 1's and 0's\}. $x$ = 010 and $y$ = 100 are indistinguishable by $L_2$. Here, the strings are divided into classes with respect to the discrepancy between 1's and 0's inside them; therefore, index of $L_2$ is infinite.
\item Let $\Sigma_3 = \{1\}, L_3 = \{11,111\}$. The elements of $\{ \epsilon, 1, 11, 111, 1111 \}$ are pairwise distinguishable and thus form equivalence classes, but all the remaining strings are indistinguishable from 1111 by $L_3$.
\end{itemize}
\newtheorem*{lemma}{Lemma}
\begin{lemma}
If $L$ is recognizable by a DFA with $k$ states, then it has index at most $k$.
\end{lemma}
\begin{proof}
Assume the index of $L$ is greater than $k$, which means that there are at least $k+1$ strings that are pairwise distinguishable. By the pigeonhole principle, at least two of these strings must bring the machine to the same state. Those two strings are then indistinguishable by $L$, which contradicts the assumption.
\end{proof}
\newtheorem*{lemma2}{Lemma}
\begin{lemma2} If the index of language $L$ is a finite number $k$, then it is recognized by a DFA with $k$ states.\end{lemma2}
\begin{proof} Let $\{S_0, S_1, \dots ,S_{k-1} \}$ be a set of strings that are pairwise distinguishable by $L$. Let $D$ be the DFA with
\begin{align*}
&D= (Q,\Sigma, \delta, q_0, F) \\ &Q = \{q_0, q_1, \dots , q_{k-1} \} \\ &\delta(q_i,a) = q_j \text{, where } S_{i}a \equiv_L S_{j} \text{ for any } a \in \Sigma \\ &F = \{q_i \mid S_i \in L \} \\ &q_0 \text{ is the state } q_i \text{ for which } S_i \equiv_L \epsilon.
\end{align*}
Every state in $D$ corresponds to an equivalence class, thus $D$ recognizes $L$.
\end{proof}
\theoremstyle{plain}
\newtheorem*{myhill}{Myhill - Nerode Theorem}
\begin{myhill} A language $L$ is regular iff it has finite index. \emph{Moreover, the index of $L$ is the size of the smallest DFA recognizing $L$. Proofs are omitted.} \end{myhill}
%Communication Complexity
\section{Communication Complexity}
\paragraph{}
Communication complexity tries to quantify the minimum number of bits to be shared between two parties solving a certain problem. Let Alice and Bob be two individuals. Alice is given string $x$, where Bob is given $y$. Their aim is to figure out if string $xy$ belongs to a certain language $L$. What is the smallest message that Alice can send to Bob to transfer information? The trivial solution to this is sending the full string $x$ to Bob, but generally there exists a shorter message for this job. We will define \textbf{ one-way communication complexity} of language $L$ as the minimum number of bits that has to be sent by Alice. We will mention some concepts from information theory to solve this problem. Ultimately, our aim is to compare quantum and classical communication complexity.
\subsection{Information Theoretical concepts}
\theoremstyle{definition}
\newtheorem*{shannon}{Shannon entropy}
\begin{shannon} The Shannon entropy $\boldsymbol{H(B)}$ of a set of messages, described with random variable $B$, corresponds to the average number of classical bits required to encode the members of this set. For a set with $n$ members and probability distribution $p_1, p_2, \dots , p_n$, Shannon entropy is defined as \begin{equation} \label{eq:1} \boldsymbol{H(B)} \triangleq -\sum_{x=1}^{n} {p_x}{\log_2{p_x}} \end{equation}
\end{shannon}
Shannon entropy can be seen as the number of bits needed to represent a given set fully. From the definition of $ \boldsymbol{H(B)}$, we can see that $ \boldsymbol{H(B)}$ is maximized when \textbf{B} is uniformly distributed, in which case we need $\log_2{n}$ bits; and zero when there is no uncertainty in \textbf{B}'s outcome.
\newtheorem*{vNeumann}{von Neumann entropy}
\begin{vNeumann}
For quantum systems, von Neumann came up with quantum version of the entropy, since the density matrix is somewhat a probability distribution.
von Neumann entropy $\boldsymbol{S(\rho_B)}$ is defined as
\begin{equation} \label{eq:2} \boldsymbol{S(\rho_B)} \triangleq -tr(\rho_B\log_2{\rho_B}) \end{equation}
\end{vNeumann}
von Neumann entropy boils down to the number of quantum bits (qubits) needed to represent the set. $\boldsymbol{S(\rho_B)}$ is maximized when the distribution is uniform, and zero when the states are ``pure'', i.e. when we have complete knowledge of the system and which state it is in.
\subsection{DFA communication complexity}
Consider the infinite two-dimensional matrix $\mu$, where the rows and columns are indexed by the strings $x$ and $y$ respectively, $x,y \in \Sigma^\ast$, $\Sigma = \{0,1\}$. Define $\mu(x,y)$ such that $\mu(x,y) = 1$ iff $xy \in L$, and 0 otherwise. Let $L = \{w | w$ ends with 1$\}$ for now.
$$\begin{array}{l||c|c|c|c|c|c|c|c}
\boldsymbol{\mu} & \epsilon & 0 & 1 & 00 & 01 & 10 & 11 & \dots \\
\hline \hline
\epsilon&0&0&1&0&1&0&1&\dots \\ \hline 0&0&0&1&0&1&0&1&\dots \\ \hline 1&1&0&1&0&1&0&1&\dots \\ \hline 00&0&0&1&0&1&0&1&\dots \\ \hline 01&1&0&1&0&1&0&1&\dots \\ \hline 10&0&0&1&0&1&0&1&\dots \\ \hline 11&1&0&1&0&1&0&1&\dots \\ \hline \vdots &\vdots &\vdots &\vdots &\vdots &\vdots &\vdots &\vdots &\ddots
\end{array}$$
For regular $L$, as seen in this case, matrix $\mu$ has a finite number of distinct rows, corresponding to equivalence classes. Any string $x$ that can be plugged will bring the DFA into one of the finite states. Identical rows mean \emph{indistinguishability}. Alice's job will be only telling Bob which type of unique row $x$ is in. The only information needed is \emph{"$X$ brought me to that state"}. For this particular language, we only need 1 bit because index of $L$ is two. Similarly, for $n$ states, we need $\log_2{n}$ bits.
\subsection{QFA communication complexity} Any QFA with $q$ states can be simulated by Alice and Bob, with Alice sending the state of the QFA after processing $x$, which is the procedure we applied in DFA case. Alice sends $\log_2{q}$ qubits to Bob and it will be sufficient. We are interested in whether the size of QFA is smaller than DFA or not. \emph{(Spoilers: We will show that one-way quantum communication complexity of regular language $L$ with index $d$ is $\log_2{d}$, then conclude that no QFA with fewer than $d$ states exists for this job.)} If the index of regular language $L$ is $d$, reduce the communication matrix to $d$ distinct rows. The information to be sent to Bob is the mixed state of uniformly randomly distributed rows. \newpage Imagine the rows being chosen randomly bit by bit, i.e. column by column. Let $p(0)$ be the probability of a 0 in the first column (i.e. \(\displaystyle \frac{\text{ \#0's in the first column}}{d} \) ). Then 0 is chosen with probability $p(0)$ and 1 is chosen with $1-p(0)$. Partition the rows to the sets $I_0$ and $I_1$, the sets of rows starting with 0's and 1's, respectively. If $b$ is chosen for the first transmitted bit, then the process continues with set $I_b$ and the next column. If a complete row $X$ is determined, let $\boldsymbol{\rho_X}$ denote the density matrix of just the message about that row. Let $\boldsymbol{\rho_t}$ denote the density matrix of possibly mixed message corresponding to a row, starting with $t$ chosen uniformly among all such rows. \paragraph{}
The probability that a $b$ is chosen after $t$ is called $\boldsymbol{p_t(b)}$. The associated RV is called $B$, and the number of different rows beginning with $t$ is called $\boldsymbol{row_t}$. Bob can decide membership of $xy$ in $L$ correctly with probability 1, so he receives exact information about the row corresponding to $X$ in the message sent by Alice.
\theoremstyle{plain}
\newtheorem*{holevo}{Holevo Theorem}
\begin{holevo}
Suppose Alice prepares a quantum state $\rho_x$, where $x \in \{ 0, 1, \dots , n\}$ is chosen with probability $p_x$, with $p_0, p_1, \dots , p_n$ a probability distribution; and then gives it to Bob. Bob performs a measurement on that state, with measurement outcome $Y$. \textbf{The Holevo Bound} states that for any such measurement Bob may do,
\begin{equation} \label{eq:3} \boldsymbol{H(X:Y)} \le \boldsymbol{S(} \rho \boldsymbol{)} - \sum_{x} p_x \boldsymbol{S(} \rho_x \boldsymbol{)} \text{, where }\rho = \sum_{x} p_x \rho_x \end{equation}
\end{holevo}
$H(X:Y)$ denotes the mutual information between $X$ and $Y$, defined as \emph{the amount of ignorance about $Y$ that is reduced due to knowing about $X$}. When $X$ and $Y$ are independent, they have zero mutual information, and when $X$ and $Y$ are identical, $H(X:Y) = H(X) = H(Y)$. In our case, we want $X$ and $Y$ to be identical. Because of the exactness requirement, \begin{equation} \label{eq:4} S(\rho_t) \ge p_t(0)S(\rho_{t_0}) + p_t(1)S(\rho_{t_1}) + H(B) \end{equation} for any $t$. We will show by induction that $S(\rho_t) \ge \log_2{row_t}$.
\begin{itemize}
\item Basis step: when $t$ is a completely chosen row, $row_t = 1$, so $\log_2{row_t} = 0 \le S(\rho_t)$, since von Neumann entropy is always nonnegative.
\item Inductive step: Start by modifying (\ref{eq:4}).
\begin{align}
S(\rho_t) & \ge p_t(0)\underbrace{\log_2{row_{t_0}}}_{\le S(\rho_{t_0})} + p_t(1)\underbrace{\log_2{row_{t_1}}}_{\le S(\rho_{t_1})} + H(B) \\
& \label{eq6} \ge p_t(0)\log_2{\lbrack p_t(0) row_t \rbrack} + p_t(1)\log_2{\lbrack p_t(1)row_t \rbrack} \\& \hspace{10mm} - \nonumber p_t(0)\log_2{ p_t(0)} - p_t(1)\log_2{p_t(1)}
\end{align}
The first two terms in (\ref{eq6}) can be reorganized as $p_t(i)\lbrack \log_2{\lbrack p_t(i)row_t \rbrack}\rbrack \rightarrow p_t(i)\lbrack \log_2{p_t(i)} + \log_2{row_t} \rbrack $ so that the negative contributors from $H(B)$ are cancelled.
\begin{align}
S(\rho_t) & \ge \lbrack p_t(0) + p_t(1) \rbrack \log_2{row_t} = \log_2{row_t} \hspace{14mm}
\end{align}
\end{itemize}
By induction, we proved $S(\rho_t) \ge \log_2{row_t}$. Selecting $t = \epsilon$, we find $S(\rho_\epsilon) \ge \log_2{row_\epsilon} = \log_2{d}$, which is the same result we obtained in the DFA case. Thus, we can conclude that \textbf{when the QFA is required to work with zero error, it has no state advantage over DFA for recognizing a regular language.}
\end{document}