\documentclass{article}
\input{6897-preamble}
\usepackage{clrscode}

\begin{document}
\psetnum{1}
\date{2004/02/05}

\begin{pset}
  Let $G$ be the graph with $cn$ vertices and edge set $\set{h_1(x),
    h_2(x) | x \in S}$ for uniformly random hash functions $h_1$, $h_2$.

  \begin{lemma}
    With probability at least $\nicefrac{1}{2}$, $G$ contains no cycles.
  \end{lemma}

  \begin{proof}
    Let $C_t$ be the event that a sequence of length $t$ is a cycle.
    We provide an upper bound on the number of such cycles. There are
    $\binom{cn}{t}$ choices for the vertices, and $\frac{t!}{t} =
    (t-1)!$ non-isomorphic orderings of the vertices into a cycle.
    Since the cycle contains $t-1$ edges, there are also
    $n(n-1)\cdots(n-t+1)$ choices of which values in $S$ map to the
    associated hash values. As in the two-cycle-case analysis, the
    probability of any configuration occurring is $2^t
    (cn)^{-2t}$. Thus,
    \[\Pr{C_t} \le \frac{\binom{cn}{t} (t-1)! n(n-1)\cdots(n-t+1)
      2^t}{(cn)^{2t}} \]
    We use the (rough) upper bounds of $(cn)^t$ for $\binom{cn}{t}
    (t-1)!$ and $n^t$ for $n(n-1)\cdots(n-t+1)$.  Applying these and the union
    bound, the probability of having at least one cycle is
    \[ \Pr{C} \le \Pr{\bigcup_{t=2}^\infty C_t} \le \sum_{t=2}^\infty
    \Pr{C_t} \le \sum_{t=2}^\infty \frac{(cn)^t n^t 2^t}{(cn)^{2t}} =
    \sum_{t=2}^\infty \left(\frac{2}{c}\right)^t = \frac{\frac{4}{c^2}}{
      1 - \frac{2}{c}} = \frac{4}{c(c-2)} \]
    which is less than $\nicefrac{1}{2}$ for $c > 4$. This proves the lemma.    
  \end{proof}

  We use this lemma to construct a static Bloomier filter data
  structure. We construct a table two tables $T_1$ and $T_2$, each
  with $4n$ cells of $r$ bits that have the property that for each $x
  \in S$, $f(x) = T_1[h_1(x)] \oplus T_2[h_2(x)]$. Then a
  \proc{Lookup} operation requires only two table accesses and a XOR
  operation.

  To construct the data structure, we first select a pair of hash
  functions $h_1,h_2$ such that the associated graph $G$ has no
  cycles. We accomplish this by selecting a pair of random hash
  functions, hashing each of the elements, constructing the graph, and
  verifying (via DFS) that it has no cycles. If it fails, we select a
  new pair and rehash. This requires polynomial time. By the lemma
  above, the expected number of rehashes is only 2.

  The remainder of the construction is similar to that of cuckoo
  hashing. For each element $x$, if $T_1[h_1(x)]$ is empty, we insert
  $f(x) \oplus T_2[h_2(x)]$ into $T_1[h_1(x)]$. If this cell is
  occupied, we evict the current occupant (call it $y$) and set
  $T_1[h_1(x)]$ anyway, replacing $T_2[h_2(y)]$ with the new value of
  $T_1[h_1(y) \oplus f(y)$. We recursively perform this eviction
  procedure, changing values to maintain the XOR invariant, until we
  reach an unoccupied cell. This is guaranteed to happen, since we
  chose the hash functions so that there would be no cycles. Expected
  construction time is polynomial in $n$.
\end{pset}

\end{document}
