\documentclass[pdf,ps,corners]{prosper}
\usepackage{clrscode}
\usepackage{ifthen,keyval}
\usepackage{amsmath,amscd}
\usepackage{graphicx}

% Algorithmic stuff
\newcommand{\bigO}[1]{\ensuremath{O\left(#1\right)}}
\newcommand{\littleO}[1]{\ensuremath{o\left(#1\right)}}
\newcommand{\bigTheta}[1]{\ensuremath{\Theta\left(#1\right)}}
\newcommand{\bigOmega}[1]{\ensuremath{\Omega\left(#1\right)}}
\newcommand{\littleOmega}[1]{\ensuremath{\omega\left(#1\right)}}


% \Logo{\includegraphics{clipart/csail-logo-notext}}
\newcommand{\csaillogo}{\includegraphics[scale=0.25]{clipart/csail-logo-notext}}
\Logo{\csaillogo}
\renewcommand{\FrameWithCorners}[1]{\PutLogo{#1}}
% Useful prosper intro: http://freshmeat.net/articles/view/667/

% Collection of solutions or improvements for general problems which
% together form a coherent content sharing system

\graphicspath{{mp/}{clipart/}}

%
% Problem/solution formatting commands
%
\newcommand{\problem}[1]{\centering \textbf{Problem:} #1}

%
% Clipart helpers
%
\newcommand{\client}[2][client]{\raisebox{-.75em}{\includegraphics{#1}}#2}
\newcommand{\tinyclient}[2][client]{\raisebox{-.2em}
  {\includegraphics[scale=.5]{#1}}#2}

%
% Title slide setup
%
\title{Arpeggio:}
\subtitle{Metadata~Searching and Content~Sharing
  in~Structured~Peer-to-Peer~Networks}
\author{Dan R. K. Ports}
\institution{MIT Computer Science and Artificial Intelligence
  Laboratory\\Joint work with Austin T. Clements and David R. Karger}
\email{drkp@mit.edu}
\slideCaption{Arpeggio}

%
% Programmatic slides
%
\newcommand{\sectionslide}[1]{%
  \def\fmt##1##2{\ifthenelse{#1 = ##1}{\textbf{##2}}{##2}}
  \begin{slide}{Outline}
    \begin{itemize}
    \item \fmt{1}{Overview}
    \item \fmt{2}{Searching}
    \item \fmt{3}{Index Gateways}
    \item \fmt{4}{Content Distribution}
    \item \fmt{5}{Conclusion}
    \end{itemize}
  \end{slide}}

%
% Slides
%
\begin{document}

\maketitle

\sectionslide{1} % Overview

% Let's not even pretend to talk about related work. (It's not like
% any of it is important, right? In the beginning, the universe was
% without form and void, and then Arpeggio came along.)


% XXX Still working here. Rearrange/combine these four slides in
% interesting ways.
\begin{slide}{The Content-Sharing Problem}
  \begin{itemize}
  \item Goals
    \begin{itemize}
      \item Find files matching a search query
      \item Identify sources for a file
      \item Want full decentralization
    \end{itemize}

  \item Assumptions
    \begin{itemize}
      \item Only searching \emph{metadata}
      \item Metadata is small (compared to actual data)
      \item Highly dynamic and unstable network topology, content, and
        sources
    \end{itemize}
  \end{itemize}
\end{slide}

\begin{slide}{P2P Evolution}
  \begin{itemize}
  \item Centralized index systems (Napster)
    \begin{itemize}
    \item Content shared peer-to-peer, but index centralized
    \item Scale poorly, single point of failure
    \end{itemize}
  \item Unstructured overlay networks (Gnutella)
    \begin{itemize}
    \item Queries by flooding or random walks
    \item Large network impact
    \item Sacrifices perfect recall
    \end{itemize}
  \end{itemize}
\end{slide}

\begin{slide}{DHTs to the Rescue?}
  \begin{itemize}
  \item Useful building block: \emph{distributed hash tables} (DHTs)
    \begin{itemize}
    \item e.g. \textbf{Chord}, Pastry, Tapestry, Kademlia, ...
    \end{itemize}
    
  \item \proc{Lookup} abstraction: locate peer responsible for key
  \item \proc{Get-Block}/\proc{Put-Block} hash table operations
    \vspace{1em}
  \item Efficient: $\bigO{\log n}$ messages per lookup
  \item Scalable: $\bigO{\log n}$ routing state per node
  \item Robust: maintains correctness despite node joins \& failures
  \end{itemize}
\end{slide}

\overlays{7} {
  \begin{slide}{Building a Distributed Hash Table}
    \begin{itemize}
    \item
      \onlySlide*{1}{
        \textbf{Consistent hashing:} which node stores an object?
      }
      \onlySlide*{2}{
        Map nodes and objects to same circular keyspace
      }
      \onlySlide*{3}{
        Successor node is responsible for each key
      }
      \onlySlide*{4}{
        Pointers to successors --- slow routing possible
      }
      \onlySlide*{5}{
        Exponentially spaced \emph{fingers} --- $\bigO{\log n}$ hop routing\hspace{-1em}
      }
      \onlySlide*{6}{
        N8: \proc{Lookup}(K192) --- N156 is closest finger
      }
      \onlySlide*{7}{
        N156 knows N205 is responsible, forwards message
      }
      
    \end{itemize}
    
    \begin{center}
      \onlySlide*{2}{
        \includegraphics[scale=0.35]{mpchord/chash2}
      }      
      \onlySlide*{3}{
        \includegraphics[scale=0.35]{mpchord/chash2}
      }      
      \onlySlide*{4}{
        \includegraphics[scale=0.35]{mpchord/successors}
      }      
      \onlySlide*{5}{
        \includegraphics[scale=0.35]{mpchord/fingers1}
      }      
      \onlySlide*{6}{
        \includegraphics[scale=0.35]{mpchord/fingers1}
      }
      \onlySlide*{7}{
        \includegraphics[scale=0.35]{mpchord/fingers2}
      }      
    \end{center}
  \end{slide}
}

\begin{slide}{DHTs for File Sharing?}
  \begin{itemize}
  \item Can we just store everything in a DHT?
    \begin{itemize}
    \item DHT is great for \emph{lookup by name},\\but usually we
      don't know the filename!
    \item Need complex \emph{search by keyword} queries instead
    \end{itemize}
    \vspace{1em}
  \item \textbf{Arpeggio:} indexing based on DHT techniques
    \begin{itemize}
    \item Uses Chord \proc{Lookup} primitive
    \item Maintains a \emph{distributed keyword-set index}
    \item Indirect storage of file data
    \end{itemize}
  \end{itemize}
\end{slide}
% \begin{slide}{Overall Architecture}
%   \begin{itemize}
%   \item Search -- Distributed keyword-set index
%   %\item Index Maintenance --
%   \item Content Distribution -- Indirect storage
%   \end{itemize}
%   % XXX
% \end{slide}

\sectionslide{2} % Searching

\def\icon#1{%
  \raisebox{-.3em}{\includegraphics[height=1.3em]{#1}}
}
\def\icona{\icon{debian}}
\def\iconb{\icon{tux}}
\def\iconc{\icon{freebsd}}

\makeatletter
\newbox\ms@spacebox
\def\maybeshow#1#2{%
  \ifthenelse{\equal{#1}{t}}%
    {#2}%
    {\setbox\ms@spacebox\hbox{#2}\hspace{\wd\ms@spacebox}}
}

\define@key{showrecords}{abig}[true]{%
  \begin{tabular}{|ll|}
    \hline
    \multicolumn{2}{|l|}{(debian, disk1, iso) \hfill \icona} \\
    \hline
    name & Debian Disk1.iso \\
    file ID & cdb79ca3db1f39b1940ed5... % 8aa98da2e7
    \\
    size & 586MB \\
    type & application/x-iso9660-image \\
    \multicolumn{1}{|c}{$\vdots$} & \\
    \hline
  \end{tabular}
}
\define@key{showrecords}{a}[true]{%
  \begin{tiny}
    \begin{tabular}{|ll|}
      \hline
      \multicolumn{2}{|l|}{(debian, disk1, iso) \hfill \icona}
      \\
      \hline
      name & Debian Disk1.iso \\
      file ID & cdb79ca3db1f3... % 9b1940ed58aa98da2e7
      \\
      \hline \multicolumn{2}{l}{}
    \end{tabular}
  \end{tiny}
}
\define@key{showrecords}{b}[true]{%
  \begin{tiny}
    \begin{tabular}{|ll|}
      \hline
      \multicolumn{2}{|l|}{(debian, disk2, iso) \hfill \iconb}
      \\
      \hline
      name & Debian Disk2.iso \\
      file ID & 5ccbf54d7e502... % 0a196353569fdbb0e5c
      \\
      \hline \multicolumn{2}{l}{}
    \end{tabular}
  \end{tiny}
}
\define@key{showrecords}{c}[true]{%
  \begin{tiny}
    \begin{tabular}{|ll|}
      \hline
      \multicolumn{2}{|l|}{(disk1, freebsd, iso) \hfill \iconc}
      \\
      \hline
      name & Freebsd Disk1.iso \\
      file ID & fbcbfdff31f27de... % 396f257e0a37a78b8
      \\
      \hline \multicolumn{2}{l}{} \\
    \end{tabular}
  \end{tiny}
}
\def\showrecords[#1]{\setkeys{showrecords}{#1}}
\makeatother

\begin{slide}{Index Entries}
  % First, let's examine what we're searching _for_.
  % * Example index entry
  % * Contains metadata
  % * Most is essentially payload
  % ** Including obtaining info
  % * Keywords from filename, but could be more complex
  \begin{center}
    \showrecords[abig]
  \end{center}
\end{slide}

\bgroup
\def\showarec{1}
\def\showa{2}
\def\showb{3}
\def\showc{4}
\def\showquerier{5}
\def\showquery{6}
\def\showboom{7}
\overlays{\showboom}{
  \begin{slide}{Distributed Inverted Indexing}
    % Now, a typical way of doing this search would be with
    % distributed inverted indexing, which looks something like this.
    % * Find nodes responsible for keywords via Chord
    \begin{minipage}{2in}
      \fromSlide{\showarec}{\showrecords[a]}
      \fromSlide{\showb}{\showrecords[b]}
      \fromSlide{\showc}{\showrecords[c]}
    \end{minipage}
    \hfill
    \begin{minipage}{2in}
      \begin{tabular}{rlll}
        \fromSlide{\showa}{%
          \fromSlide*{\showquery}{\tinyclient{\textbf{debian}}}%
          \untilSlide*{\showquerier}{\tinyclient{debian}}}
        & \fromSlide*{\showquery}
        {\psframebox[framesep=0pt,linecolor=red]
          {\fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb}}}%
        \untilSlide*{\showquerier}
        {\psframebox[framesep=0pt,linecolor=white]
          {\fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb}}} \\
        \fromSlide{\showa}{%
          \fromSlide*{\showquery}{\tinyclient{\textbf{disk1}}}%
          \untilSlide*{\showquerier}{\tinyclient{disk1}}}
        &  \fromSlide*{\showquery}
        {\psframebox[framesep=0pt,linecolor=red]
          {\fromSlide{\showa}{\icona} \fromSlide{\showc}{\iconc}}}%
        \untilSlide*{\showquerier}
        {\psframebox[framesep=0pt,linecolor=white]
          {\fromSlide{\showa}{\icona} \fromSlide{\showc}{\iconc}}} \\
        \fromSlide{\showa}{\tinyclient{iso}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb}
        \fromSlide{\showc}{\iconc} \\
        \fromSlide{\showb}{\tinyclient{disk2}}
        & \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showc}{\tinyclient{freebsd}}
        & \fromSlide{\showc}{\iconc} \\
      \end{tabular}

      \FromSlide{\showquerier}
      \begin{center}
        \untilSlide*{\showquery}{%
          \client[client-notflaming]{``debian disk1?''}}%
        \fromSlide*{\showboom}{%
          \client[client-flaming]{``debian disk1?''}} \\
        \fromSlide{\showquery}{%
          \{\icona, \iconb\} $\cap$ \{\icona, \iconc\}}
      \end{center}
    \end{minipage}

    \FromSlide{\showboom}
    \problem{Network traffic for large indexes}
  \end{slide}
}
\egroup

% \begin{slide}{Distributed Indexing}
%   \begin{itemize}
%   \item Inverted index: DHT maps keywords to file IDs
%   \item To lookup ``a b'':
%     % XXX Real pseudo-code?
%     \begin{itemize}
%     \item \proc{Get} indexes for ``a'' and ``b''
%     \item Client performs intersection
%     \end{itemize}
%   \item \problem{wasteful of bandwidth -- transfers entire index}
%   \end{itemize}
% \end{slide}

\bgroup
\def\showquerier{3}
\def\showquery{4}
\def\showboom{5}
\overlays{\showboom}{
  % One approach to this problem is to add network-side processing
  % * ...  Now you're all probably thinking that this is going to make
  %   the whole Internet catch on fire, but I'm going to try to
  %   convince you otherwise
  \begin{slide}{Index-Side Filtering}
    \onlySlide*{1}{%
      \begin{itemize}
      \item Keywords are small, so store keywords in index
      \item Pick one index node
      \item Send full query
      \item Index node performs filtering and returns only relevant
        results
      \item Can also include other \emph{filterable metadata}, e.g. file
        size, MP3 bitrate, etc.
      \end{itemize}
    }
    \FromSlide{2}
    \begin{minipage}{2in}
      \showrecords[a,b,c]
    \end{minipage}
    \hfill
    \begin{minipage}{2in}
      \begin{tabular}{rlll}
        \untilSlide*{\showquerier}{\tinyclient[client-notflaming]{debian}}%
        \onlySlide*{\showquery}{\tinyclient[client-notflaming]{\textbf{debian}}}%
        \fromSlide*{\showboom}{\tinyclient[client-flaming]{\textbf{debian}}}
        & \fromSlide*{\showquery}
        {\psframebox[framesep=0pt,linecolor=red]{\icona} \iconb}%
        \untilSlide*{\showquerier}
        {\psframebox[framesep=0pt,linecolor=white]{\icona} \iconb} \\
        \tinyclient{disk1} & \icona \iconc \\
        \tinyclient{iso} & \icona \iconb \iconc \\
        \tinyclient{disk2} & \iconb \\
        \tinyclient{freebsd} & \iconc \\
      \end{tabular}

      \FromSlide{\showquerier}
      \begin{center}
        \client[client-notflaming]{``debian disk1?''} \\
        \fromSlide{\showquery}{\{\icona\}}
      \end{center}
    \end{minipage}

    \FromSlide{\showboom}
    \problem{Poor query load-balancing}
  \end{slide}
}
\egroup

% \begin{slide}{Index-Side Filtering}
%   \begin{itemize}
%   \item Metadata is small, so store \emph{entire} metadata in index
%   \item Send full query to index node
%   \item Index node performs filtering and returns only relevant
%     results
%   \item Can also include other \emph{filterable metadata}, e.g. file
%     size, MP3 bitrate, etc.
%   \end{itemize}
%   \problem{index nodes for popular keywords are overloaded}
% \end{slide}

\bgroup
\def\showa{2}
\def\showb{3}
\def\showc{4}
\def\showquerier{5}
\def\showquery{6}
\overlays{\showquery}{
  % One approach to this problem is to add network-side processing
  % * ...  Now you're all probably thinking that this is going to make
  %   the whole Internet catch on fire, but I'm going to try to
  %   convince you otherwise
  \begin{slide}{Keyword-Set Indexing}
    \onlySlide*{1}{%
      \begin{itemize}
      \item Build index on \emph{keyword sets} rather than keywords
      \item Store subsets of size $\le K$
      \item More keyword-set indexes, but each is shorter
      \item Single-keyword indexes are less important, so can be
        truncated
        \begin{itemize}
        \item <~29\% of web searches have only 1 keyword. [Reynolds \&
          Vahdat 2003]
        \end{itemize}
      \item To search: send filtered query to any $K$-size subset
        index
      \end{itemize}
    }
    \FromSlide{2}
    \vspace{-2em}
    \begin{minipage}{2in}
      \fromSlide{\showa}{\showrecords[a]}
      \fromSlide{\showb}{\showrecords[b]}
      \fromSlide{\showc}{\showrecords[c]}
    \end{minipage}
    \hfill
    \begin{minipage}{2in}
      \begin{tabular}{rlll}
        ($K = 2$) & \\
        \fromSlide{\showa}{\tinyclient{debian}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showa}{\tinyclient{disk1}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showc}{\iconc} \\
        \fromSlide{\showa}{\tinyclient{iso}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb}
        \fromSlide{\showc}{\iconc} \\
        \fromSlide*{\showa}{%
          \untilSlide*{\showquerier}{\tinyclient{debian disk1}}%
          \fromSlide*{\showquery}{\tinyclient{\textbf{debian disk1}}}%
        }%
        & \fromSlide{\showa}{%
          \untilSlide*{\showquerier}
          {\psframebox[framesep=0pt,linecolor=white]{\icona}}%
          \fromSlide*{\showquery}
          {\psframebox[framesep=0pt,linecolor=red]{\icona}}%
        } \\
        \fromSlide{\showa}{\tinyclient{debian iso}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showa}{\tinyclient{disk1 iso}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showc}{\iconc} \\
        \fromSlide{\showb}{\tinyclient{disk2}}
        & \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showb}{\tinyclient{debian disk2}}
        & \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showb}{\tinyclient{disk2 iso}}
        & \fromSlide{\showb}{\iconb} \\
        % Bork!
        \untilSlide{0}{\tinyclient{\textbf{debian disk1}}}%
        \fromSlide{\showc}{$\vdots$}
      \end{tabular}
    
      \FromSlide{\showquerier}
      \begin{center}
        \client{``debian disk1?''}
      \end{center}
    \end{minipage}
  \end{slide}
}
\egroup

% \begin{slide}{Keyword-Set Indexing}
%   \begin{itemize}
%   \item Build index on \emph{keyword sets} rather than keywords
%   \item Store metadata block in index for each subset of size $\le K$
%   \item More keyword-set indexes, but each is shorter
%   \item Single-keyword indexes are less important, so can be truncated
%     \begin{itemize}
%     \item <~29\% of web searches have only 1 keyword. [Reynolds \&
%       Vahdat 2003]
%     \end{itemize}
%   \item To search: send filtered query to any $K$-size subset index
%   \end{itemize}
% \end{slide}

\newcommand{\Inastiness}{%
    \sum_{i = 1}^K \binom{m}{i} =
    \begin{cases}
      2^m - 1 & \text{if $m \le K$} \\
      O(m^K) & \text{if $m > K$}
    \end{cases}%
}
\overlays{3}{%
  \begin{slide}{Indexing Cost}
    \onlySlide*{1}{%
      \bgroup
      \setlength{\abovedisplayskip}{0pt}
      \begin{align*}
        m & = \text{metadata keywords} \\
        K & = \text{maximum subset size parameter} \\
        I(m) & = \text{index entries} \\
        & = \Inastiness{}
      \end{align*}
      \egroup
    }
    \FromSlide{2}
    \bgroup
    \setlength{\abovedisplayskip}{0pt}
    \begin{align*}
      I(m) & = \Inastiness{}
    \end{align*}
    \egroup
    \onlySlide*{2}{%
      \begin{center}
        \input{figures/egraph2d.tex}
      \end{center}}
    \onlySlide*{3}{%
      \begin{center}
        For files with many metadata keywords,\\ $I(m)$ is polynomial in
        $m$.
      \end{center}}
  \end{slide}
}

\begin{slide}{Storage Costs (FreeDB)}
  \begin{center}
    \begin{tabular}{lr}
%      Number of discs & 1,582,836 \\
      Number of songs & 21,195,244 \\
      Total index entries ($K=1$) & 134,403,379 \\
      Index entries per song ($K=1$) & 6.274406 \\
      \hline
      Total index entries ($K=3$) & 1,494,688,373 \\
      Index entries per song ($K=3$) & 66.078093
    \end{tabular} \\
    \vspace{1em}
    $\Rightarrow$ Total storage cost only an order of magnitude more
    than required for $K=1$ inverted index.
    % XXX Compare against size of music data (deduce from track
    % lengths)
  \end{center}
\end{slide}

\begin{slide}{Choosing $K$}
  \begin{center}
    \begin{itemize}
    \item Larger $K$ improves query load distribution, increases
      indexing costs
    \end{itemize}
    \begin{center}
      \input{figures/index-size-increase-vs-k}
    \end{center}
    \vspace{-1em}
    \begin{center}
      For web searches: average query length 2.53
    \end{center}
  \end{center}
\end{slide}

\begin{slide}{Index Maintenance}
  \begin{itemize}
  \item Expiration
    \begin{itemize}
    \item File availability constantly changes as peers join/leave
    \item Expiration rather than polling
    \end{itemize}
  \item Replication
    \begin{itemize}
    \item Replication instead of erasure coding
    \item Only weak consistency required
    \item Can periodically propagate updates in batch
    \end{itemize}
  \end{itemize}
  % XXX Is there more to say?
\end{slide}


\sectionslide{3} % Index Gateways

% Moved: Metadata Expiration

% XXX Mention expiration offhand somewhere else?

\overlays{6}{
  \begin{slide}{Index Gateways}
    \untilSlide*{3}{
      \begin{center}
        \onlySlide*{1}{\includegraphics{clipart/gateway1}}%
        \onlySlide*{2}{\includegraphics{clipart/gateway2}}%
        \onlySlide*{3}{\includegraphics{clipart/gateway3}}%
      \end{center}
      \begin{itemize}
      \item %
        \onlySlide*{1}{Each file has one metadata block, stored in
          $I(m)$ indexes.}%
        \onlySlide*{2}{Each peer sharing the file will insert the
          \emph{same} metadata block into each index.}%
        \onlySlide*{3}{Total insertion cost for $m$ metadata keywords
          and $s$ source peers: $sI(m)$ messages.}
      \end{itemize}

      \onlySlide*{3}{
        \begin{center}
          \problem{expensive and redundant}    
        \end{center}
      }
    }
    \FromSlide{4}
    \begin{itemize}
    \item Solution: aggregate updates at an \emph{index gateway}
    \item Receives metadata blocks from sources\\ and sends to indexes
      only when necessary
    \end{itemize}
    \begin{center}
      \onlySlide*{4}{\includegraphics{clipart/gateway4}}%
      \onlySlide*{5}{\includegraphics{clipart/gateway5}}%
      \onlySlide*{6}{\includegraphics{clipart/gateway6}}
      
      Insertion cost is now $s + I(m)$ (vs. $s I(m)$)!
    \end{center}
  \end{slide}
}

% Moved: Index Replication

\sectionslide{4} % Content Distribution

\begin{slide}{Direct Storage?}
  % * ...  So how do we get from file IDs to sources?
  \begin{itemize}
  \item Content is large
  \item Network has churn
    \begin{itemize}
    \item Kazaa median session length 2.4 minutes [Gummadi et
      al. 2003]
    \end{itemize}
  \end{itemize}
  \problem{DHT storage of content is impractical}
\end{slide}
\begin{slide}{Indirect Storage}
  \begin{itemize}
  \item Add indirection
  \item Store only small pointers in the network
  \end{itemize}
  \begin{align*}
    % XXX Do this sanely and remove amscd package
    \begin{CD}
      \text{keywords} \\
      @VV{\text{keyword-set index search}}V \\
      \text{file IDs} \\
      @VV{\text{}}V \\
      \text{sources}
    \end{CD}
  \end{align*}
\end{slide}

% Moved: Segmentation

\begin{slide}{Content-Sharing Subrings}
  \begin{itemize}
  \item How to identify sources for a file?
  \item Simple solution: ``tracker node''
  \item Instead, file sources form subrings
    \begin{itemize}
    \item To find source, \proc{Lookup} random ID in file's subring
    \item Search and maintenance costs same as with tracker, but
      distributed over network
    \item No single point of failure
    \end{itemize}
  \end{itemize}
  \vspace{-1em}
%   \begin{align*}
%     % XXX Do this sanely and remove amscd package
%     \begin{CD}
%       \text{keywords} \\
%       @VV{\text{keyword-set index search}}V \\
%       \text{file IDs} \\
%       @VV{\text{standard DHT lookup}}V \\
%       \text{chunk IDs} \\
%       @VV{\text{content-sharing subring}}V \\
%       \text{sources}
%     \end{CD}
%   \end{align*}
\end{slide}

% Moved: Postfetching

\begin{slide}{Postfetching}
  \begin{itemize}
  \item Indirect storage leads to unavailable content
  \item \emph{Postfetching:} actively increase the availability\\ of
    rare but popular files
    \begin{itemize}
    \item Delay metadata expiration to track request\\ for unavailable content
    \item Requesting node inserts \emph{request block}
    \item Source node later rejoins and locates\\ request block
    \item Content replicated on caches on random nodes
    \end{itemize}    
  \end{itemize}
\end{slide}

\sectionslide{5} % Conclusion

\overlays{2}{
  \begin{slide}{Conclusion}
    \begin{itemize}
    \item Supports search with distributed keyword-set index
    \item Extends standard DHT interface with\\ network-side processing
      \begin{itemize}
      \item Index-side filtering
      \item Index gateways
      \end{itemize}
    \item Content distribution with indirect storage
      \begin{itemize}
      \item Indexing via subrings
      \item Postfetching to increase availability
      \end{itemize}
      % \item The future is here!
      \onlySlide{2}{
        \vspace{1em}
      \item \textbf{Questions?}
      }
    \end{itemize}
  \end{slide}
}
\begin{slide}{}
\end{slide}

\begin{slide}{Bonus Section}
\end{slide}

\overlays{3}{
  \begin{slide}{Bonus 1 -- Segmentation}
    \onlySlide*{1}{%
      \begin{itemize}
      \item At this level, content is atomic
      \item Can't share partially downloaded content
      \end{itemize}
      \problem{Doesn't utilize upload bandwidth}
    }
    \onlySlide*{2}{%
      \begin{itemize}
      \item Split content into chunks and share on the chunk level
      \item Typical file sharing networks contain many similar files
      \end{itemize}
      \problem{Underutilization of content sources}
    }
    \onlySlide*{3}{%
      \begin{itemize}
      \item Place chunk boundaries based on content data
      \item Identify chunks by hash of their contents
      \item Files are now just a list of their constituent chunk IDs
      \end{itemize}
      \vspace{-1em}
      \begin{tiny}
        \begin{align*}
          % XXX Do this sanely and remove amscd package
          \begin{CD}
            \text{keywords} \\
            @VV{\text{keyword-set index search}}V \\
            \text{file IDs} \\
            @VV{\text{standard DHT lookup}}V \\
            \text{chunk IDs} \\
            @VV{\text{content-sharing subring}}V \\
            \text{sources}
          \end{CD}
        \end{align*}
      \end{tiny}
    }
  \end{slide}
}

\end{document}

% LocalWords:  Metadata metadata
