\documentclass[pdf,ps,corners]{prosper}
\usepackage{clrscode}
\usepackage{ifthen,keyval}
\usepackage{amsmath,amscd}
\usepackage{graphicx}

% \Logo{\includegraphics{clipart/csail-logo-notext}}
\newcommand{\csaillogo}{\includegraphics[scale=0.25]{clipart/csail-logo-notext}}
\Logo{\csaillogo}
\renewcommand{\FrameWithCorners}[1]{\PutLogo{#1}}
% Useful prosper intro: http://freshmeat.net/articles/view/667/

% Collection of solutions or improvements for general problems which
% together form a coherent content sharing system

\graphicspath{{mp/}{clipart/}}

%
% Problem/solution formatting commands
%
\newcommand{\problem}[1]{\centering \textbf{Problem:} #1}

%
% Clipart helpers
%
\newcommand{\client}[2][client]{\raisebox{-.75em}{\includegraphics{#1}}#2}
\newcommand{\tinyclient}[2][client]{\raisebox{-.2em}
  {\includegraphics[scale=.5]{#1}}#2}

%
% Title slide setup
%
\title{Arpeggio:}
\subtitle{Metadata~Searching and Content~Sharing with~Chord}
\author{Austin T. Clements, Dan R. K. Ports, David R. Karger}
\institution{MIT Computer Science and Artificial Intelligence
  Laboratory}
\email{\{aclements, drkp, karger\}@mit.edu}
\slideCaption{Arpeggio}

%
% Programmatic slides
%
\newcommand{\sectionslide}[1]{%
  \def\fmt##1##2{\ifthenelse{#1 = ##1}{\textbf{##2}}{##2}}
  \begin{slide}{Outline}
    \begin{itemize}
    \item \fmt{1}{Overview}
    \item \fmt{2}{Searching}
    \item \fmt{3}{Index Gateways}
    \item \fmt{4}{Content Distribution}
    \item \fmt{5}{Conclusion}
    \end{itemize}
  \end{slide}}

%
% Slides
%
\begin{document}

\maketitle

\sectionslide{1} % Overview

% Let's not even pretend to talk about related work. (It's not like
% any of it is important, right? In the beginning, the universe was
% without form and void, and then Arpeggio came along.)

% \begin{slide}{P2P Evolution}
%   \begin{itemize}
%   \item Objectives:
%     \begin{itemize}
%     \item Search for files matching a query
%     \item Identify nodes sharing a file
%     \end{itemize}
%   \item Centralized index systems (Napster)
%     \begin{itemize}
%     \item Scale poorly, single point of failure
%     \end{itemize}
%   \item Unstructured overlay networks (Gnutella)
%     \begin{itemize}
%     \item Queries by flooding or random walks
%     \item Large network impact
%     \item Sacrifices perfect recall
%     \end{itemize}
%   \end{itemize}
% \end{slide}

% XXX Still working here. Rearrange/combine these four slides in
% interesting ways.
\begin{slide}{The Content-Sharing Problem}
  \begin{itemize}
  \item Goals
    \begin{itemize}
      \item Find files matching a search query
      \item Identify sources for a file
      \item Want full decentralization
    \end{itemize}

  \item Assumptions
    \begin{itemize}
      \item Only searching \emph{metadata}
      \item Metadata is small (compared to actual data)
      \item Highly dynamic and unstable network topology, content, and
        sources
    \end{itemize}
  \end{itemize}
\end{slide}

\begin{slide}{DHTs (Almost) to the Rescue}
  \begin{itemize}
  \item Great for lookup-by-name
  \item Insufficient for efficient search-by-content
  \item Powerful underlying \proc{Lookup} abstraction
  \end{itemize}
  \begin{center}
    \includegraphics[width=1.5in]{chordring}
  \end{center}
\end{slide}

% \begin{slide}{Overall Architecture}
%   \begin{itemize}
%   \item Search -- Distributed keyword-set index
%   %\item Index Maintenance --
%   \item Content Distribution -- Indirect storage
%   \end{itemize}
%   % XXX
% \end{slide}

\sectionslide{2} % Searching

\def\icon#1{%
  \raisebox{-.3em}{\includegraphics[height=1.3em]{#1}}
}
\def\icona{\icon{debian}}
\def\iconb{\icon{tux}}
\def\iconc{\icon{freebsd}}

\makeatletter
\newbox\ms@spacebox
\def\maybeshow#1#2{%
  \ifthenelse{\equal{#1}{t}}%
    {#2}%
    {\setbox\ms@spacebox\hbox{#2}\hspace{\wd\ms@spacebox}}
}

\define@key{showrecords}{abig}[true]{%
  \begin{tabular}{|ll|}
    \hline
    \multicolumn{2}{|l|}{(debian, disk1, iso) \hfill \icona} \\
    \hline
    name & Debian Disk1.iso \\
    file ID & cdb79ca3db1f39b1940ed5... % 8aa98da2e7
    \\
    size & 586MB \\
    type & application/x-iso9660-image \\
    \multicolumn{1}{|c}{$\vdots$} & \\
    \hline
  \end{tabular}
}
\define@key{showrecords}{a}[true]{%
  \begin{tiny}
    \begin{tabular}{|ll|}
      \hline
      \multicolumn{2}{|l|}{(debian, disk1, iso) \hfill \icona}
      \\
      \hline
      name & Debian Disk1.iso \\
      file ID & cdb79ca3db1f3... % 9b1940ed58aa98da2e7
      \\
      \hline \multicolumn{2}{l}{}
    \end{tabular}
  \end{tiny}
}
\define@key{showrecords}{b}[true]{%
  \begin{tiny}
    \begin{tabular}{|ll|}
      \hline
      \multicolumn{2}{|l|}{(debian, disk2, iso) \hfill \iconb}
      \\
      \hline
      name & Debian Disk2.iso \\
      file ID & 5ccbf54d7e502... % 0a196353569fdbb0e5c
      \\
      \hline \multicolumn{2}{l}{}
    \end{tabular}
  \end{tiny}
}
\define@key{showrecords}{c}[true]{%
  \begin{tiny}
    \begin{tabular}{|ll|}
      \hline
      \multicolumn{2}{|l|}{(disk1, freebsd, iso) \hfill \iconc}
      \\
      \hline
      name & Freebsd Disk1.iso \\
      file ID & fbcbfdff31f27de... % 396f257e0a37a78b8
      \\
      \hline \multicolumn{2}{l}{} \\
    \end{tabular}
  \end{tiny}
}
\def\showrecords[#1]{\setkeys{showrecords}{#1}}
\makeatother

\begin{slide}{Index Entries}
  % First, let's examine what we're searching _for_.
  % * Example index entry
  % * Contains metadata
  % * Most is essentially payload
  % ** Including obtaining info
  % * Keywords from filename, but could be more complex
  \begin{center}
    \showrecords[abig]
  \end{center}
\end{slide}

\bgroup
\def\showarec{1}
\def\showa{2}
\def\showb{3}
\def\showc{4}
\def\showquerier{5}
\def\showquery{6}
\def\showboom{7}
\overlays{\showboom}{
  \begin{slide}{Distributed Inverted Indexing}
    % Now, a typical way of doing this search would be with
    % distributed inverted indexing, which looks something like this.
    % * Find nodes responsible for keywords via Chord
    \begin{minipage}{2in}
      \fromSlide{\showarec}{\showrecords[a]}
      \fromSlide{\showb}{\showrecords[b]}
      \fromSlide{\showc}{\showrecords[c]}
    \end{minipage}
    \hfill
    \begin{minipage}{2in}
      \begin{tabular}{rlll}
        \fromSlide{\showa}{%
          \fromSlide*{\showquery}{\tinyclient{\textbf{debian}}}%
          \untilSlide*{\showquerier}{\tinyclient{debian}}}
        & \fromSlide*{\showquery}
        {\psframebox[framesep=0pt,linecolor=red]
          {\fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb}}}%
        \untilSlide*{\showquerier}
        {\psframebox[framesep=0pt,linecolor=white]
          {\fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb}}} \\
        \fromSlide{\showa}{%
          \fromSlide*{\showquery}{\tinyclient{\textbf{disk1}}}%
          \untilSlide*{\showquerier}{\tinyclient{disk1}}}
        &  \fromSlide*{\showquery}
        {\psframebox[framesep=0pt,linecolor=red]
          {\fromSlide{\showa}{\icona} \fromSlide{\showc}{\iconc}}}%
        \untilSlide*{\showquerier}
        {\psframebox[framesep=0pt,linecolor=white]
          {\fromSlide{\showa}{\icona} \fromSlide{\showc}{\iconc}}} \\
        \fromSlide{\showa}{\tinyclient{iso}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb}
        \fromSlide{\showc}{\iconc} \\
        \fromSlide{\showb}{\tinyclient{disk2}}
        & \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showc}{\tinyclient{freebsd}}
        & \fromSlide{\showc}{\iconc} \\
      \end{tabular}

      \FromSlide{\showquerier}
      \begin{center}
        \untilSlide*{\showquery}{%
          \client[client-notflaming]{``debian disk1?''}}%
        \fromSlide*{\showboom}{%
          \client[client-flaming]{``debian disk1?''}} \\
        \fromSlide{\showquery}{%
          \{\icona, \iconb\} $\cap$ \{\icona, \iconc\}}
      \end{center}
    \end{minipage}

    \FromSlide{\showboom}
    \problem{Network hosage}
  \end{slide}
}
\egroup

% \begin{slide}{Distributed Indexing}
%   \begin{itemize}
%   \item Inverted index: DHT maps keywords to file IDs
%   \item To lookup ``a b'':
%     % XXX Real pseudo-code?
%     \begin{itemize}
%     \item \proc{Get} indexes for ``a'' and ``b''
%     \item Client performs intersection
%     \end{itemize}
%   \item \problem{wasteful of bandwidth -- transfers entire index}
%   \end{itemize}
% \end{slide}

\bgroup
\def\showquerier{3}
\def\showquery{4}
\def\showboom{5}
\overlays{\showboom}{
  % One approach to this problem is to add network-side processing
  % * ...  Now you're all probably thinking that this is going to make
  %   the whole Internet catch on fire, but I'm going to try to
  %   convince you otherwise
  \begin{slide}{Index-Side Filtering}
    \onlySlide*{1}{%
      \begin{itemize}
      \item Keywords are small, so store keywords in index
      \item Pick one index node
      \item Send full query
      \item Index node performs filtering and returns only relevant
        results
      \item Can also include other \emph{filterable metadata}, e.g. file
        size, MP3 bitrate, etc.
      \end{itemize}
    }
    \FromSlide{2}
    \begin{minipage}{2in}
      \showrecords[a,b,c]
    \end{minipage}
    \hfill
    \begin{minipage}{2in}
      \begin{tabular}{rlll}
        \untilSlide*{\showquerier}{\tinyclient[client-notflaming]{debian}}%
        \onlySlide*{\showquery}{\tinyclient[client-notflaming]{\textbf{debian}}}%
        \fromSlide*{\showboom}{\tinyclient[client-flaming]{\textbf{debian}}}
        & \fromSlide*{\showquery}
        {\psframebox[framesep=0pt,linecolor=red]{\icona} \iconb}%
        \untilSlide*{\showquerier}
        {\psframebox[framesep=0pt,linecolor=white]{\icona} \iconb} \\
        \tinyclient{disk1} & \icona \iconc \\
        \tinyclient{iso} & \icona \iconb \iconc \\
        \tinyclient{disk2} & \iconb \\
        \tinyclient{freebsd} & \iconc \\
      \end{tabular}

      \FromSlide{\showquerier}
      \begin{center}
        \client[client-notflaming]{``debian disk1?''} \\
        \fromSlide{\showquery}{\{\icona\}}
      \end{center}
    \end{minipage}

    \FromSlide{\showboom}
    \problem{Poor query load-balancing}
  \end{slide}
}
\egroup

% \begin{slide}{Index-Side Filtering}
%   \begin{itemize}
%   \item Metadata is small, so store \emph{entire} metadata in index
%   \item Send full query to index node
%   \item Index node performs filtering and returns only relevant
%     results
%   \item Can also include other \emph{filterable metadata}, e.g. file
%     size, MP3 bitrate, etc.
%   \end{itemize}
%   \problem{index nodes for popular keywords are overloaded}
% \end{slide}

\bgroup
\def\showa{2}
\def\showb{3}
\def\showc{4}
\def\showquerier{5}
\def\showquery{6}
\overlays{\showquery}{
  % One approach to this problem is to add network-side processing
  % * ...  Now you're all probably thinking that this is going to make
  %   the whole Internet catch on fire, but I'm going to try to
  %   convince you otherwise
  \begin{slide}{Keyword-Set Indexing}
    \onlySlide*{1}{%
      \begin{itemize}
      \item Build index on \emph{keyword sets} rather than keywords
      \item Store subsets of size $\le K$
      \item More keyword-set indexes, but each is shorter
      \item Single-keyword indexes are less important, so can be
        truncated
        \begin{itemize}
        \item <~29\% of web searches have only 1 keyword. [Reynolds \&
          Vahdat 2003]
        \end{itemize}
      \item To search: send filtered query to any $K$-size subset
        index
      \end{itemize}
    }
    \FromSlide{2}
    \vspace{-2em}
    \begin{minipage}{2in}
      \fromSlide{\showa}{\showrecords[a]}
      \fromSlide{\showb}{\showrecords[b]}
      \fromSlide{\showc}{\showrecords[c]}
    \end{minipage}
    \hfill
    \begin{minipage}{2in}
      \begin{tabular}{rlll}
        ($K = 2$) & \\
        \fromSlide{\showa}{\tinyclient{debian}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showa}{\tinyclient{disk1}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showc}{\iconc} \\
        \fromSlide{\showa}{\tinyclient{iso}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb}
        \fromSlide{\showc}{\iconc} \\
        \fromSlide*{\showa}{%
          \untilSlide*{\showquerier}{\tinyclient{debian disk1}}%
          \fromSlide*{\showquery}{\tinyclient{\textbf{debian disk1}}}%
        }%
        & \fromSlide{\showa}{%
          \untilSlide*{\showquerier}
          {\psframebox[framesep=0pt,linecolor=white]{\icona}}%
          \fromSlide*{\showquery}
          {\psframebox[framesep=0pt,linecolor=red]{\icona}}%
        } \\
        \fromSlide{\showa}{\tinyclient{debian iso}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showa}{\tinyclient{disk1 iso}}
        & \fromSlide{\showa}{\icona} \fromSlide{\showc}{\iconc} \\
        \fromSlide{\showb}{\tinyclient{disk2}}
        & \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showb}{\tinyclient{debian disk2}}
        & \fromSlide{\showb}{\iconb} \\
        \fromSlide{\showb}{\tinyclient{disk2 iso}}
        & \fromSlide{\showb}{\iconb} \\
        % Bork!
        \untilSlide{0}{\tinyclient{\textbf{debian disk1}}}%
        \fromSlide{\showc}{$\vdots$}
      \end{tabular}
    
      \FromSlide{\showquerier}
      \begin{center}
        \client{``debian disk1?''}
      \end{center}
    \end{minipage}
  \end{slide}
}
\egroup

% \begin{slide}{Keyword-Set Indexing}
%   \begin{itemize}
%   \item Build index on \emph{keyword sets} rather than keywords
%   \item Store metadata block in index for each subset of size $\le K$
%   \item More keyword-set indexes, but each is shorter
%   \item Single-keyword indexes are less important, so can be truncated
%     \begin{itemize}
%     \item <~29\% of web searches have only 1 keyword. [Reynolds \&
%       Vahdat 2003]
%     \end{itemize}
%   \item To search: send filtered query to any $K$-size subset index
%   \end{itemize}
% \end{slide}

\newcommand{\Inastiness}{%
    \sum_{i = 1}^K \binom{m}{i} =
    \begin{cases}
      2^m - 1 & \text{if $m \le K$} \\
      O(m^K) & \text{if $m > K$}
    \end{cases}%
}
\overlays{3}{%
  \begin{slide}{Indexing Cost}
    \onlySlide*{1}{%
      \bgroup
      \setlength{\abovedisplayskip}{0pt}
      \begin{align*}
        m & = \text{metadata keywords} \\
        K & = \text{maximum subset size parameter} \\
        I(m) & = \text{index entries} \\
        & = \Inastiness{}
      \end{align*}
      \egroup
    }
    \FromSlide{2}
    \bgroup
    \setlength{\abovedisplayskip}{0pt}
    \begin{align*}
      I(m) & = \Inastiness{}
    \end{align*}
    \egroup
    \onlySlide*{2}{%
      \begin{center}
        \input{figures/egraph2d.tex}
      \end{center}}
    \onlySlide*{3}{%
      \begin{center}
        For files with many metadata keywords, $I(m)$ is polynomial in
        $m$.
      \end{center}}
  \end{slide}
}

\begin{slide}{Storage Costs (FreeDB)}
  \begin{center}
    \begin{tabular}{lr}
%      Number of discs & 1,582,836 \\
      Number of songs & 21,195,244 \\
      Total index entries ($K=1$) & 134,403,379 \\
      Index entries per song ($K=1$) & 6.274406 \\
      \hline
      Total index entries ($K=3$) & 1,494,688,373 \\
      Index entries per song ($K=3$) & 66.078093
    \end{tabular} \\
    \vspace{1em}
    $\Rightarrow$ Total storage cost only an order of magnitude more
    than required for $K=1$ inverted index.
    % XXX Compare against size of music data (deduce from track
    % lengths)
  \end{center}
\end{slide}

\begin{slide}{Choosing $K$}
  \begin{center}
    \begin{itemize}
    \item Larger $K$ improves query load distribution, increases
      indexing costs
    \end{itemize}
    \begin{center}
      \input{figures/index-size-increase-vs-k}
    \end{center}
    \vspace{-1em}
    \begin{center}
      For web searches: average query length 2.53
    \end{center}
  \end{center}
\end{slide}

\sectionslide{3} % Index Gateways

% Moved: Metadata Expiration

% XXX Mention expiration offhand somewhere else?

\overlays{6}{
  \begin{slide}{Index Gateways}
    \untilSlide*{3}{
      \begin{center}
        \onlySlide*{1}{\includegraphics{clipart/gateway1}}%
        \onlySlide*{2}{\includegraphics{clipart/gateway2}}%
        \onlySlide*{3}{\includegraphics{clipart/gateway3}}%
      \end{center}
      \begin{itemize}
      \item %
        \onlySlide*{1}{Each file has one metadata block, stored in
          $I(m)$ indexes.}%
        \onlySlide*{2}{Each peer sharing the file will insert the
          \emph{same} metadata block into each index.}%
        \onlySlide*{3}{Total insertion cost for $m$ metadata keywords
          and $s$ source peers: $sI(m)$ messages.}
      \end{itemize}

      \onlySlide*{3}{
        \begin{center}
          \problem{expensive and redundant}    
        \end{center}
      }
    }
    \FromSlide{4}
    \begin{itemize}
    \item Solution: aggregate updates at an \emph{index gateway}
    \item Receives metadata blocks from sources and sends to indexes
      only when necessary
    \end{itemize}
    \begin{center}
      \onlySlide*{4}{\includegraphics{clipart/gateway4}}%
      \onlySlide*{5}{\includegraphics{clipart/gateway5}}%
      \onlySlide*{6}{\includegraphics{clipart/gateway6}}
      
      Insertion cost is now $s + I(m)$ (vs. $s I(m)$)!
    \end{center}
  \end{slide}
}

% Moved: Index Replication

\sectionslide{4} % Content Distribution

\begin{slide}{Direct Storage?}
  % * ...  So how do we get from file IDs to sources?
  \begin{itemize}
  \item Content is large
  \item Network has churn
    \begin{itemize}
    \item Kazaa median session length 2.4 minutes [Gummadi et
      al. 2003]
    \end{itemize}
  \end{itemize}
  \problem{DHT storage of content is impractical}
\end{slide}
\begin{slide}{Indirect Storage}
  \begin{itemize}
  \item Add indirection
  \item Store only small pointers in the network
  \end{itemize}
  \begin{align*}
    % XXX Do this sanely and remove amscd package
    \begin{CD}
      \text{keywords} \\
      @VV{\text{keyword-set index search}}V \\
      \text{file IDs} \\
      @VV{\text{}}V \\
      \text{sources}
    \end{CD}
  \end{align*}
\end{slide}

% Moved: Segmentation

\begin{slide}{Content-Sharing Subrings}
  \begin{itemize}
  \item How to identify sources for a file?
  \item Simple solution: ``tracker node''
  \item Instead, file sources form subrings
    \begin{itemize}
    \item To find source, \proc{Lookup} random ID in file's subring
    \item Search and maintenance costs same as with tracker, but
      distributed over network
    \item No single point of failure
    \end{itemize}
  \end{itemize}
  \vspace{-1em}
%   \begin{align*}
%     % XXX Do this sanely and remove amscd package
%     \begin{CD}
%       \text{keywords} \\
%       @VV{\text{keyword-set index search}}V \\
%       \text{file IDs} \\
%       @VV{\text{standard DHT lookup}}V \\
%       \text{chunk IDs} \\
%       @VV{\text{content-sharing subring}}V \\
%       \text{sources}
%     \end{CD}
%   \end{align*}
\end{slide}

% Moved: Postfetching

\sectionslide{5} % Conclusion

\begin{slide}{Conclusion}
  \begin{itemize}
  \item Supports search with distributed keyword-set index
  \item Extends standard DHT interface with network-side processing
    \begin{itemize}
    \item Index-side filtering
    \item Index gateways
    \end{itemize}
  \item Content distribution with indirect storage
    \begin{itemize}
    \item Indexing via subrings
%    \item Postfetching to increase availability
    \end{itemize}
%  \item The future is here!
  \end{itemize}
\end{slide}

\begin{slide}{}
\end{slide}

\begin{slide}{Bonus Section}
\end{slide}

\begin{slide}{Bonus 1 -- Postfetching}
  \begin{itemize}
  \item Indirect storage leads to unavailable content
  \item Long metadata expiration leads to visible, unavailable content
  \item Insert request blocks into the network
  \item When source node rejoins and locates request blocks actively
    push requested content into the caches of randomly-selected nodes.
  \end{itemize}
\end{slide}


\begin{slide}{Bonus 2 -- Metadata Expiration}
  \begin{itemize}
  \item File availability constantly changes as peers join/leave
  \item Expiration rather than polling
  \item Peers must be able to handle references to unavailable files
  \item Long expiration times to track access attempts for unavailable
    files
  \end{itemize}
\end{slide}

\begin{slide}{Bonus 3 -- Index Replication}
  \begin{itemize}
  \item Replication instead of erasure coding
  \item Only weak consistency required
  \item Updates propagated periodically
  \item Expirations performed independently
  \end{itemize}
  % XXX Is there more to say?
\end{slide}

\overlays{3}{
  \begin{slide}{Bonus 4 -- Segmentation}
    \onlySlide*{1}{%
      \begin{itemize}
      \item At this level, content is atomic
      \item Can't share partially downloaded content
      \end{itemize}
      \problem{Doesn't utilize upload bandwidth}
    }
    \onlySlide*{2}{%
      \begin{itemize}
      \item Split content into chunks and share on the chunk level
      \item Typical file sharing networks contain many similar files
      \end{itemize}
      \problem{Underutilization of content sources}
    }
    \onlySlide*{3}{%
      \begin{itemize}
      \item Place chunk boundaries based on content data
      \item Identify chunks by hash of their contents
      \item Files are now just a list of their constituent chunk IDs
      \end{itemize}
      \vspace{-1em}
      \begin{tiny}
        \begin{align*}
          % XXX Do this sanely and remove amscd package
          \begin{CD}
            \text{keywords} \\
            @VV{\text{keyword-set index search}}V \\
            \text{file IDs} \\
            @VV{\text{standard DHT lookup}}V \\
            \text{chunk IDs} \\
            @VV{\text{content-sharing subring}}V \\
            \text{sources}
          \end{CD}
        \end{align*}
      \end{tiny}
    }
  \end{slide}
}

\end{document}

% LocalWords:  Metadata metadata