From de4c390f70451916f28b8498e247eb8bd2b02bef Mon Sep 17 00:00:00 2001
From: Thomas Forgione
Date: Wed, 27 Nov 2019 22:37:49 +0100
Subject: [PATCH] Updates

---
 src/conclusion/contributions.tex          |  2 +-
 src/dash-3d/client.tex                    | 21 +++++++++++----------
 src/dash-3d/conclusion.tex                |  2 +-
 src/dash-3d/content-preparation.tex       | 12 ++++++------
 src/dash-3d/evaluation.tex                |  8 ++++----
 src/dash-3d/introduction.tex              |  2 +-
 src/preliminary-work/bookmarks-impact.tex | 12 ++++++------
 src/preliminary-work/streaming.tex        |  7 ++++---
 src/state-of-the-art/3d-interaction.tex   |  6 +++---
 src/state-of-the-art/3d-streaming.tex     |  2 +-
 src/state-of-the-art/video.tex            | 22 +++++++++++-----------
 src/system-bookmarks/bookmark.tex         |  9 +++++----
 src/system-bookmarks/main.tex             |  2 +-
 13 files changed, 55 insertions(+), 52 deletions(-)

diff --git a/src/conclusion/contributions.tex b/src/conclusion/contributions.tex
index 6011824..7ebad33 100644
--- a/src/conclusion/contributions.tex
+++ b/src/conclusion/contributions.tex
@@ -14,7 +14,7 @@ This work has been published at the ACM MMSys conference in 2016~\citep{bookmark
 \paragraph{}
 After studying the interactive aspect of 3D navigation, we proposed a contribution focusing on the streaming aspect of such a system.
 The objective of this contribution was to introduce a system able to perform \textbf{scalable, view-dependent 3D streaming}.
-This new framework brings many improvements upon the basic system described in our first contribution: support for texture, externaliration of necessary computations from the server to the clients, support for multi-resolution textures, rendering performances considerations.
+This new framework brings many improvements upon the basic system described in our first contribution: support for textures, externalisation of necessary computations from the server to the clients, support for multi-resolution textures, and rendering performance considerations.
 We drew massive inspiration from the DASH technology, a standard for video streaming used for its scalability and its adaptability.
 We exploit the fact that DASH is made to be content-agnostic to fit 3D content into its structure.
 Following the path set by DASH-SRD, we propose to tile 3D content using a tree and encode this partition into a description file (MPD) to allow view-dependent streaming, without the need for computation on the server side.
diff --git a/src/dash-3d/client.tex b/src/dash-3d/client.tex
index c599fc2..6cb74ad 100644
--- a/src/dash-3d/client.tex
+++ b/src/dash-3d/client.tex
@@ -1,6 +1,6 @@
 \section{Client\label{d3:dash-client}}

-In this section, we specify a DASH NVE client that exploits the preparation of the 3D content in an NVE for streaming.
+In this section, we specify a DASH NVE client which exploits the preparation of the 3D content in an NVE for streaming.
 The generated MPD file describes the content organization so that the client gets all the necessary information to make educated decisions and query the 3D content it needs according to the available resources and current viewpoint.

 A camera path generated by a particular user is a set of viewpoints $v(t_i)$ indexed by a continuous time parameter $t_i \in [t_1,t_{end}]$.
@@ -96,8 +96,8 @@ These parameters are stored in the MPD file.
 First, for each geometry segment $s^G$ there is a predetermined 3D area $\mathcal{A}_{3D}(s^G)$, equal to the sum of all triangle areas in this segment (in 3D); it is computed as the segments are created.
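+As an illustration, this per-segment area is a plain sum of triangle areas.
+A minimal Rust sketch follows (our preprocessing is written in Rust, but the
+function names here are illustrative, not those of our actual implementation):
+\begin{lstlisting}
+/// Area of the triangle (a, b, c): half the norm of the cross product of two edges.
+fn triangle_area(a: [f64; 3], b: [f64; 3], c: [f64; 3]) -> f64 {
+    let u = [b[0] - a[0], b[1] - a[1], b[2] - a[2]];
+    let v = [c[0] - a[0], c[1] - a[1], c[2] - a[2]];
+    let cross = [
+        u[1] * v[2] - u[2] * v[1],
+        u[2] * v[0] - u[0] * v[2],
+        u[0] * v[1] - u[1] * v[0],
+    ];
+    0.5 * (cross[0] * cross[0] + cross[1] * cross[1] + cross[2] * cross[2]).sqrt()
+}
+
+/// 3D area of a geometry segment: the sum of the areas of its triangles.
+fn segment_area(triangles: &[[[f64; 3]; 3]]) -> f64 {
+    triangles.iter().map(|t| triangle_area(t[0], t[1], t[2])).sum()
+}
+\end{lstlisting}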
 Note that the texture segments have similar information, but computed at \textit{navigation time} $t_i$.
 The second piece of information stored in the MPD for all segments, geometry, and texture, is the size of the segment (in kB).
-Indeed, geometry segments have close to a similar number of faces; their size is almost uniform.
-For texture segments, the size is usually much smaller than the geometry segments but also varies a lot, as between two successive resolutions the number of pixels is divided by 4.
+\update{Indeed, geometry segments have a similar number of faces; their size is almost uniform.
+For texture segments, the size is usually much smaller than the geometry segments but also varies a lot, as between two successive resolutions the number of pixels is divided by 4.}{}
 Finally, for each texture segment $s^{T}$, the MPD stores the \textit{MSE} (mean square error) of the image and resolution, relative to the highest resolution (by default, triangles are filled with the average color of their texture).

 Offline parameters are stored in the MPD as shown in Listing~\ref{d3:mpd}.
@@ -162,6 +162,7 @@ Algorithm~\ref{d3:next-segment} details how our DASH client makes decisions.
   \SetKwData{Bw}{bw\_estimation}
   \SetKwData{Rtt}{rtt\_estimation}
   \SetKwData{Segment}{best\_segment}
+  \SetKwData{CurrentSegment}{segment}
   \SetKwData{Candidates}{candidates}
   \SetKwData{AllSegments}{all\_segments}
   \SetKwData{DownloadedSegments}{downloaded\_segments}
@@ -171,20 +172,20 @@ Algorithm~\ref{d3:next-segment} details how our DASH client makes decisions.
   \SetKwFunction{EstimateNetwork}{estimate\_network\_parameters}
   \SetKwFunction{Append}{append}

-  \Input{Current index $i$, time $t_i$, viewpoint $v(t_i)$, buffer of already downloaded \texttt{segments} $\mathcal{B}_i$, MPD}
+  \Input{Current index $i$, time $t_i$, viewpoint $v(t_i)$, buffer of already downloaded \texttt{segments} $\mathcal{B}_i$, MPD, utility metric $\mathcal{U}$, streaming policy $\Omega$}
   \Output{Next segment $s^{*}_i$ to request, updated buffer $\mathcal{B}_{i+1}$}

   \BlankLine{}
   (\Bw, \Rtt) \leftarrow{} \EstimateNetwork{}\;
   \BlankLine{}
-  \Candidates\leftarrow{} \AllSegments\newline\makebox[1cm]{}.\Filter{$\Segment\rightarrow\Segment\notin\DownloadedSegments$}\newline\makebox[1cm]{}.\Filter{$\Segment\rightarrow\Segment\in\Frustum$}\;
+  \Candidates\leftarrow{} \AllSegments\newline\makebox[1cm]{}.\Filter{$\CurrentSegment\rightarrow\CurrentSegment\notin\DownloadedSegments$}\newline\makebox[1cm]{}.\Filter{$\CurrentSegment\rightarrow\CurrentSegment\in\Frustum$}\;
   \BlankLine{}
-  \Segment\leftarrow{} \Argmax{\Candidates, \Segment\rightarrow{} $\Omega\left(\mathcal{U}(\Segment)\right)$}\;
+  \Segment\leftarrow{} \Argmax{\Candidates, \CurrentSegment\rightarrow{} $\Omega\left(\mathcal{U},\CurrentSegment\right)$}\;
   \DownloadedSegments.\Append{\Segment}\;

 {\caption{Algorithm to identify the next segment to query\label{d3:next-segment}}}
 \end{algorithm}

-The most naive way to sequentially optimize the $\mathcal{U}$ is to limit the decision-making to the current viewpoint $v(t_i)$.
+A naive way to sequentially optimize the utility $\mathcal{U}$ is to limit the decision-making to the current viewpoint $v(t_i)$.
 In that case, the best segment $s$ to request would be the one maximizing $\mathcal{U}(s, v(t_i))$ to simply make a better rendering from the current viewpoint $v(t_i)$.
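+To make this naive policy concrete, here is a minimal Rust sketch of the greedy
+choice (all types and helpers below are illustrative stubs, not our actual
+client, which runs in JavaScript):
+\begin{lstlisting}
+use std::collections::HashSet;
+
+struct Viewpoint;            // camera position and orientation (stub)
+struct Segment { id: u64 }   // one geometry or texture segment (stub)
+
+impl Viewpoint {
+    /// Whether a segment intersects the viewing frustum (stub).
+    fn sees(&self, _s: &Segment) -> bool { true }
+}
+
+/// Utility of a segment for the current viewpoint, i.e. U(s, v(t_i)) (stub).
+fn utility(_s: &Segment, _v: &Viewpoint) -> f64 { 0.0 }
+
+/// Greedy choice: among segments not yet downloaded and inside the frustum,
+/// pick the one maximizing the utility for the current viewpoint.
+fn next_segment<'a>(
+    all: &'a [Segment],
+    downloaded: &HashSet<u64>,
+    v: &Viewpoint,
+) -> Option<&'a Segment> {
+    all.iter()
+        .filter(|s| !downloaded.contains(&s.id))
+        .filter(|s| v.sees(s))
+        // utilities are assumed finite, so partial_cmp never fails
+        .max_by(|a, b| utility(a, v).partial_cmp(&utility(b, v)).unwrap())
+}
+\end{lstlisting}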
 Due to transmission delay, however, this segment will only be delivered at time $t_{i+1}=t_{i+1}(s)$, depending on the segment size and network conditions:
 \begin{equation*} t_{i+1}(s)=t_i+\frac{\mathtt{size}(s)}{\widehat{BW_i}} + \widehat{\tau_i}\label{d3:eq2}\end{equation*}
@@ -342,13 +343,13 @@ The \texttt{DashLoader} class accepts as parameter a function that will be calle

 \subsubsection{Performance}

-In JavaScript, there is no way of doing parallel computing without using \emph{web workers}.
+\update{In JavaScript, there is no way of doing parallel computing without using \emph{web workers}.}{JavaScript requires the use of \emph{web workers} to perform parallel computing.}
 A web worker is a script in JavaScript that runs in the background, on a separate thread, and that can communicate with the main script by sending and receiving messages.
 Since our system has many tasks to perform, it is natural to use workers to manage the streaming without impacting the framerate of the renderer.

 However, what a worker can do is very limited, since it cannot access the variables of the main script.
-Because of this, we are forced to run the renderer on the main script, where it can access the HTML page, and we move all the other tasks (i.e. the access client, the control engine and the segment parsers) to the worker.
+Because of this, we are forced to run the renderer on the main script, where it can access the HTML page, and we move all the other tasks (i.e.\ the access client, the control engine and the segment parsers) to the worker.
 Since the main script is the only thread communicating with the GPU, it will still have to update the model with the parsed content it receives from the worker.
-Using a worker does not so much improve the framerate of the system, but it reduces the latency that occurs when receiving a new segment, which can be very frustrating since in a single thread scenario, each time a segment is received, the interface freezes for around half a second.
+\update{Using a worker does not so much improve the framerate of the system, but it reduces}{We do not use web workers to improve the framerate of the system, but to reduce} the latency that occurs when receiving a new segment, which can be frustrating since, in a single-threaded scenario, each time a segment is received, the interface freezes for around half a second.
 A sequence diagram of what happens when downloading, parsing and rendering content is shown in Figure~\ref{d3:sequence}.

 \begin{figure}[ht]
diff --git a/src/dash-3d/conclusion.tex b/src/dash-3d/conclusion.tex
index f26c024..cb44144 100644
--- a/src/dash-3d/conclusion.tex
+++ b/src/dash-3d/conclusion.tex
@@ -4,7 +4,7 @@ Our work in this chapter started with the question: can DASH be used for NVE\@?
 The answer is \emph{yes}.
 In answering this question, we contributed by showing how to organize a polygon soup and its textures into a DASH-compliant format that (i) includes a minimal amount of metadata that is useful for the client, and (ii) organizes the data to allow the client to get the most useful content first.
 We further show that the data organisation and its description with metadata (precomputed offline) is sufficient to design and build a DASH client that is adaptive --- it selectively downloads segments within its view, makes intelligent decisions about what to download, and balances between geometry and texture while adapting to network bandwidth.
-This way, our system addresses the open problems we mentioned in~\ref{i:challenges}.
+This way, our system addresses the open problems we mentioned in Section~\ref{i:challenges}.
 \begin{itemize}
     \item \textbf{It prepares and structures the content in a way that enables streaming}: all this preparation is precomputed, and all the content is structured according to the DASH framework, geometry but also materials and textures.
         Furthermore, textures are prepared in a multi-resolution manner, and even though multi-resolution geometry is not discussed here, the difficulty of integrating it into this system seems moderate: we could encode levels of detail in different representations and define a utility metric for each representation, and the system should adapt naturally.
diff --git a/src/dash-3d/content-preparation.tex b/src/dash-3d/content-preparation.tex
index d253219..b74f4e7 100644
--- a/src/dash-3d/content-preparation.tex
+++ b/src/dash-3d/content-preparation.tex
@@ -5,17 +5,17 @@ In our work, we use the \texttt{obj} file format for the polygons, \texttt{png}
 The process, however, applies to other formats as well.

 \subsection{The MPD File}
-In DASH, the information about content storage and characteristics, such as location, resolution, or size, are extracted from an MPD file by the client.
-The client relies only on these information to decide which chunk to request and at which quality level.
+In DASH, the information about content storage and characteristics, such as location, resolution, or size, is extracted from an MPD file by the client.
+The client relies only on this information to decide which chunk to request and at which quality level.
 The MPD file is an XML file that is organized into different sections hierarchically.
 The \texttt{period} element is a top-level element, which, in the case of video, indicates the start time and length of a video chapter.
 This element does not apply to NVE, and we use a single \texttt{period} for the whole scene, as the scene is static.

 Each \texttt{period} element contains one or more adaptation sets, which describe the alternate versions, formats, and types of media.
 We utilize adaptation sets to organize a 3D scene's material, geometry, and texture.

-The piece of software that does the preprocessing of the model mostly consists in file manipulation and is written is Rust as well.
+The piece of software that does the preprocessing of the model consists in file manipulation and is written in Rust as well.
 It successively preprocesses the geometry and then the textures.
-The MPD is generated by a library named \href{https://github.com/netvl/xml-rs}{xml-rs} that works like a stack:
+The MPD is generated by a library named \href{https://github.com/netvl/xml-rs}{xml-rs} which works like a stack:
 \begin{itemize}
     \item a structure is created at the root of the MPD file;
     \item the \texttt{start\_element} method creates a new child in the XML file;
@@ -33,7 +33,7 @@ A face belongs to a cell if its barycenter falls inside the corresponding boundi
 Each cell corresponds to an adaptation set.
 Thus, geometry information is spread across adaptation sets based on spatial coherence, allowing the client to download the relevant faces selectively.
 A cell is relevant if it intersects the frustum of the client's current viewpoint.
 Figure~\ref{d3:big-picture} shows the relevant cells in green.
-As our 3D content, a virtual environment, is biased to spread the most along the horizontal plane, we alternate between splitting between the two horizontal directions.
+As our 3D content, a virtual environment, tends to spread mostly along the horizontal plane, we alternate the splitting between the two horizontal directions.
 We create a separate adaptation set for large faces (e.g., the sky or ground) because they are essential to the 3D model and do not fit into cells.
 We consider a face to be large if its area in 3D is more than $a+3\sigma$, where $a$ and $\sigma$ are the average and the standard deviation of the 3D areas of faces, respectively.
@@ -82,7 +82,7 @@ Figure~\ref{d3:textures} illustrates the use of the textures against the renderi

 \subsection{Segments}
 To allow random access to the content within an adaptation set storing geometry data, we group the faces into segments.
-Each segment is then stored as a \texttt{.obj} file that can be individually requested by the client.
+Each segment is then stored as a \texttt{.obj} file which can be individually requested by the client.
 For geometry, we partition the faces in an adaptation set into sets of $N_s$ faces, by first sorting the faces by their area in 3D space in descending order, and then placing each successive $N_s$ faces into a segment.
 Thus, the first segment contains the biggest faces and the last one the smallest.
 In addition to the selected faces, a segment stores all face vertices and attributes so that each segment is independent.
diff --git a/src/dash-3d/evaluation.tex b/src/dash-3d/evaluation.tex
index a6eb18e..6f629ef 100644
--- a/src/dash-3d/evaluation.tex
+++ b/src/dash-3d/evaluation.tex
@@ -27,13 +27,13 @@ We partition the geometry into a k-$d$ tree until the leaves have fewer than 10000
 \end{table}

 \subsubsection{User Navigations}
-To evaluate our system, we collected realistic user navigation traces that we can replay in our experiments.
+To evaluate our system, we collected realistic user navigation traces which we can replay in our experiments.
 We presented six users with a web interface, on which the model was loaded progressively while the user interacted with it.
 The available interactions were inspired by traditional first-person interactions in video games, i.e., W, A, S, and D keys to translate the camera, and mouse to rotate the camera.
 We asked users to browse and explore the scene until they felt they had visited all important regions.
 We then asked them to produce camera navigation paths that would best present the 3D scene to a user who would discover it.
 To record a path, the users first place their camera at their preferred starting point, then click on a button to start recording.
-Every 100ms, the position, viewing angle of the camera and look-at point are saved into an array that will then be exported into JSON format.
+Every 100 ms, the position, viewing angle of the camera, and look-at point are saved into an array which will then be exported into JSON format.
 The recorded camera trace allows us to replay each camera path to perform our simulations and evaluate our system.
 We collected 13 camera paths this way.
@@ -166,7 +166,7 @@ An online-only utility improves the results, as it takes the user viewing frustu
 \end{figure}

 Figure~\ref{d3:sorting} shows the effect of grouping the segments in an adaptation set based on their area in 3D.
-Clearly, the PSNR significantly improves when the 3D area of faces is considered for creating the segments. Since all segments are of the same size, sorting the faces by area before grouping them into segments leads to a skew distribution of how useful the segments are.
This skewness means that the decision that the client makes (to download those with the largest utility first) can make a bigger difference in the quality.
+The PSNR significantly improves when the 3D area of faces is considered for creating the segments. Since all segments are of the same size, sorting the faces by area before grouping them into segments leads to a skewed distribution of how useful the segments are. This skewness means that the decision that the client makes (to download those with the largest utility first) can make a bigger difference in the quality.

 We also compared the greedy vs.\ proposed streaming policy (as shown in Figure~\ref{d3:greedy-weakness}) for limited bandwidth (5 Mbps).
 The proposed scheme outperforms the greedy policy during the first 30s and does a better job overall.
 In the first 30 seconds, since relatively little 3D content has been downloaded, making good decisions is crucial.

 Table~\ref{d3:percentages} shows the distribution of texture resolutions that are downloaded by greedy and our Proposed scheme, at different bandwidths.
 Resolution 5 is the highest and 1 is the lowest.
-The table clearly shows a weakness of the greedy policy: as the bandwidth increases, the distribution of downloaded textures resolution stays more or less the same.
+The table shows a weakness of the greedy policy: \update{as the bandwidth increases, the distribution of downloaded textures resolution stays more or less the same.}{the distribution of downloaded texture resolutions does not adapt to the bandwidth.}
 In contrast, our proposed streaming policy adapts to an increasing bandwidth by downloading higher resolution textures (13.9\% at 10 Mbps, vs.\ 0.3\% at 2.5 Mbps).
 In fact, an interesting feature of our proposed streaming policy is that it adapts the geometry-texture compromise to the bandwidth. The textures represent 57.3\% of the total amount of downloaded bytes at 2.5 Mbps, and 70.2\% at 10 Mbps.
 In other words, our system tends to favor geometry segments when the bandwidth is low, and favor texture segments when the bandwidth increases.
diff --git a/src/dash-3d/introduction.tex b/src/dash-3d/introduction.tex
index fc98535..f2388c4 100644
--- a/src/dash-3d/introduction.tex
+++ b/src/dash-3d/introduction.tex
@@ -3,7 +3,7 @@
 In this chapter, we take a little step back from interaction and propose a system with simple interactions that, however, addresses most of the open problems mentioned in Section~\ref{i:challenges}.
 We take inspiration from video streaming: working on the similarities between video streaming and 3D streaming (seen in Section~\ref{i:video-vs-3d}), we benefit from the DASH efficiency (seen in Section~\ref{sote:dash}) for streaming 3D content.
 DASH is based on content preparation and structuring, which not only helps the streaming policies but also leads to a scalable and efficient system, since it completely moves the load from the server to the clients.
-A DASH client is simply a client that downloads the structure of the content, and then, depending on its needs independently of the server, decides what to download.
+A DASH client downloads the structure of the content, and then, depending on its needs and independently of the server, decides what to download.
 In this chapter, we show how to mimic DASH video with 3D streaming, and we develop a system that keeps DASH benefits.
 Section~\ref{d3:dash-3d} describes our content preparation and metadata, and all the preprocessing that is done to our model to allow efficient streaming.
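+As a rough sketch of such a client-side loop (every helper, type, and the URL
+below are placeholders for illustration; our actual client is described in
+Section~\ref{d3:dash-client}):
+\begin{lstlisting}
+use std::collections::HashSet;
+
+struct Segment { id: u64, url: String }                     // stub
+
+fn download(_url: &str) -> Vec<u8> { Vec::new() }           // HTTP GET (stub)
+fn parse_mpd(_bytes: &[u8]) -> Vec<Segment> { Vec::new() }  // reads the MPD (stub)
+fn estimate_bandwidth() -> f64 { 0.0 }                      // from past downloads (stub)
+fn pick_segment<'a>(
+    _segments: &'a [Segment],
+    _downloaded: &HashSet<u64>,
+    _bandwidth: f64,
+) -> Option<&'a Segment> { None }                           // adaptation logic (stub)
+
+fn main() {
+    // The server is a static HTTP server: the client fetches the MPD once,
+    // then makes every further decision locally.
+    let segments = parse_mpd(&download("https://example.com/scene.mpd"));
+    let mut downloaded = HashSet::new();
+    while let Some(s) = pick_segment(&segments, &downloaded, estimate_bandwidth()) {
+        let _bytes = download(&s.url); // decode, then hand over to the renderer
+        downloaded.insert(s.id);
+    }
+}
+\end{lstlisting}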
diff --git a/src/preliminary-work/bookmarks-impact.tex b/src/preliminary-work/bookmarks-impact.tex
index 34e0c06..884dae2 100644
--- a/src/preliminary-work/bookmarks-impact.tex
+++ b/src/preliminary-work/bookmarks-impact.tex
@@ -47,18 +47,18 @@ When reaching the bookmark, the corresponding arrow or viewport is not visible a
 We now describe in detail our experimental setup and the user study that we conducted on 3D navigation.

 \subsubsection{Models}
-We use four 3D scenes (one for the tutorial and three for the actual experiments) that represent recreated scenes from a famous video game.
+We use four 3D scenes (one for the tutorial and three for the actual experiments) which represent recreated scenes from a famous video game.
 Those models are light (a few thousand triangles per model) and are sent before the experiment starts.
 We keep the models small so that users can perform the task with acceptable latency from any country using a decent Internet connection.
-Our NVE does not actually stream the 3D content for these experiments, in order to avoid unreliable conditions caused by the network bandwidth variation, which might affect how the users interact.
+Our NVE does not stream the 3D content for these experiments, in order to avoid unreliable conditions caused by the network bandwidth variation, which might affect how the users interact.

 \subsubsection{Task design}
-Since we are interested in studying how efficiently users navigate in the 3D scene, we ask our participants to complete a task that forces them to visit, at least partially, various regions in the scene.
+Since we are interested in studying how efficiently users navigate in the 3D scene, we ask our participants to complete a task which forces them to visit, at least partially, various regions in the scene.
 To this end, we hide a set of 8 coins in the scene: participants are asked to collect the coins by clicking on them.
 In order to avoid any bias due to the coins' positions, we predefined 50 possible coin locations all around the scene, and randomly select 8 out of these 50 positions each time a new participant starts the experiment.

 \subsubsection{Experiment}
-Participants are first presented with an initial screen to collect some preliminary information: age, gender, the last time they played 3D video games, and self-rated 3D gaming skills. We ask those questions because we believe that someone who is used to play 3D video games should browse the scene more easily, and thus, may not need to use our bookmarks.
+Participants are first presented with an initial screen to collect some preliminary information: age, gender, the last time they played 3D video games, and self-rated 3D gaming skills. We ask those questions because we believe that someone who is used to playing 3D video games should browse the scene more easily, and thus, may not need to use our bookmarks.
 Then, the participants go through a tutorial to learn how the UI works, and how to complete the task.
 The different interactions (keyboard navigation, mouse navigation, bookmarks interaction) are progressively introduced to participants, and the tutorial ends once the participant completes an easy version of the task.
@@ -174,7 +174,7 @@ The table shows that this higher speed is due to the bookmarks, as more than 60\
 In the previous paragraphs, we have shown how bookmarks are well perceived by users (looking at the questionnaire answers).
 We also showed that users tend to be more efficient in completing the task when they have bookmarks than when they do not.
-We can say that bookmarks have a positive effect on navigation within the 3D scene, but since users move, on average, twice as fast, it might have a negative impact on the streaming of objects to the client.
+We can say that bookmarks have a positive impact on navigation within the 3D scene, but since users move, on average, twice as fast, they might have a negative impact on the streaming of objects to the client.

 \begin{figure}[th]
 \centering
@@ -205,4 +205,4 @@ Figure~\ref{bi:triangles-curve} shows a CDF of the percentage of 3D mesh triangl
 As expected, the fact that the users can browse the scene significantly quicker with bookmarks reflects on the demand on the 3D content.
 Users need more triangles more quickly, which either leads to more demand on network bandwidth, or if the bandwidth is kept constant, leads to fewer objects being displayed.
 In the next section, we introduce experiments based on our user study traces that show how the rendering is affected by the presence of bookmarks and how to improve it.
-
+\update{}{We found no significant correlation between the performance on the task and the age of the users or their skills in video games.}
diff --git a/src/preliminary-work/streaming.tex b/src/preliminary-work/streaming.tex
index d6536d4..ed2240a 100644
--- a/src/preliminary-work/streaming.tex
+++ b/src/preliminary-work/streaming.tex
@@ -164,7 +164,7 @@ The first part is used to fetch the content from the current viewpoint, using th
 The second part is used to prefetch content from the bookmarks, according to their likelihood of being clicked next.
 We use the probabilities displayed in Figure~\ref{bi:mat1} to determine the size of each part.
 Each bookmark $B$ has a probability $p(B|B_{prev})$ of being clicked next, considering that $B_{prev}$ was the last clicked bookmark.
-We assign to each bookmark $p(B|B_{prev})/2$ of the chunk to prefetch the corresponding data.
+\update{We assign to each bookmark $p(B|B_{prev})/2$ of the chunk to prefetch the corresponding data.}{%
+We assign to each bookmark a portion of the chunk proportional to its probability of being clicked, and use that portion to prefetch the corresponding data.}
 We use the \textsf{visible} policy to determine which data should be sent for a bookmark.
 We denote this combination as \textsf{V-PP}, for Prefetching based on Prediction using \textsf{visible} policy.
@@ -300,9 +301,9 @@ More quantitatively, with a $1$ Mbps bandwidth, 3 seconds are necessary after th
 Figure~\ref{bi:click-625} shows the results of the same experiment with 0.5 Mbps bandwidth.
 Here, it takes 4 to 5 seconds to recover $85\%$ of the pixels with \textsf{culling} and \textsf{V-PP}, against 1.5 seconds for recovering $90\%$ with \textsf{V-FD}.
-Combining both strategies (\textsf{V-PP+FD} leads to the best quality.
+Combining both strategies (\textsf{V-PP+FD}) leads to the best quality.

-At 1 Mbps bandwidth, \textsf{V-PP} penalizes the quality, as the curve \textsf{V-PP-FD}) leads to a lower quality image than \textsf{V-FD} alone.
+At 1 Mbps bandwidth, \textsf{V-PP} penalizes the quality, as the curve \textsf{V-PP+FD} leads to a lower quality image than \textsf{V-FD} alone.
 This effect is even stronger when the bandwidth is set to 2 Mbps (Figure~\ref{bi:2MB}).
 Both streaming strategies based on the pre-computation of the ordering improve the image quality.
 We see here that \textsf{V-FD} has a greater impact than \textsf{V-PP}.
Here, \textsf{V-PP} may prefetch content that eventually may not be used, whereas \textsf{V-FD} only sends relevant 3D content (knowing which bookmark has just been clicked).
diff --git a/src/state-of-the-art/3d-interaction.tex b/src/state-of-the-art/3d-interaction.tex
index f719976..385562b 100644
--- a/src/state-of-the-art/3d-interaction.tex
+++ b/src/state-of-the-art/3d-interaction.tex
@@ -1,8 +1,8 @@
 \section{3D Bookmarks and Navigation Aids}

-The only use for 3D streaming is to allow users interacting with the content while it is being downloaded.
+One of the uses for 3D streaming is to allow users to interact with the content while it is being downloaded.
 However, devising an ergonomic technique for browsing 3D environments through a 2D interface is difficult.
-Controlling the viewpoint in 3D (6 DOFs) with 2D devices is not only inherently challenging but also strongly task-dependent. In their review,~\citep{interaction-3d-environment} distinguish between several types of camera movements: general movements for exploration (e.g., navigation with no explicit target), targeted movements (e.g., searching and/or examining a model in detail), specified trajectory (e.g., a cinematographic camera path), etc.
+Controlling the viewpoint in 3D (6 DOFs) with 2D devices is not only inherently challenging but also strongly task-dependent. In their review,~\citep{interaction-3d-environment} distinguish between several types of camera movements: general movements for exploration (e.g., navigation with no explicit target), targeted movements (e.g., searching and/or examining a model in detail), and specified trajectories (e.g., a cinematographic camera path).
 For each type of movement, specialized 3D interaction techniques can be designed.
 In most cases, rotating, panning, and zooming movements are required, and users are consequently forced to switch back and forth among several navigation modes, leading to interactions that are too complicated overall for a layperson.
 Navigation aids and smart widgets are required and subject to research efforts both in 3D companies (see \url{sketchfab.com}, \url{cl3ver.com} among others) and in academia, as reported below.
@@ -34,7 +34,7 @@ Such viewpoints can be either static, or dynamically adapted:~\citep{dual-mode-u
 Early 3D VRML environments \citep{browsing-3d-bookmarks} offer 3D bookmarks with animated transitions between bookmarked views.
 These transitions prevent disorientation since users see how they got there.
 Hyperlinks can also ease rapid movements between distant viewpoints and naturally support non-linear and non-continuous access to 3D content.
-Navigating with 3D hyperlinks is potentially faster, but is likely to cause disorientation, as shown by the work of~\citep{ve-hyperlinks}.
+Navigating with 3D hyperlinks is faster due to the instant motion, but can cause disorientation, as shown by the work of~\citep{ve-hyperlinks}.
 \citep{linking-behavior-ve} examine explicit landmark links as well as implicit avatar-chosen links in Second Life.
 These authors point out that linking is appreciated by users and that easing linking would likely result in a richer user experience.
 \citep{dual-mode-ui} developed the Dual-Mode User Interface (DMUI) that coordinates and links hypertext to 3D graphics in order to access information in a 3D space.
diff --git a/src/state-of-the-art/3d-streaming.tex b/src/state-of-the-art/3d-streaming.tex
index 5ccaa2d..3c9e4fd 100644
--- a/src/state-of-the-art/3d-streaming.tex
+++ b/src/state-of-the-art/3d-streaming.tex
@@ -101,7 +101,7 @@ However, users are often interested in scenes that contain multiple meshes, and
 To address those issues, the Khronos group proposed a generic format called glTF (GL Transmission Format,~\citep{gltf}) to handle all types of 3D content representations: point clouds, meshes, animated models, etc.\
 glTF is based on a JSON file, which encodes the structure of a scene of 3D objects.
 It contains a scene graph with cameras, meshes, buffers, materials, textures and animations.
-Although relevant for compression, transmission and in particular streaming, this standard does not yet consider view-dependent streaming which is required for large scene remote visualisation and that we address in our work.
+Although relevant for compression, transmission and in particular streaming, this standard does not yet consider view-dependent streaming, which is required for large scene remote visualisation and which we address in our work.

 % Zampoglou
diff --git a/src/state-of-the-art/video.tex b/src/state-of-the-art/video.tex
index 27f6e6d..6b38871 100644
--- a/src/state-of-the-art/video.tex
+++ b/src/state-of-the-art/video.tex
@@ -3,19 +3,19 @@
 Accessing a remote video through the Web has been a widely studied problem since the 1990s.
 The Real-time Transport Protocol (RTP,~\cite{rtp-std}) was an early attempt to formalize audio and video streaming.
 The protocol allowed data to be transferred unilaterally from a server to a client, and required the server to handle a separate session for each client.
 While this protocol can be useful in particular scenarios, such as video-conferencing, it cannot realistically scale to modern video streaming platforms such as YouTube or Netflix, which must serve millions of simultaneous clients.
-Because of this limitation, and while the increasing network capabilities made video streaming a more and more common practice, a new trend emerged during the 2000s. Building on the democratization of HTTP servers, many industrial actors (Apple, Microsoft, Adobe, etc.) developed HTTP streaming systems to deliver multimedia content over the network. In an effort to bring interoperability between all different actors, the MPEG group launched an initiative which eventually became a standard known as DASH, Dynamic Adaptive Streaming over HTTP.
+Because of this limitation, and as increasing network capabilities made video streaming more and more common, a new trend emerged during the 2000s. Building on the democratization of HTTP servers, many industrial actors (Apple, Microsoft, Adobe, etc.) developed HTTP streaming systems to deliver multimedia content over the network. In an effort to bring interoperability between all different actors, the MPEG group launched an initiative, which eventually became a standard known as DASH, Dynamic Adaptive Streaming over HTTP\@.

 \subsection{DASH\@: the standard for video streaming\label{sote:dash}}

 Dynamic Adaptive Streaming over HTTP (DASH), or MPEG-DASH \citep{dash-std,dash-std-2}, is now a widely deployed standard for adaptively streaming video on the Web \citep{dash-std-full}, made to be simple, scalable and interoperable.
-DASH describes guidelines to prepare and structure video content, in order to allow a great adaptability of the streaming without requiring any server side computation.
The client should be able to make good decisions on what part of the content should be downloaded, only based on an estimation of the network constraints and on the information provided in a descriptive file: the MPD.
+DASH describes guidelines to prepare and structure video content, in order to allow a great adaptability of the streaming without requiring any server side computation. The client should be able to make good decisions on what part of the content should be downloaded, based only on an estimation of the network constraints and on the information provided in a descriptive file: the MPD\@.

 \subsubsection{DASH structure}

 All the content structure is described in a Media Presentation Description (MPD) file, written in the XML format.
-This file has 4 layers: the periods, the adaptation sets, the representations and the segments.
-A MPD has a hierarchical structure, meaning that it has multiple periods, and that each period can have multiple adaptation sets, that each adaptation set can have multiple representation, and that each representation can have multiple segments.
+This file has 4 layers: the periods, the adaptation sets, the representations, and the segments.
+An MPD has a hierarchical structure, meaning it has multiple periods, and each period can have multiple adaptation sets, each adaptation set can have multiple representations, and each representation can have multiple segments.

 \paragraph{Periods.}
 Periods are used to delimit content depending on time.
@@ -28,13 +28,13 @@ In videos, most of the time, each period has at least one adaptation set contain
 It may also have an adaptation set for subtitles.

 \paragraph{Representations.}
-The representation level is the level DASH uses to offer the same content at different levels of resolution.
-For example, an adaptation set containing images has a representation for each available resolution (it might be 480p, 720p, 1080p, etc.).
-This allows a user to choose its representation and change it during the video, but most importantly, since the software is able to estimate its downloading speed based on the time it took to download data in the past, it is able to find the optimal resolution, being the highest resolution that the client can request without stalling.
+The representation level is the level DASH uses to offer the same content at different levels of quality.
+For example, an adaptation set containing images has a representation for each available quality (it might be 480p, 720p, 1080p, etc.).
+This allows a user to choose a representation and change it during the video, but most importantly, since the software is able to estimate its downloading speed based on the time it took to download data in the past, it is able to find the optimal representation, i.e., the highest quality that the client can request without stalling.

 \paragraph{Segments.}
 Until this level in the MPD, content has been divided but it is still far from being sufficiently divided to be streamed efficiently.
-In fact, a representation of the images of a chapter of a movie is still a long video, and keeping such a big file is not possible since heavy files prevent streaming adaptability: if the user requests to change the level of resolution of a video, the system would either have to wait until the file is totally downloaded, or cancel the request, making all the progress done unusable.
+A representation of the images of a chapter of a movie is still a long video, and keeping such a big file is not possible since heavy files prevent streaming adaptability: if the user requests to change the quality of a video, the system would either have to wait until the file is totally downloaded, or cancel the request, wasting all the progress already made.
 Segments are used to prevent this issue.
 They typically encode files that contain two to ten seconds of video, and give the software a greater ability to dynamically adapt to the system.
@@ -42,8 +42,8 @@ If a user wants to seek somewhere else in the video, only one segment of data is

 \subsubsection{Content preparation and server}

-Encoding a video in DASH format consists in partitioning the content into periods, adaptation sets, representations and segments as explained above, and generating a Media Presentation Description file (MPD) that describes this organisation.
-Once the data is prepared, it can simply be hosted on a static HTTP server that does no computation other than serving files when it receives requests.
+Encoding a video in DASH format consists in partitioning the content into periods, adaptation sets, representations and segments as explained above, and generating a Media Presentation Description file (MPD) which describes this organisation.
+Once the data is prepared, it can simply be hosted on a static HTTP server which does no computation other than serving files when it receives requests.
 All the intelligence and the decision-making are moved to the client side.
 This is one of DASH's strengths: no powerful server is required, and since static HTTP servers are stable and efficient, all DASH clients can benefit from them.
@@ -51,7 +51,7 @@ This is one of the DASH strengths: no powerful server is required, and since sta
 \subsubsection{Client side adaptation}

 A client typically starts by downloading the MPD file, and then proceeds to download segments from the different adaptation sets.
 While the standard describes well how to structure content on the server side, the client may be freely implemented to take into account the specificities of a given application.
-The most important part of any implementation of a DASH client is called the adaptation logic. This component takes into account a set of parameters, such as network conditions (bandwidth, throughput, for example), buffer states or segments size to derive a decision on which segments should be downloaded next. Most of the industrial actors have of course their own adaptation logic, and many more have been proposed in the literature. A thorough review is beyond the scope of this state-of-the-art, but examples include \citep{chiariotti2016online} who formulate the problem in a reinforcement learning framework, \citep{yadav2017quetra} who formulate the problem using Queuing theory, or \citep{huang2019hindsight} who use a formulation derived from the Knapsack problem.
+The most important part of any implementation of a DASH client is called the adaptation logic. This component takes into account a set of parameters, such as network conditions (bandwidth, throughput, for example), buffer states, or segment sizes to derive a decision on which segments should be downloaded next. Most of the industrial actors have their own adaptation logic, and many more have been proposed in the literature.
A thorough review is beyond the scope of this state-of-the-art, but examples include \citep{chiariotti2016online}, who formulate the problem in a reinforcement learning framework, \citep{yadav2017quetra}, who formulate the problem using queuing theory, or \citep{huang2019hindsight}, who use a formulation derived from the knapsack problem.

 \subsection{DASH-SRD}
 Now widely adopted in the context of video streaming, DASH has also been adapted to various other contexts.
diff --git a/src/system-bookmarks/bookmark.tex b/src/system-bookmarks/bookmark.tex
index 5f5325a..5b52c86 100644
--- a/src/system-bookmarks/bookmark.tex
+++ b/src/system-bookmarks/bookmark.tex
@@ -24,7 +24,7 @@ We use the gyroscope to enable a user to rotate his device to rotate the virtual
 We also add the possibility to rotate the camera by using touch controls.
 The user can touch a part of the screen to get hold of the virtual camera, and drag the camera direction along the two screen axes. This way, the user is not forced to perform a real-world half-turn to be able to look behind, or to point the device towards the sky (which can quickly become tiring) to look up.
 These interactions, however, only allow the user to rotate the camera but not translate it.
-For this reason, we display a small joystick on the bottom-left corner of the screen that mimics the first person video games interactions and allows the user translating the camera:
+For this reason, we display a small joystick on the bottom-left corner of the screen that mimics first-person video game interactions and allows the user to travel in the scene:
 \begin{itemize}
     \item moving the joystick up makes the camera move forward;
     \item moving the joystick down makes the camera move backwards;
@@ -231,6 +231,7 @@ We include a bookmark adaptation logic such that (i) when a bookmark is hovered
   \SetKw{Continue}{continue}
   \SetKwData{Bw}{bw\_estimation}
   \SetKwData{Rtt}{rtt\_estimation}
+  \SetKwData{CurrentSegment}{segment}
   \SetKwData{Segment}{best\_segment}
   \SetKwData{Candidates}{candidates}
   \SetKwData{AllSegments}{all\_segments}
@@ -241,7 +242,7 @@ We include a bookmark adaptation logic such that (i) when a bookmark is hovered
   \SetKwFunction{EstimateNetwork}{estimate\_network\_parameters}
   \SetKwFunction{Append}{append}

-  \Input{Current index $i$, time $t_i$, viewpoint $v(t_i)$, buffer of already downloaded \texttt{segments}, MPD}
+  \Input{Current index $i$, time $t_i$, viewpoint $v(t_i)$, buffer of already downloaded \texttt{segments}, MPD, utility metric $\mathcal{U}$, streaming policy $\Omega$}
   \Output{Next segment to request, updated buffer}

   \BlankLine{}
@@ -258,9 +259,9 @@ We include a bookmark adaptation logic such that (i) when a bookmark is hovered
   (\Bw, \Rtt) \leftarrow{} \EstimateNetwork{}\;
   \BlankLine{}
-  \Candidates\leftarrow{} \AllSegments\newline\makebox[1cm]{}.\Filter{$\Segment\rightarrow\Segment\notin\DownloadedSegments$}\newline\makebox[1cm]{}.\Filter{$\Segment\rightarrow\Segment\in\Frustum$}\;
+  \Candidates\leftarrow{} \AllSegments\newline\makebox[1cm]{}.\Filter{$\CurrentSegment\rightarrow\CurrentSegment\notin\DownloadedSegments$}\newline\makebox[1cm]{}.\Filter{$\CurrentSegment\rightarrow\CurrentSegment\in\Frustum$}\;
   \BlankLine{}
-  \Segment\leftarrow{} \Argmax{\Candidates, \Segment\rightarrow{} $\Omega\left(\mathcal{U}(\Segment)\right)$}\;
+  \Segment\leftarrow{} \Argmax{\Candidates, \CurrentSegment\rightarrow{} $\Omega\left(\mathcal{U},\CurrentSegment\right)$}\;
   \DownloadedSegments.\Append{\Segment}\;
   \Return\Segment;

 {\caption{Algorithm to identify
the next segment to query\label{sb:next-segment}}}
diff --git a/src/system-bookmarks/main.tex b/src/system-bookmarks/main.tex
index 2a10596..eb217a3 100644
--- a/src/system-bookmarks/main.tex
+++ b/src/system-bookmarks/main.tex
@@ -14,7 +14,7 @@ We add some widgets on the screen to support touch interactions: a virtual joyst
 Since most mobile devices embed a gyroscope, we allow users to rotate the camera by physically rotating the device.
 This interaction is more precise and intuitive to the user, but it is also more tiring, which is why we also added a touch interaction to rotate the camera: a user can also ``touch and drag'' at any point on the screen that does not correspond to the joystick.
 In order to ease navigation, we reintroduce bookmarks, and we propose an enhanced version of the precomputations explained in Chapter~\ref{sb} that we encode in the DASH Media Presentation Description.
-We then present a user study on 18 participants, that evaluates how users perceive the visual quality of the scene, and how their interactions affect it.
+We then present a user study on 18 participants, which evaluates how users perceive the visual quality of the scene, and how their interactions affect it.
 \newpage