Optimization in Machine Learning, Chapter 3: Projected Gradient Descent (1)
1. Projected Gradient Descent
\[\begin{align} y_{t+1} &= x_t-\gamma \nabla f(x_t)\\ x_{t+1} &= \Pi_X(y_{t+1}) = \arg\min_{x\in X}||x-y_{t+1}||^2 \end{align} \]
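As a concrete illustration, here is a minimal NumPy sketch of these two steps, assuming for simplicity that \(X\) is a Euclidean ball; the objective, radius and stepsize below are made up for this example.

```python
import numpy as np

def project_ball(y, radius=1.0):
    """Euclidean projection onto the ball X = {x : ||x|| <= radius}."""
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

def projected_gradient_descent(grad_f, x0, gamma, T, project):
    """Iterate y_{t+1} = x_t - gamma * grad_f(x_t); x_{t+1} = Pi_X(y_{t+1})."""
    x = x0
    for _ in range(T):
        y = x - gamma * grad_f(x)
        x = project(y)
    return x

# Toy example: minimize f(x) = ||x - c||^2 over the unit ball (c lies outside X).
c = np.array([2.0, 0.0])
grad_f = lambda x: 2.0 * (x - c)
x_T = projected_gradient_descent(grad_f, np.zeros(2), gamma=0.1, T=100,
                                 project=project_ball)
print(x_T)  # approaches the boundary point [1, 0]
```

The minimizer of this toy problem lies on the boundary of \(X\), which is exactly the situation where the projection step matters.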
\({\large \textbf{Theorem 3.1}:}\text{ Let }X\subseteq\mathbb{R}^d\text{ be closed and convex, }x\in X,\ y\in\mathbb{R}^d.\text{ Then:}\)
\[\begin{align} &(i) (x-\Pi_X(y))^T(y-\Pi_X(y))\leq 0\\ &(ii)||x-\Pi_X(y)||^2+||y-\Pi_X(y)||^2\leq ||x-y||^2 \end{align} \]
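A quick numerical sanity check of facts \((i)\) and \((ii)\); this is only a sketch reusing the ball projection from the snippet above, with arbitrary random points:

```python
import numpy as np

rng = np.random.default_rng(0)
radius = 1.0

def project_ball(y, radius=1.0):
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

for _ in range(1000):
    y = rng.normal(size=3) * 3.0       # arbitrary point, possibly outside X
    x = project_ball(rng.normal(size=3), radius)   # arbitrary point inside X
    p = project_ball(y, radius)        # Pi_X(y)
    # (i): the angle between x - Pi_X(y) and y - Pi_X(y) is obtuse
    assert (x - p) @ (y - p) <= 1e-12
    # (ii): Pythagoras-like inequality
    assert (x - p) @ (x - p) + (y - p) @ (y - p) <= (x - y) @ (x - y) + 1e-12
```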
2. Bounded gradients: \(O(1/\epsilon^2)\) steps
\({\large \textbf{Theorem 3.2}:}\text{ Let }f:dom(f)\rightarrow\mathbb{R}\text{ be convex and differentiable, }X\subseteq dom(f) \text{ be convex and closed, and let }x^*\text{ be a minimizer of }f\text{ over }X.\text{ Suppose }||x_0-x^*||\leq R\text{ and }||\nabla f(x)||\leq B \text{ for all }x\in X.\text{ Choosing the stepsize:}\)
\[\begin{align} \gamma = \frac{R}{B\sqrt{T}} \end{align} \]\(\text{Projected gradient descent yields:}\)
\[\frac{1}{T}\sum_{t=0}^{T-1}(f(x_t)-f(x^*))\leq \frac{RB}{\sqrt{T}} \]
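Before the proof, a small experiment that checks this averaged-error bound on the toy problem from above; the particular values of \(R\) and \(B\) below are rough bounds valid for that made-up instance, not part of the theorem.

```python
import numpy as np

def project_ball(y, radius=1.0):
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

# f(x) = ||x - c||^2 over the unit ball; minimizer x* = [1, 0], f(x*) = 1.
c = np.array([2.0, 0.0])
f = lambda x: np.dot(x - c, x - c)
grad_f = lambda x: 2.0 * (x - c)

T = 10_000
R = 1.0                    # ||x_0 - x*|| with x_0 = 0
B = 6.0                    # ||grad f(x)|| <= 2(||x|| + ||c||) <= 6 on the unit ball
gamma = R / (B * np.sqrt(T))

x = np.zeros(2)
gaps = []
for _ in range(T):
    gaps.append(f(x) - 1.0)           # f(x*) = 1
    x = project_ball(x - gamma * grad_f(x))

print(np.mean(gaps), "<=", R * B / np.sqrt(T))   # the bound of Theorem 3.2
```

Note that the guarantee is on the average error of the iterates, not on the last iterate.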
\(\large\textbf{Proof:}\)
\(\text{From }(1)\text{ and the vanilla analysis of the previous chapter, writing }g_t := \nabla f(x_t)\text{:}\)
\[\begin{align} g_t^T(x_t-x^*)=\frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||y_{t+1}-x^*||^2] \end{align} \]
\(\text{From (4): let }x=x^*,y=y_{t+1}\)
\[\begin{align} ||x^*-\Pi_X(y_{t+1})||^2+||y_{t+1}-\Pi_X(y_{t+1})||^2\leq ||x^*-y_{t+1}||^2 \end{align} \]\(\text{i.e.}\)
\[\begin{align} ||x^*-x_{t+1}||^2+||y_{t+1}-x_{t+1}||^2\leq ||x^*-y_{t+1}||^2 \end{align} \]\(\text{Then we drop the second term:}\)
\[\begin{align} ||x^*-x_{t+1}||^2\leq ||x^*-y_{t+1}||^2 \end{align} \]\(\text{The same bound holds with }t+1\text{ replaced by }t\text{ (for }t=0\text{ we set }y_0:=x_0\text{). Hence we get from (6):}\)
\[\begin{align} g_t^T(x_t-x^*)&=\frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||y_{t+1}-x^*||^2]\\ &\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||y_t-x^*||^2-||y_{t+1}-x^*||^2] \end{align} \]\(\text{From convexity: }f(x^*)\geq f(x_t)+g_t^T(x^*-x_t),\text{ hence:}\)
\[\begin{align} f(x_t)-f(x^*)&\leq g_t^T(x_t-x^*)\\ &\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||y_t-x^*||^2-||y_{t+1}-x^*||^2] \end{align} \]\(\text{Finally:}\)
\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq \frac{\gamma}{2}\sum_{t=0}^{T-1}||g_t||^2+\frac{1}{2\gamma}[||y_0-x^*||^2-||y_T-x^*||^2]\\ &\leq \frac{\gamma}{2}TB^2+\frac{1}{2\gamma}R^2 \end{align} \]\(\text{Dividing by }T\text{ and plugging in }\gamma = \frac{R}{B\sqrt{T}}\text{:}\)
\[\frac{1}{T}\sum_{t=0}^{T-1}[f(x_t)-f(x^*)]\leq \frac{\gamma}{2}B^2+\frac{R^2}{2\gamma T} = \frac{RB}{2\sqrt{T}}+\frac{RB}{2\sqrt{T}}=\frac{RB}{\sqrt{T}} \]
3. Smooth convex functions: \(O(1/\epsilon)\) steps
\({\large \textbf{Lemma 3.3}: }\text{Let }f:dom(f)\rightarrow \mathbb{R}\text{ be differentiable and smooth with parameter }L.\text{ Choosing stepsize }\gamma = \frac{1}{L},\)
\(\text{projected gradient descent satisfies:}\)
\[\begin{align} f(x_{t+1})\leq f(x_t)-\frac{1}{2L}||\nabla f(x_t)||^2+\frac{L}{2}||y_{t+1}-x_{t+1}||^2 \end{align} \]
\(\large\textbf{Proof:}\)
\(\text{From smoothness, and using }\nabla f(x_t)=L(x_t-y_{t+1})\text{ (the definition of }y_{t+1}\text{ with }\gamma=\frac{1}{L}\text{):}\)
\[\begin{align} f(x_{t+1})&\leq f(x_t)+\nabla f(x_t)^T(x_{t+1}-x_t)+\frac{L}{2}||x_{t+1}-x_t||^2\\ &=f(x_t)+L(x_t-y_{t+1})^T(x_{t+1}-x_t)+\frac{L}{2}||x_{t+1}-x_t||^2\\ &=f(x_t)+\frac{L}{2}[||x_{t+1}-y_{t+1}||^2-||x_t-y_{t+1}||^2]\\ &=f(x_t)-\frac{1}{2L}||\nabla f(x_t)||^2+\frac{L}{2}||y_{t+1}-x_{t+1}||^2 \end{align} \]
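As a sanity check, here is a short sketch that verifies this sufficient-decrease inequality at every iteration of the earlier toy problem; the objective, the ball constraint and \(L=2\) are specific to that made-up instance.

```python
import numpy as np

def project_ball(y, radius=1.0):
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

# Same toy problem: f(x) = ||x - c||^2 over the unit ball; f is smooth with L = 2.
c = np.array([2.0, 0.0])
f = lambda x: np.dot(x - c, x - c)
grad_f = lambda x: 2.0 * (x - c)
L = 2.0

x = np.array([0.0, 0.9])
for _ in range(50):
    g = grad_f(x)
    y_next = x - g / L                 # gamma = 1/L
    x_next = project_ball(y_next)
    lhs = f(x_next)
    rhs = f(x) - np.dot(g, g) / (2 * L) + (L / 2) * np.dot(y_next - x_next, y_next - x_next)
    assert lhs <= rhs + 1e-12          # Lemma 3.3 holds at every step
    x = x_next
```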
\({\large \textbf{Theorem 3.4}:}\text{ Let }f:dom(f)\rightarrow \mathbb{R}\text{ be convex and differentiable, }X\subseteq dom(f)\text{ closed and convex, and }x^*\text{ a minimizer of }f\text{ over }X.\text{ Suppose }f\text{ is smooth with parameter }L.\text{ Choosing stepsize }\gamma = \frac{1}{L},\text{ projected gradient descent satisfies:}\)
\[\begin{align} f(x_T)-f(x^*)\leq \frac{L}{2T}||x_0-x^*||^2 \end{align} \]\(\large \textbf{Proof:}\)
\(\text{We will use (16) at the end to bound the gradient terms. First, go back to the vanilla analysis (6):}\)
\[\begin{align} g_t^T(x_t-x^*)=\frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||y_{t+1}-x^*||^2] \end{align} \]\(\text{and}\)
\[\begin{align} ||x^*-x_{t+1}||^2+||y_{t+1}-x_{t+1}||^2\leq ||x^*-y_{t+1}||^2 \end{align} \]\(\text{Previously we simply dropped the second term; now we keep it:}\)
\[\begin{align} g_t^T(x_t-x^*)&=\frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||y_{t+1}-x^*||^2]\\ &\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||x_{t+1}-x^*||^2-||y_{t+1}-x_{t+1}||^2] \end{align} \]\(\text{From convexity: }f(x_t)-f(x^*)\leq g_t^T(x_t-x^*)\)
\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq \sum_{t=0}^{T-1}g_t^T(x_t-x^*)\\ &\leq \frac{1}{2L}\sum_{t=0}^{T-1}||g_t||^2+\frac{L}{2}||x_0-x^*||^2-\frac{L}{2}\sum_{t=0}^{T-1}||y_{t+1}-x_{t+1}||^2 \end{align} \]\(\text{Now bound the first term by summing (16) over }t\text{:}\)
\[\begin{align} \frac{1}{2L}\sum_{t=0}^{T-1}||g_t||^2&\leq \sum_{t=0}^{T-1}[f(x_t)-f(x_{t+1})+\frac{L}{2}||y_{t+1}-x_{t+1}||^2]\\ &=f(x_0)-f(x_T)+\frac{L}{2}\sum_{t=0}^{T-1}||y_{t+1}-x_{t+1}||^2 \end{align} \]\(\text{Finally, combine (29) and (27):}\)
\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq f(x_0)-f(x_T)+\frac{L}{2}||x_0-x^*||^2 \end{align} \]\(\text{Rearranging gives }\sum_{t=1}^{T}[f(x_t)-f(x^*)]\leq \frac{L}{2}||x_0-x^*||^2.\text{ Moreover, smoothness together with (3) gives }f(x_{t+1})\leq f(x_t)-\frac{L}{2}||x_{t+1}-x_t||^2,\text{ so }f(x_T)\leq f(x_t)\text{ for all }t\leq T.\text{ Hence:}\)
\[\begin{align} f(x_T)-f(x^*)\leq \frac{1}{T}\sum_{t=1}^{T}[f(x_t)-f(x^*)]\leq \frac{L}{2T}||x_0-x^*||^2 \end{align} \]
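To close, a small experiment illustrating the last-iterate guarantee of Theorem 3.4; the quadratic objective, the unit-ball constraint and the reference point \(x^*\) (estimated by a long run) are arbitrary choices for this sketch, and since this instance is strongly convex it converges much faster than the worst-case \(O(1/T)\) bound.

```python
import numpy as np

def project_ball(y, radius=1.0):
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

# Smooth toy problem: f(x) = 1/2 (x - c)^T A (x - c) with A = diag(2, 0.2),
# so f is smooth with L = 2 (largest eigenvalue of A); X is the unit ball.
A = np.diag([2.0, 0.2])
c = np.array([2.0, 1.0])
f = lambda x: 0.5 * (x - c) @ A @ (x - c)
grad_f = lambda x: A @ (x - c)
L = 2.0

def run_pgd(T, x0):
    x = x0
    for _ in range(T):
        x = project_ball(x - grad_f(x) / L)       # gamma = 1/L
    return x

x0 = np.zeros(2)
x_star = run_pgd(200_000, x0)                     # numerical proxy for x*
f_star = f(x_star)

for T in [10, 100, 1000]:
    gap = f(run_pgd(T, x0)) - f_star
    bound = L / (2 * T) * np.dot(x0 - x_star, x0 - x_star)
    print(f"T={T}: gap={gap:.2e}  bound={bound:.2e}")   # gap stays below the L/(2T) bound
```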