Optimization in Machine Learning, Chapter 3: Projected Gradient Descent (1)
1. Projected Gradient Descent
\[\begin{align} y_{t+1} &= x_t-\gamma \nabla f(x_t)\\ x_{t+1} &= \Pi_X(y_{t+1}) = \arg\min_{x\in X}||x-y_{t+1}||^2 \end{align} \]
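As a concrete illustration, here is a minimal NumPy sketch of these two steps, assuming for simplicity that \(X\) is a Euclidean ball; the objective, radius and stepsize below are made up for this example.

```python
import numpy as np

def project_ball(y, radius=1.0):
    """Euclidean projection onto the ball X = {x : ||x|| <= radius}."""
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

def projected_gradient_descent(grad_f, x0, gamma, T, project):
    """Iterate y_{t+1} = x_t - gamma * grad_f(x_t); x_{t+1} = Pi_X(y_{t+1})."""
    x = x0
    for _ in range(T):
        y = x - gamma * grad_f(x)
        x = project(y)
    return x

# Toy example: minimize f(x) = ||x - c||^2 over the unit ball (c lies outside X).
c = np.array([2.0, 0.0])
grad_f = lambda x: 2.0 * (x - c)
x_T = projected_gradient_descent(grad_f, np.zeros(2), gamma=0.1, T=100,
                                 project=project_ball)
print(x_T)  # approaches the boundary point [1, 0]
```

The minimizer of this toy problem lies on the boundary of \(X\), which is exactly the situation where the projection step matters.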
\({\large \textbf{Theorem 3.1}:}\text{ Let }X\subseteq\mathbb{R}^d\text{ be closed and convex, }x\in X,\ y\in\mathbb{R}^d.\text{ Then:}\)
\[\begin{align} &(i) (x-\Pi_X(y))^T(y-\Pi_X(y))\leq 0\\ &(ii)||x-\Pi_X(y)||^2+||y-\Pi_X(y)||^2\leq ||x-y||^2 \end{align} \]
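A quick numerical sanity check of facts \((i)\) and \((ii)\); this is only a sketch reusing the ball projection from the snippet above, with arbitrary random points:

```python
import numpy as np

rng = np.random.default_rng(0)
radius = 1.0

def project_ball(y, radius=1.0):
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

for _ in range(1000):
    y = rng.normal(size=3) * 3.0       # arbitrary point, possibly outside X
    x = project_ball(rng.normal(size=3), radius)   # arbitrary point inside X
    p = project_ball(y, radius)        # Pi_X(y)
    # (i): the angle between x - Pi_X(y) and y - Pi_X(y) is obtuse
    assert (x - p) @ (y - p) <= 1e-12
    # (ii): Pythagoras-like inequality
    assert (x - p) @ (x - p) + (y - p) @ (y - p) <= (x - y) @ (x - y) + 1e-12
```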
2. Bounded gradients: \(O(1/\epsilon^2)\) steps
\({\large \textbf{Theorem 3.2}:}\text{ Let }f:dom(f)\rightarrow\mathbb{R}\text{ be convex and differentiable, }X\subseteq dom(f) \text{ be convex and closed, and let }x^*\text{ be a minimizer of }f\text{ over }X.\text{ Suppose }||x_0-x^*||\leq R\text{ and }||\nabla f(x)||\leq B \text{ for all }x\in X.\text{ Choosing the stepsize:}\)
\[\begin{align} \gamma = \frac{R}{B\sqrt{T}} \end{align} \]\(\text{Projected gradient descent yields:}\)
\[\frac{1}{T}\sum_{t=0}^{T-1}(f(x_t)-f(x^*))\leq \frac{RB}{\sqrt{T}} \]
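Before the proof, a small experiment that checks this averaged-error bound on the toy problem from above; the particular values of \(R\) and \(B\) below are rough bounds valid for that made-up instance, not part of the theorem.

```python
import numpy as np

def project_ball(y, radius=1.0):
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

# f(x) = ||x - c||^2 over the unit ball; minimizer x* = [1, 0], f(x*) = 1.
c = np.array([2.0, 0.0])
f = lambda x: np.dot(x - c, x - c)
grad_f = lambda x: 2.0 * (x - c)

T = 10_000
R = 1.0                    # ||x_0 - x*|| with x_0 = 0
B = 6.0                    # ||grad f(x)|| <= 2(||x|| + ||c||) <= 6 on the unit ball
gamma = R / (B * np.sqrt(T))

x = np.zeros(2)
gaps = []
for _ in range(T):
    gaps.append(f(x) - 1.0)           # f(x*) = 1
    x = project_ball(x - gamma * grad_f(x))

print(np.mean(gaps), "<=", R * B / np.sqrt(T))   # the bound of Theorem 3.2
```

Note that the guarantee is on the average error of the iterates, not on the last iterate.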
\(\large\textbf{Proof:}\)
\(\text{From }(1)\text{ and the vanilla analysis of the previous chapter, writing }g_t := \nabla f(x_t)\text{:}\)
\[\begin{align} g_t^T(x_t-x^*)=\frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||y_{t+1}-x^*||^2] \end{align} \]
\(\text{From (4): let }x=x^*,y=y_{t+1}\)
\[\begin{align} ||x^*-\Pi_X(y_{t+1})||^2+||y_{t+1}-\Pi_X(y_{t+1})||^2\leq ||x^*-y_{t+1}||^2 \end{align} \]\(\text{i.e.}\)
\[\begin{align} ||x^*-x_{t+1}||^2+||y_{t+1}-x_{t+1}||^2\leq ||x^*-y_{t+1}||^2 \end{align} \]\(\text{Then we drop the second term:}\)
\[\begin{align} ||x^*-x_{t+1}||^2\leq ||x^*-y_{t+1}||^2 \end{align} \]\(\text{The same bound holds with }t+1\text{ replaced by }t\text{ (for }t=0\text{ we set }y_0:=x_0\text{). Hence we get from (6):}\)
\[\begin{align} g_t^T(x_t-x^*)&=\frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||y_{t+1}-x^*||^2]\\ &\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||y_t-x^*||^2-||y_{t+1}-x^*||^2] \end{align} \]\(\text{From convexity: }f(x^*)\geq f(x_t)+g_t^T(x^*-x_t),\text{ hence:}\)
\[\begin{align} f(x_t)-f(x^*)&\leq g_t^T(x_t-x^*)\\ &\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||y_t-x^*||^2-||y_{t+1}-x^*||^2] \end{align} \]\(\text{Finally:}\)
\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq \frac{\gamma}{2}\sum_{t=0}^{T-1}||g_t||^2+\frac{1}{2\gamma}[||y_0-x^*||^2-||y_T-x^*||^2]\\ &\leq \frac{\gamma}{2}TB^2+\frac{1}{2\gamma}R^2 \end{align} \]\(\text{Dividing by }T\text{ and plugging in }\gamma = \frac{R}{B\sqrt{T}}\text{:}\)
\[\frac{1}{T}\sum_{t=0}^{T-1}[f(x_t)-f(x^*)]\leq \frac{\gamma}{2}B^2+\frac{R^2}{2\gamma T} = \frac{RB}{2\sqrt{T}}+\frac{RB}{2\sqrt{T}}=\frac{RB}{\sqrt{T}} \]
3. Smooth convex functions: \(O(1/\epsilon)\) steps
\({\large \textbf{Lemma 3.3}: }\text{Let }f:dom(f)\rightarrow \mathbb{R}\text{ be differentiable and smooth with parameter }L.\text{ Choosing stepsize }\gamma = \frac{1}{L},\)
\(\text{projected gradient descent satisfies:}\)
\[\begin{align} f(x_{t+1})\leq f(x_t)-\frac{1}{2L}||\nabla f(x_t)||^2+\frac{L}{2}||y_{t+1}-x_{t+1}||^2 \end{align} \]
\(\large\textbf{Proof:}\)
\(\text{From smoothness, and using }\nabla f(x_t)=L(x_t-y_{t+1})\text{ (the definition of }y_{t+1}\text{ with }\gamma=\frac{1}{L}\text{):}\)
\[\begin{align} f(x_{t+1})&\leq f(x_t)+\nabla f(x_t)^T(x_{t+1}-x_t)+\frac{L}{2}||x_{t+1}-x_t||^2\\ &=f(x_t)+L(x_t-y_{t+1})^T(x_{t+1}-x_t)+\frac{L}{2}||x_{t+1}-x_t||^2\\ &=f(x_t)+\frac{L}{2}[||x_{t+1}-y_{t+1}||^2-||x_t-y_{t+1}||^2]\\ &=f(x_t)-\frac{1}{2L}||\nabla f(x_t)||^2+\frac{L}{2}||y_{t+1}-x_{t+1}||^2 \end{align} \]
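As a sanity check, here is a short sketch that verifies this sufficient-decrease inequality at every iteration of the earlier toy problem; the objective, the ball constraint and \(L=2\) are specific to that made-up instance.

```python
import numpy as np

def project_ball(y, radius=1.0):
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

# Same toy problem: f(x) = ||x - c||^2 over the unit ball; f is smooth with L = 2.
c = np.array([2.0, 0.0])
f = lambda x: np.dot(x - c, x - c)
grad_f = lambda x: 2.0 * (x - c)
L = 2.0

x = np.array([0.0, 0.9])
for _ in range(50):
    g = grad_f(x)
    y_next = x - g / L                 # gamma = 1/L
    x_next = project_ball(y_next)
    lhs = f(x_next)
    rhs = f(x) - np.dot(g, g) / (2 * L) + (L / 2) * np.dot(y_next - x_next, y_next - x_next)
    assert lhs <= rhs + 1e-12          # Lemma 3.3 holds at every step
    x = x_next
```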
\({\large \textbf{Theorem 3.4}:}\text{ Let }f:dom(f)\rightarrow \mathbb{R}\text{ be convex and differentiable, }X\subseteq dom(f)\text{ closed and convex, and }x^*\text{ a minimizer of }f\text{ over }X.\text{ Suppose }f\text{ is smooth with parameter }L.\text{ Choosing stepsize }\gamma = \frac{1}{L},\text{ projected gradient descent satisfies:}\)
\[\begin{align} f(x_T)-f(x^*)\leq \frac{L}{2T}||x_0-x^*||^2 \end{align} \]\(\large \textbf{Proof:}\)
\(\text{We will use (16) at the end to bound the gradient terms. First, go back to the vanilla analysis (6):}\)
\[\begin{align} g_t^T(x_t-x^*)=\frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||y_{t+1}-x^*||^2] \end{align} \]\(\text{and}\)
\[\begin{align} ||x^*-x_{t+1}||^2+||y_{t+1}-x_{t+1}||^2\leq ||x^*-y_{t+1}||^2 \end{align} \]\(\text{Previously we simply dropped the second term; now we keep it:}\)
\[\begin{align} g_t^T(x_t-x^*)&=\frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||y_{t+1}-x^*||^2]\\ &\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||x_{t+1}-x^*||^2-||y_{t+1}-x_{t+1}||^2] \end{align} \]\(\text{From convexity: }f(x_t)-f(x^*)\leq g_t^T(x_t-x^*)\)
\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq \sum_{t=0}^{T-1}g_t^T(x_t-x^*)\\ &\leq \frac{1}{2L}\sum_{t=0}^{T-1}||g_t||^2+\frac{L}{2}||x_0-x^*||^2-\frac{L}{2}\sum_{t=0}^{T-1}||y_{t+1}-x_{t+1}||^2 \end{align} \]\(\text{Now bound the first term by summing (16) over }t\text{:}\)
\[\begin{align} \frac{1}{2L}\sum_{t=0}^{T-1}||g_t||^2&\leq \sum_{t=0}^{T-1}[f(x_t)-f(x_{t+1})+\frac{L}{2}||y_{t+1}-x_{t+1}||^2]\\ &=f(x_0)-f(x_T)+\frac{L}{2}\sum_{t=0}^{T-1}||y_{t+1}-x_{t+1}||^2 \end{align} \]\(\text{Finally, combine (29) and (27):}\)
\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq f(x_0)-f(x_T)+\frac{L}{2}||x_0-x^*||^2 \end{align} \]\(\text{Rearranging gives }\sum_{t=1}^{T}[f(x_t)-f(x^*)]\leq \frac{L}{2}||x_0-x^*||^2.\text{ Moreover, smoothness together with (3) gives }f(x_{t+1})\leq f(x_t)-\frac{L}{2}||x_{t+1}-x_t||^2,\text{ so }f(x_T)\leq f(x_t)\text{ for all }t\leq T.\text{ Hence:}\)
\[\begin{align} f(x_T)-f(x^*)\leq \frac{1}{T}\sum_{t=1}^{T}[f(x_t)-f(x^*)]\leq \frac{L}{2T}||x_0-x^*||^2 \end{align} \]
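To close, a small experiment illustrating the last-iterate guarantee of Theorem 3.4; the quadratic objective, the unit-ball constraint and the reference point \(x^*\) (estimated by a long run) are arbitrary choices for this sketch, and since this instance is strongly convex it converges much faster than the worst-case \(O(1/T)\) bound.

```python
import numpy as np

def project_ball(y, radius=1.0):
    norm = np.linalg.norm(y)
    return y if norm <= radius else y * (radius / norm)

# Smooth toy problem: f(x) = 1/2 (x - c)^T A (x - c) with A = diag(2, 0.2),
# so f is smooth with L = 2 (largest eigenvalue of A); X is the unit ball.
A = np.diag([2.0, 0.2])
c = np.array([2.0, 1.0])
f = lambda x: 0.5 * (x - c) @ A @ (x - c)
grad_f = lambda x: A @ (x - c)
L = 2.0

def run_pgd(T, x0):
    x = x0
    for _ in range(T):
        x = project_ball(x - grad_f(x) / L)       # gamma = 1/L
    return x

x0 = np.zeros(2)
x_star = run_pgd(200_000, x0)                     # numerical proxy for x*
f_star = f(x_star)

for T in [10, 100, 1000]:
    gap = f(run_pgd(T, x0)) - f_star
    bound = L / (2 * T) * np.dot(x0 - x_star, x0 - x_star)
    print(f"T={T}: gap={gap:.2e}  bound={bound:.2e}")   # gap stays below the L/(2T) bound
```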