机器学习中的优化 Optimization Chapter 2 Gradient Descent(2)
\(\large \bf{Theorem }\ 2.7:\)
\(\text{Let }f:\mathbb{R^d}\rightarrow\mathbb{R}\text{ be convex and differentiable with a global minimum }x^*;\text{ suppose }f\text{ is smooth with parameter }L.\text{ Choosing stepsize }\gamma = \frac{1}{L},\text{ gradient descent yields:}\)
\[\begin{align} f(x_T)-f(x^*)\leq \frac{L}{2T}||x_0-x^*||^2,\quad T>0 \end{align} \]
\(\large\bf Proof:\)
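\(\text{Since }f\text{ is differentiable and smooth, Lemma 2.6 with }\gamma=1/L\text{ gives, for }g_t=\nabla f(x_t):\)
\[\begin{align} f(x_{t+1})\leq f(x_t)-\frac{1}{2L}||g_t||^2 \end{align} \]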
\(\text{Therefore:}\)
\[\begin{align} \frac{1}{2L}||g_t||^2\leq f(x_t)-f(x_{t+1}) \end{align} \]\(\text{Now we sum up:}\)
\[\begin{align} \frac{1}{2L}\sum_{t=0}^{T-1}||g_t||^2&\leq \sum_{t=0}^{T-1}[f(x_t)-f(x_{t+1})]\\ &=f(x_0)-f(x_T) \end{align} \]\(\gamma = 1/L,\text{ therefore from previous analysis:}\)
\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]\leq \frac{\gamma}{2}\sum_{t=0}^{T-1}||g_t||^2+\frac{1}{2\gamma}||x_0-x^*||^2 \end{align} \]\(\text{Combine (5) and (6):}\)
\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq \frac{\gamma}{2}\sum_{t=0}^{T-1}||g_t||^2+\frac{1}{2\gamma}||x_0-x^*||^2 \\ &\leq f(x_0)-f(x_T)+\frac{1}{2\gamma}||x_0-x^*||^2 \end{align} \]\(\text{Hence:}\)
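\[\begin{align} \sum_{t=1}^{T}[f(x_t)-f(x^*)]&\leq \frac{1}{2\gamma}||x_0-x^*||^2\\ &=\frac{L}{2}||x_0-x^*||^2 \end{align} \]\(\text{As a result, since }f(x_{t+1})\leq f(x_t)\text{ for every }t\text{ (sufficient decrease), }f(x_T)\text{ is the smallest of the values }f(x_1),\dots,f(x_T)\text{:}\)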
\[\begin{align} T\cdot (f(x_T)-f(x^*))&\leq \sum_{t=1}^T[f(x_t)-f(x^*)]\\ &\leq\frac{L}{2}||x_0-x^*||^2 \end{align} \]\[\begin{align} \Rightarrow f(x_T)-f(x^*)\leq \frac{L}{2T}||x_0-x^*||^2 \end{align} \]
1. Smooth and strongly convex function: \(O(\log(1/\epsilon))\) steps
\(\text{First-order method: only use the gradient information to minimize }f.\)
\(\large\bf Definition\ 2.9:\)
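\(\text{Strongly convex function: let }f:\mathbb{R^d}\rightarrow\mathbb{R}\text{ be convex and differentiable, and let }\mu>0.\ f\text{ is strongly convex with parameter }\mu\text{ if}\)
\[\begin{align} f(y)\geq f(x)+\nabla f(x)^T(y-x)+\frac{\mu}{2}||x-y||^2,\quad \forall x,y\in\mathbb{R^d} \end{align} \]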
\(\large \bf Lemma\ 2.10:\)
\(\text{if }f \text{ is strongly convex with parameter }\mu>0,\text{ then }f\text{ is }\bf{strictly\ convex\ and\ has\ a\ unique\ global\ minimum.}\)
\(\text{Assume that }f\text{ is strongly convex with parameter }\mu.\text{ Then, in the vanilla analysis:}\)
\[\begin{align} g_t^T(x_t-x^*)&=\nabla f(x_t)^T(x_t-x^*)\\ &\geq f(x_t)-f(x^*)+\frac{\mu}{2}||x_t-x^*||^2 \end{align} \]\(\text{Hence:}\)
\[\begin{align} f(x_t)-f(x^*)&\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||x_{t+1}-x^*||^2]-\frac{\mu}{2}||x_t-x^*||^2 \end{align} \]\(\text{Rewrite it as:}\)
\[\begin{align} ||x_{t+1}-x^*||^2\leq 2\gamma [f(x^*)-f(x_t)]+\gamma^2||g_t||^2+(1-\mu\gamma)||x_t-x^*||^2 \end{align} \]\(\large\bf{Theorem\ 2.12:}\)
\(\text{Let }f:\mathbb{R^d}\rightarrow\mathbb{R}\text{ be convex and differentiable. Suppose }f\text{ is smooth with parameter }L\text{ and strongly convex with parameter }\mu>0. \text{ Choosing stepsize:}\)
\[\begin{align} \gamma=\frac{1}{L} \end{align} \]
\(\text{Gradient descent with arbitrary }x_0\text{ satisfies the following two properties:}\)
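\((i)\)
\[\begin{align} ||x_{t+1}-x^*||^2\leq \left(1-\frac{\mu}{L}\right)||x_t-x^*||^2,\quad t\geq 0 \end{align} \]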
\(\large\bf Proof:\)
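\(\text{By smoothness (sufficient decrease with }\gamma=1/L\text{) and the optimality of }x^*,\text{ we know:}\)
\[\begin{align} f(x^*)-f(x_t)\leq f(x_{t+1})-f(x_t)\leq -\frac{1}{2L}||g_t||^2 \end{align} \]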
\(\text{Combining this with (18) and using }\gamma=1/L,\text{ we get:}\)
\[\begin{align} ||x_{t+1}-x^*||^2&\leq -\gamma^2||g_t||^2+\gamma^2||g_t||^2+(1-\mu\gamma)||x_t-x^*||^2\\ &\leq (1-\frac{\mu}{L})||x_t-x^*||^2 \end{align} \]\((ii)\)
\[\begin{align} f(x_T)-f(x^*)\leq \frac{L}{2}(1-\frac{\mu}{L})^T||x_0-x^*||^2 \end{align} \]\(\large\bf Proof:\)
\(\text{From smooth:}\)