机器学习中的优化 Optimization Chapter 2 Gradient Descent(2)


\(\large \bf{Theorem }\ 2.7:\)
\(\text{Let }f:\mathbb{R}^d\rightarrow\mathbb{R}\text{ be convex and differentiable with a global minimum }x^*.\text{ Suppose }f\text{ is smooth with parameter }L.\text{ Choosing stepsize: }\gamma = \frac{1}{L},\text{ gradient descent yields:}\)

\[\begin{align} f(x_T)-f(x^*)\leq \frac{L}{2T}||x_0-x^*||^2 \end{align} \]

\(\large\bf Proof:\)
\(\text{Since }f\text{ is differentiable and smooth, Lemma 2.6 gives:}\)

\[\begin{align} f(x_{t+1})-f(x_t)\leq -\frac{1}{2L}||g_t||^2 \end{align} \]

\(\text{Therefore:}\)

\[\begin{align} \frac{1}{2L}||g_t||^2\leq f(x_t)-f(x_{t+1}) \end{align} \]

\(\text{Now we sum up:}\)

\[\begin{align} \frac{1}{2L}\sum_{t=0}^{T-1}||g_t||^2&\leq \sum_{t=0}^{T-1}[f(x_t)-f(x_{t+1})]\\ &=f(x_0)-f(x_T) \end{align} \]

\(\gamma = 1/L,\text{ therefore from previous analysis:}\)

\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]\leq \frac{\gamma}{2}\sum_{t=0}^{T-1}||g_t||^2+\frac{1}{2\gamma}||x_0-x^*||^2 \end{align} \]

\(\text{Combine (5) and (6):}\)

\[\begin{align} \sum_{t=0}^{T-1}[f(x_t)-f(x^*)]&\leq \frac{\gamma}{2}\sum_{t=0}^{T-1}||g_t||^2+\frac{1}{2\gamma}||x_0-x^*||^2 \\ &\leq f(x_0)-f(x_T)+\frac{1}{2\gamma}||x_0-x^*||^2 \end{align} \]

\(\text{Hence:}\)

\[\begin{align} \sum_{t=1}^{T}[f(x_t)-f(x^*)]&\leq \frac{1}{2\gamma}||x_0-x^*||^2\\ &=\frac{L}{2}||x_0-x^*||^2 \end{align} \]

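\(\text{As a result, since (2) shows that }f(x_{t+1})\leq f(x_t)\text{ for every }t,\text{ the last iterate has the smallest function value:}\)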

\[\begin{align} T\cdot (f(x_T)-f(x^*))&\leq \sum_{t=1}^T[f(x_t)-f(x^*)]\\ &\leq\frac{L}{2}||x_0-x^*||^2 \end{align} \]

\[\begin{align} \Rightarrow f(x_T)-f(x^*)\leq \frac{L}{2T}||x_0-x^*||^2 \end{align} \]

1. Smooth and strongly convex function:\(O(\log(1/\epsilon))\) steps

\(\text{First-order method: only use the gradient information to minimize }f.\)

\(\large\bf Definition\ 2.9:\)
\(\text{Strongly convex function: }f\text{ is strongly convex with parameter }\mu>0\text{ if for all }x,y:\)

\[\begin{align} f(y)\geq f(x)+\nabla f(x)^T(y-x)+\frac{\mu}{2}||x-y||^2 \end{align} \]

\(\large \bf Lemma\ 2.10:\)
\(\text{If }f \text{ is strongly convex with parameter }\mu>0,\text{ then }f\text{ is }\bf{strictly\ convex\ and\ has\ a\ unique\ global\ minimum.}\)

\(\text{Assume that }f\text{ is strongly convex with parameter }\mu.\text{ Revisiting the vanilla analysis:}\)

\[\begin{align} g_t^T(x_t-x^*)&=\nabla f(x_t)^T(x_t-x^*)\\ &\geq f(x_t)-f(x^*)+\frac{\mu}{2}||x_t-x^*||^2 \end{align} \]

\(\text{Hence:}\)

\[\begin{align} f(x_t)-f(x^*)&\leq \frac{1}{2\gamma}[\gamma^2||g_t||^2+||x_t-x^*||^2-||x_{t+1}-x^*||^2]-\frac{\mu}{2}||x_t-x^*||^2 \end{align} \]

\(\text{Rewrite it as:}\)

\[\begin{align} ||x_{t+1}-x^*||^2\leq 2\gamma [f(x^*)-f(x_t)]+\gamma^2||g_t||^2+(1-\mu\gamma)||x_t-x^*||^2 \end{align} \]

\(\large\bf{Theorem\ 2.12:}\)
\(\text{Let }f:\mathbb{R}^d\rightarrow\mathbb{R}\text{ be convex and differentiable. Suppose }f\text{ is smooth with parameter }L\text{ and strongly convex with parameter }\mu. \text{ Choosing stepsize:}\)

\[\begin{align} \gamma = 1/L \end{align} \]

\(\text{Gradient descent with arbitrary }x_0\text{ satisfies the following two properties:}\)
\((i)\)

\[\begin{align} ||x_{t+1}-x^*||^2\leq (1-\frac{\mu}{L})||x_t-x^*||^2 \end{align} \]

\(\large\bf Proof:\)
\(\text{By optimality of }x^*\text{ and by smoothness, we know:}\)

\[\begin{align} f(x^*)-f(x_t)\leq f(x_{t+1})-f(x_t)\leq -\frac{1}{2L}||g_t||^2 \end{align} \]

\(\text{Combining this with (18) and using }\gamma=1/L,\text{ we get:}\)

\[\begin{align} ||x_{t+1}-x^*||^2&\leq -\gamma^2||g_t||^2+\gamma^2||g_t||^2+(1-\mu\gamma)||x_t-x^*||^2\\ &\leq (1-\frac{\mu}{L})||x_t-x^*||^2 \end{align} \]

\((ii)\)

\[\begin{align} f(x_T)-f(x^*)\leq \frac{L}{2}(1-\frac{\mu}{L})^T||x_0-x^*||^2 \end{align} \]

\(\large\bf Proof:\)
\(\text{From smoothness and }\nabla f(x^*)=0:\)

\[\begin{align} f(x_t)\leq f(x^*)+\frac{L}{2}||x_t-x^*||^2 \end{align} \]

\[\begin{align} \Rightarrow f(x_T)-f(x^*)&\leq \frac{L}{2}||x_T-x^*||^2\\ &\leq ... \leq \frac{L}{2}(1-\frac{\mu}{L})^T||x_0-x^*||^2 \end{align} \]