{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "94f0a60c",
   "metadata": {},
   "source": [
    "# Регрессия"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "8aa79ab0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "plt.rcParams.update({\"font.size\": 16})\n",
    "\n",
    "\n",
    "def generate_problem(function, a, b, randomize_x=False, noise=None, n=50, seed=None):\n",
    "    \"\"\"Sample n points of y = function(x) on [a, b], optionally with noise.\n",
    "\n",
    "    Args:\n",
    "        function: vectorised callable applied to the array of x values.\n",
    "        a, b: bounds of the sampling interval.\n",
    "        randomize_x: draw x uniformly at random instead of an even grid.\n",
    "        noise: standard deviation of additive Gaussian noise; a falsy\n",
    "            value (None or 0) disables it.\n",
    "        n: number of points.\n",
    "        seed: optional int or numpy Generator for reproducible draws; None\n",
    "            keeps drawing from NumPy's global random state.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with columns \"x\" and \"y\", one row per point.\n",
    "    \"\"\"\n",
    "    rng = np.random if seed is None else np.random.default_rng(seed)\n",
    "    x = rng.uniform(low=a, high=b, size=n) if randomize_x else np.linspace(a, b, n)\n",
    "    y = function(x)\n",
    "    if noise:\n",
    "        # Add out of place: an in-place += would also overwrite x when\n",
    "        # function returns its argument unchanged (e.g. the identity) and\n",
    "        # raises a casting error for integer-valued results.\n",
    "        y = y + rng.normal(loc=0., scale=noise, size=n)\n",
    "    return pd.DataFrame({\n",
    "        \"x\": x,\n",
    "        \"y\": y\n",
    "    })\n",
    "\n",
    "\n",
    "def plot_regression_problem(ax, data, regressor=None, test_data=None):\n",
    "    \"\"\"Scatter a regression dataset on ax and optionally overlay a fitted curve.\n",
    "\n",
    "    Args:\n",
    "        ax: matplotlib Axes to draw on.\n",
    "        data: training DataFrame with columns \"x\" and \"y\"; not modified.\n",
    "        regressor: optional vectorised callable mapping x values to\n",
    "            predictions, drawn as a green line over the observed x range.\n",
    "        test_data: optional DataFrame with the same columns, shown in a\n",
    "            separate hue.\n",
    "    \"\"\"\n",
    "    # dataset column: label every row with the split it belongs to\n",
    "    frames = [data.assign(dataset=\"train\")]\n",
    "    if test_data is not None:\n",
    "        frames.append(test_data.assign(dataset=\"test\"))\n",
    "    # ignore_index keeps row labels unique after stacking the two splits\n",
    "    data = pd.concat(frames, ignore_index=True)\n",
    "\n",
    "    # scatter\n",
    "    ax.set_ylim([\n",
    "        data[\"y\"].min() - 0.1,\n",
    "        data[\"y\"].max() + 0.1\n",
    "    ])\n",
    "    sns.scatterplot(data=data, x=\"x\", y=\"y\", hue=\"dataset\", ax=ax)\n",
    "\n",
    "    # regressor\n",
    "    # Compare with None instead of testing truthiness: callables that define\n",
    "    # __len__ (e.g. a constant numpy.poly1d, whose length is its degree 0)\n",
    "    # are falsy and would silently not be drawn.\n",
    "    if regressor is not None:\n",
    "        x_min, x_max = data[\"x\"].min(), data[\"x\"].max()\n",
    "        x = np.linspace(x_min, x_max, 100)\n",
    "        y = regressor(x)\n",
    "        sns.lineplot(x=x, y=y, color=\"green\", ax=ax)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c98edb4d",
   "metadata": {},
   "source": [
    "## Аппроксимация полиномами\n",
    "\n",
    "Рассмотрим задачу аппроксимации неизвестной функции одного аргумента. Пусть имеется набор из $N$ точек $(x_i, y_i) \\in\\mathbb{R}, \\, i=1,\\ldots, N$ и перед нами стоит цель найти функцию $f\\colon \\mathbb{R}\\to\\mathbb{R}$, которая бы хорошо аппроксимировала имеющиеся точки. 
Для этого можно поставить задачу поиска функции $f$ из класса $F$ с наименьшей среднеквадратичной ошибкой\n", "\n", "$$\n", "\\dfrac1N\\sum_{i=1}^N(f(x_i) - y_i)^2 \\sim \\min\\limits_{f\\in F}.\n", "$$\n", "\n", "В ячейке ниже генерируется набор таких точек.\n", "- значения $x_i$ генерируются на равномерной сетке на отрезке $[0, 4]$;\n", "- значения $y_i$ генерируются в качестве значения полинома $y_i = f(x_i) = 1 - \\dfrac{x_i^2}{2!} + \\dfrac{x_i^4}{4!}$." ] }, { "cell_type": "code", "execution_count": 74, "id": "b2afc2e8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | x | \n", "y | \n", "dataset | \n", "
---|---|---|---|
0 | \n", "0.00 | \n", "1.00 | \n", "train | \n", "
1 | \n", "0.08 | \n", "1.00 | \n", "train | \n", "
2 | \n", "0.16 | \n", "0.99 | \n", "train | \n", "
3 | \n", "0.24 | \n", "0.97 | \n", "train | \n", "
4 | \n", "0.33 | \n", "0.95 | \n", "train | \n", "