{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "## Решение дает 12 баллов из 15 при отправке в систему проверки stepik.\n",
        "\n"
      ],
      "metadata": {
        "id": "v_eGuOtAv_9G"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Yk-K_mHMs7ns",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "503662e2-b124-4f83-9c61-80ff1a412d8f"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.3/268.3 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m817.7/817.7 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m46.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.3/163.3 kB\u001b[0m \u001b[31m13.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.0/27.0 MB\u001b[0m \u001b[31m28.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m287.5/287.5 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m113.0/113.0 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m926.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m144.8/144.8 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        }
      ],
      "source": [
        "!pip install openai langchain tiktoken langchain-openai langchain-community sentence_transformers faiss-cpu -q"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!wget https://raw.githubusercontent.com/a-milenkin/LLM_practical_course/main/notebooks/utils.py"
      ],
      "metadata": {
        "id": "Keq0WAjLuPgJ",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "8fb2fde7-0152-4fc4-f52c-4406df5e548c"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2024-04-15 15:41:31--  https://raw.githubusercontent.com/a-milenkin/LLM_practical_course/main/notebooks/utils.py\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 10823 (11K) [text/plain]\n",
            "Saving to: ‘utils.py’\n",
            "\n",
            "\rutils.py              0%[                    ]       0  --.-KB/s               \rutils.py            100%[===================>]  10.57K  --.-KB/s    in 0s      \n",
            "\n",
            "2024-04-15 15:41:31 (65.0 MB/s) - ‘utils.py’ saved [10823/10823]\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from utils import ChatOpenAI\n",
        "from getpass import getpass\n",
        "\n",
        "#course_api_key= \"Введите ваш API ключ с курса\"\n",
        "course_api_key = getpass(prompt='API key')\n",
        "\n",
        "# Инициализируем языковую модель\n",
        "llm = ChatOpenAI(temperature=0.0, course_api_key=course_api_key)"
      ],
      "metadata": {
        "id": "PCS_WedHtPRg",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "db7db222-e7a3-408b-adf6-6702dfa05e46"
      },
      "execution_count": null,
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "API key··········\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Загружаем файлы"
      ],
      "metadata": {
        "id": "0eRICp2Lq8s-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!wget https://stepik.org/media/attachments/lesson/1084288/The_Daughter_of_The_Commandant.pdf"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VWFprDaxqy9Y",
        "outputId": "cb74ccae-a601-40c8-f7fa-2169886236a5"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2024-04-15 15:46:10--  https://stepik.org/media/attachments/lesson/1084288/The_Daughter_of_The_Commandant.pdf\n",
            "Resolving stepik.org (stepik.org)... 178.248.239.111\n",
            "Connecting to stepik.org (stepik.org)|178.248.239.111|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 198595 (194K) [application/pdf]\n",
            "Saving to: ‘The_Daughter_of_The_Commandant.pdf’\n",
            "\n",
            "The_Daughter_of_The 100%[===================>] 193.94K   444KB/s    in 0.4s    \n",
            "\n",
            "2024-04-15 15:46:12 (444 KB/s) - ‘The_Daughter_of_The_Commandant.pdf’ saved [198595/198595]\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "from tqdm import tqdm\n",
        "\n",
        "df = pd.read_csv('https://stepik.org/media/attachments/lesson/1084288/pushkin_questions.csv')"
      ],
      "metadata": {
        "id": "dBAX9Ldxq3uC"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Соберем все компоненты для RAG"
      ],
      "metadata": {
        "id": "gjrto68msT-A"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "\n",
        "<center> <img src='https://github.com/a-milenkin/LLM_practical_course/blob/main/images/RAG.png?raw=1' width=\"800\" height=\"250\">\n"
      ],
      "metadata": {
        "id": "zDg0bWUhr3dz"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install pypdf -q"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "uKIFOOMF4dpR",
        "outputId": "dbef3bd2-719d-46b4-e1af-df1dcd931e63"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/290.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[91m━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/290.4 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.9/290.4 kB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m286.7/290.4 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.4/290.4 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Определите Document loader для pdf\n",
        "from langchain_community.document_loaders import PyPDFLoader\n",
        "\n",
        "loader = PyPDFLoader(\"/content/The_Daughter_of_The_Commandant.pdf\")\n",
        "docs = loader.load() # метод load сразу разбивает текст на документы по страницам"
      ],
      "metadata": {
        "id": "WOwhANxRr3Ik"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Следующие две ячейки можно пропустить, если нас устраивает сплит в методе load"
      ],
      "metadata": {
        "id": "WmsW0czZYBV8"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "docs = [docs[i].page_content for i in range(len(docs))] # собираем текст, если хотим использовать свой сплитер"
      ],
      "metadata": {
        "id": "P5d-XyjimVxb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Определите подходящий Splitter\n",
        "from langchain.text_splitter import (\n",
        "    CharacterTextSplitter,\n",
        "    RecursiveCharacterTextSplitter,\n",
        ")\n",
        "\n",
        "splitter = CharacterTextSplitter(\n",
        "    separator=\"\\n\\n\",  # символ-разделитель, по умолчанию переход к новому абзацу '\\n\\n'\n",
        "    chunk_size=500,  # размер документа в символах\n",
        "    chunk_overlap=100,  # насколько соседние документы могут перекрывать друг-друга\n",
        "    length_function=len,  # функция, по которой считается размер документа\n",
        "    is_separator_regex=False,  # является ли разделитель регулярным выражением\n",
        ")\n",
        "\n",
        "\n",
        "split_documents = splitter.create_documents(docs)\n",
        "len(split_documents)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "g8kNIzR8h3i-",
        "outputId": "d8c6a59b-e0e3-4a45-dd4c-988f077705f7"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "4"
            ]
          },
          "metadata": {},
          "execution_count": 38
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Выбираем Embedding models"
      ],
      "metadata": {
        "id": "IClLxZZytsQ0"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 🔱 Эмбеддинги от OpenAI (API)"
      ],
      "metadata": {
        "id": "QNFHkIi9t4JE"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Если используете ключ курса, запустите эту ячейку\n",
        "from utils import OpenAIEmbeddings\n",
        "\n",
        "embeddings_api_model = OpenAIEmbeddings(course_api_key=course_api_key)"
      ],
      "metadata": {
        "id": "a3loV8ySuB7e"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Выбираем 🗂 Vector Store"
      ],
      "metadata": {
        "id": "mxBZ5FBMut1j"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.vectorstores import FAISS\n",
        "\n",
        "embeddings = OpenAIEmbeddings(course_api_key=course_api_key)\n",
        "db = FAISS.from_documents(\n",
        "    split_documents[:4], embeddings\n",
        ")\n",
        "\n",
        "db.save_local(\"faiss_db\")  # можно сохранить базу локально, указав путь"
      ],
      "metadata": {
        "id": "hRKdQT8QuPPT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Определим 🎣 Retriever"
      ],
      "metadata": {
        "id": "bHeBYyV-vPmu"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Самый частый кейс - использование векторного хранилища и его методов для получения документов\n",
        "retriever = db.as_retriever(\n",
        "    search_type=\"similarity\",  # тип поиска похожих документов\n",
        "    k=4,  # количество возвращаемых документов (Default: 4)\n",
        "    score_threshold=None,  # минимальный порог для поиска \"similarity_score_threshold\"\n",
        ")"
      ],
      "metadata": {
        "id": "pcljBDDBvB2s"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 🚰 RAG Pipeline - подключаем RAG к LLM"
      ],
      "metadata": {
        "id": "YzCDWTNdvy_J"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.schema import StrOutputParser\n",
        "from langchain_core.prompts import ChatPromptTemplate\n",
        "from langchain_core.runnables import RunnablePassthrough\n",
        "\n",
        "# Создаём простой шаблон\n",
        "template = \"\"\"\n",
        "Answer the question based only on the following context:\n",
        "\n",
        "{context}\n",
        "\n",
        "Question: {question}\n",
        "\"\"\"\n",
        "# Создаём промпт из шаблона\n",
        "prompt = ChatPromptTemplate.from_template(template)\n",
        "\n",
        "\n",
        "# Объявляем функцию, которая будет собирать строку из полученных документов\n",
        "def format_docs(docs):\n",
        "    return \"\\n\\n\".join([d.page_content for d in docs])\n",
        "\n",
        "\n",
        "# Создаём цепочку\n",
        "chain = (\n",
        "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
        "    | prompt\n",
        "    | llm\n",
        "    | StrOutputParser()\n",
        ")"
      ],
      "metadata": {
        "id": "119g8AIqvB0x"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "answers = [] # Список, где будем хранить ответы модели\n",
        "\n",
        "for text_input in tqdm(df['question']):\n",
        "    answer = chain.invoke(text_input)\n",
        "    answers.append(answer) # Добавляем ответ в список\n",
        "    break # Для отладки. Уберите, когда убедитесь, что на одном примере работает"
      ],
      "metadata": {
        "id": "z5b0Ii9kvBx2",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "5a0e6f74-ecda-4100-9594-e3f87d2e3e1b"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "  0%|          | 0/15 [00:02<?, ?it/s]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "df['answer'] = answers # Создаём новый столбец из ответов модели"
      ],
      "metadata": {
        "id": "ks63JYu1vBpt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df.to_csv('4_1_11_solution.csv', index=False) # Сохраняем файл, отправляем на Stepik, получаем баллы :)"
      ],
      "metadata": {
        "id": "FvJ8XyUQwrUK"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}