{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "CoofhZcDtp6R"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Ноутбук для решения задач из 4 модуля\n",
        "\n"
      ],
      "metadata": {
        "id": "4uF_KywH0yV9"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Yk-K_mHMs7ns"
      },
      "outputs": [],
      "source": [
        "!pip install langchain langchain-community huggingface_hub langchain-openai openai faiss-gpu langchainhub -q"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Если используете ключ от OpenAI, запустите эту ячейку 👇\n",
        "\n",
        "\n",
        "\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "CoofhZcDtp6R"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain_openai import ChatOpenAI\n",
        "import os\n",
        "from getpass import getpass\n",
        "\n",
        "\n",
        "# os.environ['OPENAI_API_KEY'] = \"Введите ваш OpenAI API ключ\"\n",
        "os.environ['OPENAI_API_KEY'] = getpass(prompt='Введите ваш OpenAI API ключ')\n",
        "\n",
        "# Инициализируем языковую модель\n",
        "llm = ChatOpenAI(temperature=0.0)"
      ],
      "metadata": {
        "id": "k48wWXnPtKa2"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Если используете ключ из курса, запустите эти ячейки 👇\n"
      ],
      "metadata": {
        "id": "DiQqMwQJt8rP"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!wget https://raw.githubusercontent.com/a-milenkin/LLM_practical_course/main/notebooks/utils.py"
      ],
      "metadata": {
        "id": "Keq0WAjLuPgJ",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "24dbc92d-9a45-4418-fd83-0d5773f7d4cc"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2024-05-08 19:19:05--  https://raw.githubusercontent.com/a-milenkin/LLM_practical_course/main/notebooks/utils.py\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 11184 (11K) [text/plain]\n",
            "Saving to: ‘utils.py.1’\n",
            "\n",
            "\rutils.py.1            0%[                    ]       0  --.-KB/s               \rutils.py.1          100%[===================>]  10.92K  --.-KB/s    in 0s      \n",
            "\n",
            "2024-05-08 19:19:05 (87.8 MB/s) - ‘utils.py.1’ saved [11184/11184]\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from utils import ChatOpenAI\n",
        "from getpass import getpass\n",
        "\n",
        "#course_api_key= \"Введите ваш API ключ с курса\"\n",
        "course_api_key = getpass(prompt='Введите API ключ')\n",
        "\n",
        "# Инициализируем языковую модель\n",
        "llm = ChatOpenAI(temperature=0.0, course_api_key=course_api_key)"
      ],
      "metadata": {
        "id": "PCS_WedHtPRg",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "301d145c-b728-463c-8210-e0ac23e74f8e"
      },
      "execution_count": null,
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Введите API ключ··········\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Задание из стэпа 6.🪛 Openai tools Agent + RAG 🦞"
      ],
      "metadata": {
        "id": "Z9uleJTbu6Ui"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Распарсив веб страницу, создадим базу знаний"
      ],
      "metadata": {
        "id": "6F85KuvzBbFr"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.document_loaders import WebBaseLoader"
      ],
      "metadata": {
        "id": "92DVpeI4lCW3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "loader = WebBaseLoader(\"https://ru.wikipedia.org/wiki/%D0%93%D0%B0%D0%BD%D0%BD%D0%B8%D0%B1%D0%B0%D0%BB,_%D0%90%D0%B1%D1%80%D0%B0%D0%BC_%D0%9F%D0%B5%D1%82%D1%80%D0%BE%D0%B2%D0%B8%D1%87\")\n",
        "data = loader.load()"
      ],
      "metadata": {
        "id": "C7gtNBhuS5iW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Засплитим данные, сформируем эмбединги и создадим ретривер\n"
      ],
      "metadata": {
        "id": "LZeBiibpBh0D"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain_community.vectorstores import FAISS\n",
        "from langchain_text_splitters import CharacterTextSplitter\n",
        "from utils import OpenAIEmbeddings\n",
        "\n",
        "text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=100)\n",
        "texts = text_splitter.split_documents(data)\n",
        "embeddings = OpenAIEmbeddings(course_api_key=course_api_key)\n",
        "db_embed = FAISS.from_documents(texts, embeddings)\n",
        "\n",
        "retriever = db_embed.as_retriever()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jWVZRh84S5cr",
        "outputId": "dd94fa69-9ccc-4b15-b275-f230ce086ebe"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "WARNING:langchain_text_splitters.base:Created a chunk of size 3163, which is longer than the specified 2000\n",
            "WARNING:langchain_text_splitters.base:Created a chunk of size 2589, which is longer than the specified 2000\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Инструменты для агента"
      ],
      "metadata": {
        "id": "bD7rA6g9BkgZ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.tools.retriever import create_retriever_tool\n",
        "\n",
        "tool = create_retriever_tool(\n",
        "    retriever, # наш ретривер\n",
        "    \"search_web\", # имя инструмента\n",
        "    \"Searches and returns data from page\", # описание инструмента подается в ЛЛМ\n",
        ")\n",
        "tools = [tool]"
      ],
      "metadata": {
        "id": "KB2nJbhrS5aM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain import hub\n",
        "\n",
        "prompt = hub.pull(\"hwchase17/openai-tools-agent\")\n",
        "\n",
        "prompt"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "E3sQK4z_S5Xm",
        "outputId": "c48c4db6-4783-4809-eac4-fbd33a5e2b84"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]], 'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, metadata={'lc_hub_owner': 'hwchase17', 'lc_hub_repo': 'openai-tools-agent', 'lc_hub_commit_hash': 'c18672812789a3b9697656dd539edf0120285dcae36396d0b548ae42a4ed66f5'}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')), MessagesPlaceholder(variable_name='chat_history', optional=True), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')), MessagesPlaceholder(variable_name='agent_scratchpad')])"
            ]
          },
          "metadata": {},
          "execution_count": 138
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain_core.prompts.prompt import PromptTemplate\n",
        "from langchain_core.prompts.chat import MessagesPlaceholder\n",
        "from langchain_core.prompts.chat import HumanMessagePromptTemplate"
      ],
      "metadata": {
        "id": "y8bOXXRyuWsH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "prompt.messages = [SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='brief short answer, very important: less than 5 words in final answer')), MessagesPlaceholder(variable_name='chat_history', optional=True), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')), MessagesPlaceholder(variable_name='agent_scratchpad')]"
      ],
      "metadata": {
        "id": "Sqpq7jc2uDBW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "prompt"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zILnFVfuutFu",
        "outputId": "07225c79-7aaf-43dd-bc98-147a524fb234"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]], 'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, metadata={'lc_hub_owner': 'hwchase17', 'lc_hub_repo': 'openai-tools-agent', 'lc_hub_commit_hash': 'c18672812789a3b9697656dd539edf0120285dcae36396d0b548ae42a4ed66f5'}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='brief short answer, very important: less than 5 words in final answer')), MessagesPlaceholder(variable_name='chat_history', optional=True), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')), MessagesPlaceholder(variable_name='agent_scratchpad')])"
            ]
          },
          "metadata": {},
          "execution_count": 206
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Подключаем агента"
      ],
      "metadata": {
        "id": "XyKK6v0HBo2Q"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.agents import create_openai_tools_agent\n",
        "from langchain.agents.agent import AgentExecutor\n",
        "\n",
        "agent = create_openai_tools_agent(llm, tools, prompt)\n",
        "agent_executor = AgentExecutor(agent=agent, tools=tools)"
      ],
      "metadata": {
        "id": "nNVoBzwIS5VA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "from tqdm import tqdm"
      ],
      "metadata": {
        "id": "wqKxDBhRdzr-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Загружаем датасет с вопросами про Ганнибала\n",
        "df = pd.read_csv(\"https://stepik.org/media/attachments/lesson/1107866/gannibal.csv\")"
      ],
      "metadata": {
        "id": "a7VFhJoNepCy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "agent_executor"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nrrRBuM_r378",
        "outputId": "3d823660-3fcc-4d4a-bb62-4c73533ccda6"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "AgentExecutor(agent=RunnableMultiActionAgent(runnable=RunnableAssign(mapper={\n",
              "  agent_scratchpad: RunnableLambda(lambda x: format_to_openai_tool_messages(x['intermediate_steps']))\n",
              "})\n",
              "| ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]], 'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, metadata={'lc_hub_owner': 'hwchase17', 'lc_hub_repo': 'openai-tools-agent', 'lc_hub_commit_hash': 'c18672812789a3b9697656dd539edf0120285dcae36396d0b548ae42a4ed66f5'}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='meow')), MessagesPlaceholder(variable_name='chat_history', optional=True), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')), MessagesPlaceholder(variable_name='agent_scratchpad')])\n",
              "| RunnableBinding(bound=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7f912ed22470>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7f912ee40580>, temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy=''), kwargs={'tools': [{'type': 'function', 'function': {'name': 'search_web', 'description': 'Searches and returns data from page', 'parameters': {'type': 'object', 'properties': {'query': {'description': 'query to look up in retriever', 'type': 'string'}}, 'required': ['query']}}}]})\n",
              "| OpenAIToolsAgentOutputParser(), input_keys_arg=[], return_keys_arg=[], stream_runnable=True), tools=[Tool(name='search_web', description='Searches and returns data from page', args_schema=<class 'langchain_core.tools.RetrieverInput'>, func=functools.partial(<function _get_relevant_documents at 0x7f915185ae60>, retriever=VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f912e4a0340>), document_prompt=PromptTemplate(input_variables=['page_content'], template='{page_content}'), document_separator='\\n\\n'), coroutine=functools.partial(<function _aget_relevant_documents at 0x7f915185b010>, retriever=VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f912e4a0340>), document_prompt=PromptTemplate(input_variables=['page_content'], template='{page_content}'), document_separator='\\n\\n'))])"
            ]
          },
          "metadata": {},
          "execution_count": 189
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "answers = [] # Список, где будем хранить ответы модели\n",
        "\n",
        "for text_input in tqdm(df['question']):\n",
        "    answer = agent_executor.invoke({'input': text_input})\n",
        "    answers.append(answer['output']) # Добавляем ответ в список\n",
        "    #break # Для отладки. Уберите, когда убедитесь, что на одном примере работает"
      ],
      "metadata": {
        "id": "z5b0Ii9kvBx2",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "c6fe263a-e757-423f-de8b-7dd8a849eea8"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 10/10 [00:27<00:00,  2.72s/it]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "answers"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "EVXglNw2fwGd",
        "outputId": "8bc8bb7b-a23a-43b6-c4ac-a4979b1460bb"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['Евдокия Андреевна Диопер',\n",
              " 'Да',\n",
              " '84 года',\n",
              " '1727',\n",
              " 'В голову.',\n",
              " '100 рублей в год.',\n",
              " 'Со Швецией.',\n",
              " 'генерал-аншеф',\n",
              " 'Христина-Регина фон Шёберг',\n",
              " '4 детей']"
            ]
          },
          "metadata": {},
          "execution_count": 209
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "df['answer'] = answers # Создаём новый столбец из ответов модели"
      ],
      "metadata": {
        "id": "ks63JYu1vBpt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df.to_csv('step6_solution.csv', index=False) # Сохраняем файл, отправляем на Stepik, получаем баллы :)"
      ],
      "metadata": {
        "id": "FvJ8XyUQwrUK"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}