Commit 7cfbf065 authored by Brian McMahan

the big push

parent 1bc1acd1
FROM jupyter/minimal-notebook:4cdbc9cdb7d1
# Set the working directory
WORKDIR /home/jovyan/
# Modules
COPY requirements.txt /home/jovyan/requirements.txt
RUN pip install -r /home/jovyan/requirements.txt
RUN pip install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp37-cp37m-linux_x86_64.whl
RUN pip install torchvision
RUN jupyter nbextension enable --py widgetsnbextension
# Add files
COPY notebooks /home/jovyan/notebooks
COPY day_1 /home/jovyan/day_1
COPY day_2 /home/jovyan/day_2
COPY data /home/jovyan/data
COPY solutions /home/jovyan/solutions
COPY modelzoo /home/jovyan/modelzoo
# Allow user to write to directory
USER root
# O'Reilly Artificial Intelligence Conference

![Powered by Jupyter Logo](https://cdn.oreillystatic.com/images/icons/powered_by_jupyter.png)

## Natural Language Processing with Deep Learning training

#### Delip Rao, Brian McMahan

This project contains the Jupyter Notebooks and the associated Dockerfile for the _Natural Language Processing with Deep Learning_ training. It contains both the exercises (/notebooks) and the solutions (/solutions), as well as any data or files needed (/data).

This is a public repository, so there is no need to create an account to download its contents. To download the source code from this page, click the 'Cloud' icon at the top right, above where the latest commit is detailed.

To download via git from your preferred terminal application, type:

```git clone https://resources.oreilly.com/live-training/${COMPLETE URL}```
## Running Jupyter Locally via Docker

We've shared the same Dockerfile we use for our JupyterHub session in this repository so that you can run all of these notebooks in your own time, on your own machine. This isn't required during class, but it can be useful for continued learning once the session is over. The data ships with the repository, so there is no need to download it separately.

You will need Docker installed on your system to create images and run containers. You can find the installation steps for all platforms on Docker's [website](https://docs.docker.com/install/).

1) Clone the repository for the class, either using the UI or your terminal (see above).
2) Once you have Docker installed, type the following in your terminal from the root directory that contains the Dockerfile to create a Docker image: `docker build -t NAME .` (replace `NAME`, here and in the next step, with whatever you want to call the image; note the trailing period).
3) Creating the Docker image will take a little while, but once it completes, you can run your server with: `docker run -p LOCALPORT:8888 NAME` (replace `LOCALPORT` with whatever local port you'd like, e.g. 8888).
4) Head to `localhost:LOCALPORT` in your browser and you will be able to access the Jupyter Notebooks.

For example:

```
docker build -t dl4nlp .
docker run -p 8888:8888 -d dl4nlp
```

## Running Jupyter Without Docker

If you are running outside of Docker, the required Python packages are listed in `requirements.txt`, except for PyTorch; follow the installation instructions on the PyTorch website (a rough sketch of these install commands is shown at the bottom of this file). Once everything is installed, run the following from the root directory (where you can see day_1, day_2, etc.):

```
jupyter notebook --notebook-dir=$(pwd)
```

Also visit [dl4nlp.info](http://dl4nlp.info/en/latest/) for more information about the current training session and [nlproc.info](http://nlproc.info/) for more resources from us!
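The following is a minimal sketch of that non-Docker setup, assuming a CPU-only machine; the exact PyTorch command depends on your platform, so prefer the instructions on the PyTorch website:

```
# from the repository root: install the course requirements
pip install -r requirements.txt

# install PyTorch and torchvision; the Dockerfile above pins a CPU-only torch
# wheel, but use whichever build matches your platform
pip install torch torchvision
```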
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from annoy import AnnoyIndex\n",
"import numpy as np\n",
"import torch\n",
"from tqdm import tqdm_notebook\n",
"from argparse import Namespace"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"args = Namespace(\n",
" glove_filename='../data/glove.6B.100d.txt'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_word_vectors(filename):\n",
" word_to_index = {}\n",
" word_vectors = []\n",
" \n",
" with open(filename) as fp:\n",
" for line in tqdm_notebook(fp.readlines(), leave=False):\n",
" line = line.split(\" \")\n",
" \n",
" word = line[0]\n",
" word_to_index[word] = len(word_to_index)\n",
" \n",
" vec = np.array([float(x) for x in line[1:]])\n",
" word_vectors.append(vec)\n",
" \n",
" return word_to_index, word_vectors"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"class PreTrainedEmbeddings(object):\n",
" def __init__(self, glove_filename):\n",
" self.word_to_index, self.word_vectors = load_word_vectors(glove_filename)\n",
" self.word_vector_size = len(self.word_vectors[0])\n",
" \n",
" self.index_to_word = {v: k for k, v in self.word_to_index.items()}\n",
" self.index = AnnoyIndex(self.word_vector_size, metric='euclidean')\n",
" print('Building Index')\n",
" for _, i in tqdm_notebook(self.word_to_index.items(), leave=False):\n",
" self.index.add_item(i, self.word_vectors[i])\n",
" self.index.build(50)\n",
" print('Finished!')\n",
" \n",
" def get_embedding(self, word):\n",
" return self.word_vectors[self.word_to_index[word]]\n",
" \n",
" def closest(self, word, n=1):\n",
" vector = self.get_embedding(word)\n",
" nn_indices = self.index.get_nns_by_vector(vector, n)\n",
" return [self.index_to_word[neighbor] for neighbor in nn_indices]\n",
" \n",
" def closest_v(self, vector, n=1):\n",
" nn_indices = self.index.get_nns_by_vector(vector, n)\n",
" return [self.index_to_word[neighbor] for neighbor in nn_indices]\n",
" \n",
" def sim(self, w1, w2):\n",
" return np.dot(self.get_embedding(w1), self.get_embedding(w2))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Building Index\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished!\n"
]
}
],
"source": [
"glove = PreTrainedEmbeddings(args.glove_filename)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['apple', 'microsoft', 'dell', 'pc', 'compaq']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glove.closest('apple', n=5)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['plane', 'airplane', 'jet', 'flight', 'crashed']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glove.closest('plane', n=5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(26.873448266652, 16.501491855324)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glove.sim('beer', 'wine'), glove.sim('beer', 'gasoline')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Lexical relationships uncovered by word embeddings **"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def SAT_analogy(w1, w2, w3):\n",
" '''\n",
" Solves problems of the type:\n",
" w1 : w2 :: w3 : __\n",
" '''\n",
" closest_words = []\n",
" try:\n",
" w1v = glove.get_embedding(w1)\n",
" w2v = glove.get_embedding(w2)\n",
" w3v = glove.get_embedding(w3)\n",
" w4v = w3v + (w2v - w1v)\n",
" closest_words = glove.closest_v(w4v, n=5)\n",
" closest_words = [w for w in closest_words if w not in [w1, w2, w3]]\n",
" except:\n",
" pass\n",
" if len(closest_words) == 0:\n",
" print(':-(')\n",
" else:\n",
" the_closest_word = closest_words[0]\n",
" print('{} : {} :: {} : {}'.format(w1, w2, w3, the_closest_word))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Pronouns**"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"man : he :: woman : she\n"
]
}
],
"source": [
"SAT_analogy('man', 'he', 'woman')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Verb-Noun relationships **"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fly : plane :: sail : ship\n"
]
}
],
"source": [
"SAT_analogy('fly', 'plane', 'sail')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Noun-Noun relationships**"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cat : kitten :: dog : pug\n"
]
}
],
"source": [
"SAT_analogy('cat', 'kitten', 'dog')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"human : baby :: dog : puppy\n"
]
}
],
"source": [
"SAT_analogy('human', 'baby', 'dog')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"human : babies :: dog : puppies\n"
]
}
],
"source": [
"SAT_analogy('human', 'babies', 'dog')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Hypernymy**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"blue : color :: dog : animal\n"
]
}
],
"source": [
"SAT_analogy('blue', 'color', 'dog')"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"**Meronymy**"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"leg : legs :: hand : hands\n"
]
}
],
"source": [
"SAT_analogy('leg', 'legs', 'hand')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Troponymy**"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"talk : communicate :: read : correctly\n"
]
}
],
"source": [
"SAT_analogy('talk', 'communicate', 'read')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Metonymy**"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"blue : democrat :: red : republican\n"
]
}
],
"source": [
"SAT_analogy('blue', 'democrat', 'red')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Misc**"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"man : doctor :: woman : nurse\n"
]
}
],
"source": [
"SAT_analogy('man', 'doctor', 'woman')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"man : leader :: woman : opposition\n"
]
}
],
"source": [
"SAT_analogy('man', 'leader', 'woman')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "magis",
"language": "python",
"name": "magis"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
from collections import Counter
import numpy as np
from torch.utils.data import Dataset
import six
import json