Commit 7cfbf065 by Brian McMahan

the big push

parent 1bc1acd1
FROM jupyter/minimal-notebook:4cdbc9cdb7d1
# Set the working directory
WORKDIR /home/jovyan/
# Modules
COPY requirements.txt /home/jovyan/requirements.txt
RUN pip install -r /home/jovyan/requirements.txt
RUN pip install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp37-cp37m-linux_x86_64.whl
RUN pip install torchvision
RUN jupyter nbextension enable --py widgetsnbextension
# Add files
COPY notebooks /home/jovyan/notebooks
COPY day_1 /home/jovyan/day_1
COPY day_2 /home/jovyan/day_2
COPY data /home/jovyan/data
COPY solutions /home/jovyan/solutions
COPY modelzoo /home/jovyan/modelzoo
# Allow user to write to directory
USER root
# O'Reilly Artificial Intelligence Conference

![Powered by Jupyter Logo](https://cdn.oreillystatic.com/images/icons/powered_by_jupyter.png)

## Natural Language Processing with Deep Learning training
#### Delip Rao, Brian McMahan

This project contains the Jupyter Notebooks and the associated Dockerfile for ${INSTRUCTOR} _${TITLE}_. It contains both the exercises (/notebooks) and the solutions (/solutions), as well as any data or files needed (/data). The data ships with the repository, so there is no need to download it separately.

This is a public repository, so there is no need to create an account to download its contents. To download the source code from this page, click the 'Cloud' icon on the top right, above where the latest commit is detailed. To download via git from your preferred terminal application, type:

```git clone https://resources.oreilly.com/live-training/${COMPLETE URL}```

## Running Jupyter Locally via Docker

We've shared the same Dockerfile we use for our JupyterHub session in this repository, so that you can run all of these notebooks in your own time, on your own machine. This isn't required during class, but can be useful for continued learning once the session is over.

You will need Docker installed on your system to create images and run containers. You can find the installation steps for all platforms on the company's [website](https://docs.docker.com/install/).

1) Clone the repository for the class, either using the UI or your terminal (see above).
2) Once you have Docker installed, run the following from the root directory (the one containing the Dockerfile) to create a Docker image: `docker build -t NAME .` (replace `NAME`, here and in the next step, with whatever you want to call the image; note the trailing period).
3) Building the image takes a little while; once it completes, you can run your server with: `docker run -p 8888:8888 NAME`
4) Head to `localhost:8888` in your browser and you will be able to access the Jupyter Notebooks.

For example, with the image tagged `dl4nlp` (replace `LOCALPORT` with whatever local port you'd like):

```
docker build -t dl4nlp .
docker run -p LOCALPORT:8888 -d dl4nlp
```
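The Jupyter server inside the container requires a login token by default; if your browser asks for one, you can recover it from the container's logs. A quick sketch (`<container-id>` is a placeholder for the ID that `docker ps` reports):

```
docker ps                    # find the running container's ID
docker logs <container-id>   # the startup output includes a URL with ?token=...
```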
## Running Jupyter Without Docker

If you are running outside of Docker, you can do the following from the root directory (where you can see day_1, day_2, etc.), assuming all the necessary packages are installed:

```
jupyter notebook --notebook-dir=$(pwd)
```

Required Python packages are listed in `requirements.txt`, except for PyTorch; follow the installation instructions on the PyTorch website. Also visit [dl4nlp.info](http://dl4nlp.info/en/latest/) for more information about the current training session and [nlproc.info](http://nlproc.info/) for more resources from us!
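For a manual setup, here is a sketch that mirrors what the Dockerfile installs (the wheel below is the CPU build pinned in the Dockerfile; substitute the build matching your platform from the PyTorch website):

```
pip install -r requirements.txt
pip install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp37-cp37m-linux_x86_64.whl
pip install torchvision
```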
from collections import Counter
import json

import numpy as np
import six
from torch.utils.data import Dataset


class VocabularyException(Exception):
    """Raised when a Vocabulary lookup cannot be satisfied."""
    pass


class Vocabulary(object):
    """
    Manages a bidirectional mapping between tokens and integer indices,
    serving as the interface between a token dataset and the machine
    learning algorithm.
    """
    def __init__(self, use_unks=False, unk_token="<UNK>",
                 use_mask=False, mask_token="<MASK>", use_start_end=False,
                 start_token="<START>", end_token="<END>"):
        """
        Args:
            use_unks (bool): The vocabulary will output UNK tokens for out of
                vocabulary items.
                [default=False]
            unk_token (str): The token used for unknown tokens.
                If `use_unks` is True, this will be added to the vocabulary.
                [default='<UNK>']
            use_mask (bool): The vocabulary will reserve the 0th index for a mask token.
                This is used to handle variable lengths in sequence models.
                [default=False]
            mask_token (str): The token used for the mask.
                Note: mostly a placeholder; it's unlikely the token will be seen.
                [default='<MASK>']
            use_start_end (bool): The vocabulary will reserve indices for two tokens
                that represent the start and end of a sequence.
                [default=False]
            start_token (str): The token used to indicate the start of a sequence.
                If `use_start_end` is True, this will be added to the vocabulary.
                [default='<START>']
            end_token (str): The token used to indicate the end of a sequence.
                If `use_start_end` is True, this will be added to the vocabulary.
                [default='<END>']
        """
        self._mapping = {}  # str -> int
        self._flip = {}  # int -> str
        self._i = 0
        self._frozen = False
        # token counts and pruning threshold; the (de)serialization methods
        # below expect these attributes to exist
        self._counts = Counter()
        self._frequency_threshold = 0

        # mask token for use in masked recurrent networks
        # usually needs to be the 0th index
        self.use_mask = use_mask
        self.mask_token = mask_token
        if self.use_mask:
            self.add(self.mask_token)

        # unk token for out of vocabulary tokens
        self.use_unks = use_unks
        self.unk_token = unk_token
        if self.use_unks:
            self.add(self.unk_token)

        # start and end tokens for sequence models
        self.use_start_end = use_start_end
        self.start_token = start_token
        self.end_token = end_token
        if self.use_start_end:
            self.add(self.start_token)
            self.add(self.end_token)

    def iterkeys(self):
        for k in self._mapping.keys():
            if k == self.unk_token or k == self.mask_token:
                continue
            else:
                yield k

    def keys(self):
        return list(self.iterkeys())

    def iteritems(self):
        for key, value in self._mapping.items():
            if key == self.unk_token or key == self.mask_token:
                continue
            yield key, value

    def items(self):
        return list(self.iteritems())

    def values(self):
        return [value for _, value in self.iteritems()]

    def __getitem__(self, k):
        if self._frozen:
            if k in self._mapping:
                out_index = self._mapping[k]
            elif self.use_unks:
                out_index = self.unk_index
            else:  # case: frozen, don't want unks, raise exception
                raise VocabularyException("Vocabulary is frozen. " +
                                          "Key '{}' not found.".format(k))
        elif k in self._mapping:  # case: normal
            out_index = self._mapping[k]
        else:
            out_index = self._mapping[k] = self._i
            self._i += 1
            self._flip[out_index] = k
        return out_index

    def add(self, k):
        return self.__getitem__(k)

    def add_many(self, x):
        return [self.add(k) for k in x]

    def lookup(self, i):
        try:
            return self._flip[i]
        except KeyError:
            raise VocabularyException("Key {} not in Vocabulary".format(i))

    def lookup_many(self, x):
        for k in x:
            yield self.lookup(k)

    def map(self, sequence, include_start_end=False):
        if include_start_end:
            yield self.start_index
        for item in sequence:
            yield self[item]
        if include_start_end:
            yield self.end_index

    def freeze(self, use_unks=False):
        self.use_unks = use_unks
        if use_unks and self.unk_token not in self:
            self.add(self.unk_token)
        self._frozen = True

    def unfreeze(self):
        self._frozen = False

    @property
    def unk_index(self):
        if self.unk_token not in self:
            return None
        return self._mapping[self.unk_token]

    @property
    def mask_index(self):
        if self.mask_token not in self:
            return None
        return self._mapping[self.mask_token]

    @property
    def start_index(self):
        if self.start_token not in self:
            return None
        return self._mapping[self.start_token]

    @property
    def end_index(self):
        if self.end_token not in self:
            return None
        return self._mapping[self.end_token]

    def __contains__(self, k):
        return k in self._mapping

    def __len__(self):
        return len(self._mapping)

    def __repr__(self):
        return "<Vocabulary(size={},frozen={})>".format(len(self), self._frozen)

    def get_serializable_contents(self):
        """
        Creates a dict containing the necessary information to recreate this instance
        """
        config = {"_mapping": self._mapping,
                  "_flip": self._flip,
                  "_frozen": self._frozen,
                  "_i": self._i,
                  "_counts": list(self._counts.items()),
                  "_frequency_threshold": self._frequency_threshold,
                  "use_unks": self.use_unks,
                  "unk_token": self.unk_token,
                  "use_mask": self.use_mask,
                  "mask_token": self.mask_token,
                  "use_start_end": self.use_start_end,
                  "start_token": self.start_token,
                  "end_token": self.end_token}
        return config

    @classmethod
    def deserialize_from_contents(cls, content):
        """
        Recreate a Vocabulary instance; expects the dict produced by
        `get_serializable_contents`
        """
        try:
            _mapping = content.pop("_mapping")
            _flip = content.pop("_flip")
            _i = content.pop("_i")
            _frozen = content.pop("_frozen")
            _counts = content.pop("_counts")
            _frequency_threshold = content.pop("_frequency_threshold")
        except KeyError:
            raise VocabularyException("unable to deserialize vocabulary")
        # JSON serialization turns the int keys of _flip into strings;
        # convert them back
        if _flip and isinstance(list(_flip.keys())[0], six.string_types):
            _flip = {int(k): v for k, v in _flip.items()}
        out = cls(**content)
        out._mapping = _mapping
        out._flip = _flip
        out._i = _i
        out._counts = Counter(dict(_counts))
        out._frequency_threshold = _frequency_threshold
        if _frozen:
            out.freeze(out.use_unks)
        return out
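To make the class's moving parts concrete, here is a small usage sketch (hypothetical, not repository code; the toy corpus and variable names are illustrative). It builds a vocabulary, freezes it with UNKs enabled, maps a sequence with start/end markers, and round-trips the instance through JSON:

```
import json

corpus = [["the", "cat", "sat"], ["the", "dog", "ran"]]

vocab = Vocabulary(use_mask=True, use_start_end=True)
for sentence in corpus:
    vocab.add_many(sentence)

# freeze(use_unks=True) adds <UNK> so unseen tokens map to unk_index
vocab.freeze(use_unks=True)
indices = list(vocab.map(["the", "aardvark"], include_start_end=True))
# -> [start_index, index of 'the', unk_index, end_index]

# round-trip through JSON; _flip's int keys become strings under JSON,
# and deserialize_from_contents converts them back
blob = json.dumps(vocab.get_serializable_contents())
restored = Vocabulary.deserialize_from_contents(json.loads(blob))
assert list(restored.lookup_many(indices)) == ["<START>", "the", "<UNK>", "<END>"]
```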