Commit c8ff14f7 by Brian McMahan

added nltk data; bert; updated day 1 MLP code to include validation sets and…

added nltk data; bert; updated day 1 MLP code to include validation sets and reproducibility; fixed CYOA-Amazon-Reviews; added BiRNN
parent dcec0acc
......@@ -6,8 +6,6 @@ WORKDIR /home/jovyan/
# Modules
COPY requirements.txt /home/jovyan/requirements.txt
RUN pip install -r /home/jovyan/requirements.txt
RUN pip install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp37-cp37m-linux_x86_64.whl
RUN pip install torchvision
RUN jupyter nbextension enable --py widgetsnbextension
......
<package id='maxent_ne_chunker'
name='ACE Named Entity Chunker (Maximum entropy)'
languages="English"
unzip="1"
/>
<package id="abc" name="Australian Broadcasting Commission 2006"
webpage="http://www.abc.net.au/" author="Australian Broadcasting Commission"
unzip="1"
/>
<package id="alpino" name="Alpino Dutch Treebank"
webpage="http://www.let.rug.nl/~vannoord/trees/"
contact="Gertjan van Noord"
license="Distributed with permission of Gertjan van Noord"
unzip="1"
/>
<package id="biocreative_ppi"
name="BioCreAtIvE (Critical Assessment of Information Extraction Systems in Biology)"
webpage="http://www.mitre.org/public/biocreative/"
copyright="Public Domain (not copyrighted)"
license="Public Domain"
unzip="1"
/>
<package id="brown" name="Brown Corpus"
author="W. N. Francis and H. Kucera"
license="May be used for non-commercial purposes."
webpage="http://www.hit.uib.no/icame/brown/bcm.html"
unzip="1"
/>
<package id="brown_tei" name="Brown Corpus (TEI XML Version)"
author="W. N. Francis and H. Kucera"
license="May be used for non-commercial purposes."
webpage="http://www.hit.uib.no/icame/brown/bcm.html"
contact="Lou Burnard -- lou.burnard@oucs.ox.ac.uk"
unzip="1"
/>
<package id="cess_cat" name="CESS-CAT Treebank"
webpage="http://clic.ub.edu/cessece/"
license="If you use these corpora for research, please cite thusly: CESS-Cat project (M. Antonia Martí, Mariona Taulé, Lluís Márquez, Manuel Bertran (2007) “CESS-ECE: A Multilingual and Multilevel Annotated Corpus” in http://www.lsi.upc.edu/~mbertran/cess-ece/publications)."
unzip="1"
/>
<package id="cess_esp" name="CESS-ESP Treebank"
webpage="http://clic.ub.edu/cessece/"
license="If you use these corpora for research, please cite thusly: CESS-Cat project (M. Antonia Martí, Mariona Taulé, Lluís Márquez, Manuel Bertran (2007) “CESS-ECE: A Multilingual and Multilevel Annotated Corpus” in http://www.lsi.upc.edu/~mbertran/cess-ece/publications)."
unzip="1"
/>
<package id="chat80" name="Chat-80 Data Files"
copyright="Copyright (C) 1982 David Warren and Fernando Pereira"
license="This program may be used, copied, altered or included in other programs only for academic purposes and provided that the authorship of the initial program is acknowledged. Use for commercial purposes without the previous written agreement of the authors is forbidden."
author="David Warren and Fernando Pereira"
webpage="http://www.cis.upenn.edu/~pereira/oldies.html"
unzip="1"
/>
<package id="city_database"
name="City Database"
note="A very small database of information about cities"
unzip="1"
/>
<package id="cmudict"
name="The Carnegie Mellon Pronouncing Dictionary (0.6)"
webpage="ftp://ftp.cs.cmu.edu/project/speech/dict/"
copyright="Copyright 1998 Carnegie Mellon University"
license="Use of this dictionary, for any research or commercial purpose, is completely unrestricted. If you use or redistribute this material, we would appreciate acknowledgement of its origin."
unzip="1"
/>
<package id="comparative_sentences"
name="Comparative Sentence Dataset"
copyright="Copyright (C) 2006 Nitin Jindal and Bing Liu"
author="Nitin Jindal and Bing Liu"
license="Creative Commons Attribution 4.0 International"
licenseurl = "http://creativecommons.org/licenses/by/4.0/"
webpage="http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#datasets"
unzip="1"
/>
<package id="comtrans" name="ComTrans Corpus Sample"
author="Reinhard Rapp"
webpage="http://www.fask.uni-mainz.de/user/rapp/comtrans/"
unzip="0"
/>
<package id="conll2000" name="CONLL 2000 Chunking Corpus"
webpage="http://www.cnts.ua.ac.be/conll2000/chunking/"
contact="Erik Tjong Kim Sang (erikt@uia.ua.ac.be)"
unzip="1"
/>
<package id="conll2002" name="CONLL 2002 Named Entity Recognition Corpus"
webpage="http://www.cnts.ua.ac.be/conll2002/ner/"
unzip="1"
/>
<package id="conll2007" name="Dependency Treebanks from CoNLL 2007 (Catalan and Basque Subset)"
webpage="http://nextens.uvt.nl/depparse-wiki/DataDownload"
contact="Kepa Sarasola"
copyright="Copyright (C) 2007 The University of the Basque Country"
license="Creative Commons Attribution-NonCommercial-NoDerivativeWorks license"
unzip="0"
/>
<package id="crubadan" name="Crubadan Corpus"
copyright="Copyright (C) 2010 Kevin Scannell"
author="Kevin Scannell"
license="GPLv3"
webpage="http://borel.slu.edu/crubadan/"
unzip="1"
/>
<package id="dependency_treebank" name="Dependency Parsed Treebank"
sample="True"
copyright="Copyright (C) 1995 University of Pennsylvania"
license="This is a 10% fragment of Penn Treebank, (C) LDC 1995, which has been dependency parsed. It is made available under fair use for the purposes of illustrating NLTK tools for tokenizing, tagging, chunking and parsing. This data is for non-commercial use only."
unzip="1"
/>
<package id="dolch" name="Dolch Word List"
webpage="https://en.wikipedia.org/wiki/Dolch_word_list"
unzip="1"
/>
<package id="europarl_raw" name="Sample European Parliament Proceedings Parallel Corpus"
author="Philipp Koehn, University of Edinburgh"
webpage="http://www.statmt.org/europarl"
unzip="1"
/>
<package id="floresta" name="Portuguese Treebank"
license="Non-commercial use only"
webpage="http://www.linguateca.pt/Floresta/"
unzip="1"
/>
<package id="framenet_v15" name="FrameNet 1.5"
author="Collin F. Baker"
license="May be used for non-commercial purposes."
webpage="http://framenet.icsi.berkeley.edu"
unzip="1"
/>
<package id="framenet_v17" name="FrameNet 1.7"
author="Collin F. Baker"
license="Creative Commons Attribution 3.0 Unported License"
webpage="http://framenet.icsi.berkeley.edu"
unzip="1"
/>
<package id="gazetteers" name="Gazetteer Lists"
license="GNU Free Documentation License; or public domain (depending on the file)"
unzip="1"
/>
<package id="genesis" name="Genesis Corpus"
copyright="public domain"
license="public domain"
unzip="1"
/>
<package id="gutenberg" name="Project Gutenberg Selections"
webpage="http://gutenberg.net/"
license="public domain"
copyright="public domain"
unzip="1"
/>
<package id="ieer" name="NIST IE-ER DATA SAMPLE"
webpage="http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm"
unzip="1"
/>
<package id="inaugural" name="C-Span Inaugural Address Corpus"
copyright="public domain"
license="public domain"
unzip="1"
/>
<package id="indian" name="Indian Language POS-Tagged Corpus"
author="A Kumaran"
license="Distributed with permission"
unzip="1"
/>
<package id="jeita"
name="JEITA Public Morphologically Tagged Corpus (in ChaSen format)"
webpage="http://lilyx.net/pages/nltkjapanesecorpus.html"
license="Freely re-distributable under the same license as the original JEITA corpus. Each document retains its own license from Aozora bunko and Project Sugita Genpaku."
unzip="0" />
<package id="kimmo" name="PC-KIMMO Data Files"
webpage="http://www.sil.org/pckimmo/"
unzip="1"
/>
<package id="knbc" name="KNB Corpus (Annotated blog corpus)"
webpage="http://lilyx.net/pages/nltkjapanesecorpus.html"
license="Freely re-distributable under the same license as the original KNB Corpus."
unzip="0" />
<package id="lin_thesaurus"
name="Lin's Dependency Thesaurus"
author="Dekang Lin"
webpage="http://webdocs.cs.ualberta.ca/~lindek/downloads.htm"
license="Distributed with permission of Dekang Lin"
unzip="1"
/>
#nltk_directory,short_name,long_name,contents,derived_from,reader,licenser1,licenser2,see_also,contact_name,contact_email
#in nltk/corpora,for use in book,,,,in nltk/nltk/corpora,,for derived works,,,
abc,abc,Australian Broadcasting Commission 2006,Science News::Rural News,,,,,,,
biocreative_ppi,BioCreAtIvE-PPI,BioCreAtIvE Protein-Protein Interaction Corpus,BioCreatIve Task 1a,,,,NCBI,http://www2.informatik.hu-berlin.de/~hakenber/corpora/::http://www.mitre.org/public/biocreative/,,
brown,brown,Brown Corpus,,,brown.py,,,,,
chat80,chat80,Chat-80 Database,,,chat80.py,University of Pennsylvania,,http://www.cis.upenn.edu/~pereira/oldies.html,Fernando Pereira,
cmudict,cmudict,Carnegie Mellon Pronouncing Dictionary,,,cmudict.py,,,,,
conll2000,conll2000,CoNLL 2000 Chunking Corpus,,,conll2000.py,,,,,
conll2002,conll2002,CoNLL 2002 NER Corpus,Dutch::Spanish,,conll2002.py,,,,,
dolch,dolch,Dolch Word List,,,,,,,,
genesis,genesis,Genesis Corpus,,,genesis.py,,,,,
gutenberg,gutenberg,Project Gutenberg Selections,,,gutenberg.py,,,,,
ieer,ieer,NIST 1999 Information Extraction ,Entity Recognition Corpus,,ieer.py,,,,,
inaugural,inaugural,US Presidential Inaugural Address Corpus,,,inaugural.py,,,,,
indian,indian,Indian Language POS-Tagged Corpus,Bangla::Hindi::Marathi::Telugu,,indian.py,,,,,
kimmo,kimmo,,,,,,,,,
names,names,Names Corpus,,,names.py,,,
paradigms,paradigms,Paradigm Corpus,,,,,,
pil,pil,,,,,,,
ppattach,ppattach,PP Attachment Corpus,,IBM-Lancaster Treebank of Computer Manuals::Penn Treebank,ppattach.py,,,
problem_reports,problem_reports,,,,,,,
senseval,senseval,SENSEVAL 2 Corpus,,,senseval.py,,,
shakespeare,shakespeare,Shakespeare XML Corpus Sample,,,shakespeare.py,,,
sinica_treebank,sinica_treebank,Sinica Treebank Corpus Sample,,,sinica_treebank.py,Academia Sinica,,
state_union,state_union,US Presidential State of the Union Address Corpus,,,state_union.py,,,
stopwords,stopwords,Stopwords Corpus,,,stopwords.py,,,
switchboard,switchboard,,,,,,,
timit,timit,TIMIT Corpus Sample,,,timit.py,,,
toolbox,toolbox,Toolbox Data Samples,,,toolbox.py,,,
treebank,Penn Treebank,Penn Treebank Corpus Sample,,Wall Street Journal,treebank.py,LDC,,http://www.cis.upenn.edu/~treebank/
udhr,udhr,Universal Declaration of Human Rights Corpus,,,udhr.py,,,
web,web,,overheard::wine::pirates::singles,,web.py,,,
wordnet,wordnet,Wordnet 3.0,,,,Princeton University,,http://www.cogsci.princeton.edu/~wn
words,words,Wordlist (English),,,words.py,,,
<package id="mac_morpho"
name="MAC-MORPHO: Brazilian Portuguese news text with part-of-speech tags"
webpage="http://www.nilc.icmc.usp.br/lacioweb/"
license="Distributed with permission of Núcleo Interinstitucional de Lingüística Computacional (NILC), Universidade de São Paulo (USP) in São Carlos, Universidade Federal de São Carlos (UFSCar), Universidade Estadual Paulista (UNESP) of Araraquara."
unzip="1"
/>
<package id="machado" name="Machado de Assis -- Obra Completa"
author="Machado de Assis"
license="Public Domain"
webpage="http://machado.mec.gov.br/"
unzip="0"
/>
<package id="masc_tagged" name="MASC Tagged Corpus"
copyright="Copyright (C) 2014 American National Corpus"
author="Nancy Ide"
license="This data may be used for the purposes of linguistic education, research, and development, including commercial development."
webpage="http://www.anc.org/"
unzip="0"
/>
<package id="movie_reviews"
name="Sentiment Polarity Dataset Version 2.0"
author="Bo Pang and Lillian Lee"
copyright="Copyright (C) 2004 Bo Pang and Lillian Lee"
webpage="http://www.cs.cornell.edu/people/pabo/movie-review-data/"
license="Creative Commons Attribution 4.0 International"
licenseurl = "http://creativecommons.org/licenses/by/4.0/"
unzip="1"
/>
<package id="mte_teip5" name="MULTEXT-East 1984 annotated corpus 4.0"
author="Erjavec, Tomaž; Barbu, Ana-Maria; Derzhanski, Ivan; Dimitrova, Ludmila; Garabík, Radovan; Ide, Nancy; Kaalep, Heiki-Jaan; Kotsyba, Natalia; Krstev, Cvetana; Oravecz, Csaba; Petkevič, Vladimír; Priest-Dorman, Greg; QasemiZadeh, Behrang; Radziszewski, Adam; Simov, Kiril; Tufiş, Dan and Zdravkova, Katerina"
license="Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)"
webpage="https://www.clarin.si/repository/xmlui/handle/11356/1043"
unzip="1"
/>
<package id="names" name="Names Corpus, Version 1.3 (1994-03-29)"
copyright="Copyright (C) 1991 Mark Kantrowitz"
author="Mark Kantrowitz and Bill Ross"
license="You may use the lists of names for any purpose, so long as credit is given in any published work. You may also redistribute the list if you provide the recipients with a copy of this README file. The lists are not in the public domain (I retain the copyright on the lists) but are freely redistributable. If you have any additions to the lists of names, I would appreciate receiving them."
webpage="http://www-2.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/"
unzip="1"
/>
<package id="nombank.1.0" name="NomBank Corpus 1.0"
contact="Adam Meyers"
webpage="http://nlp.cs.nyu.edu/meyers/NomBank.html"
license="Distributed with permission"
unzip="0"
/>
<package id="nonbreaking_prefixes" name="Non-Breaking Prefixes (Moses Decoder)"
webpage="https://github.com/moses-smt/mosesdecoder/tree/master/scripts/share/nonbreaking_prefixes"
license="Gnu LGPL"
unzip="1"
/>
<package id="nps_chat" name="NPS Chat"
author="Craig Martell (cmartell@nps.edu)"
webpage="http://faculty.nps.edu/cmartell/NPSChat.htm"
license="This corpus is distributed solely for non-commercial, non-profit educational and research use. It is a derivative compilation work of multiple works whose copyrights are held by the respective original authors."
unzip="1"
/>
<package id="omw" name="Open Multilingual Wordnet"
author="Francis Bond"
license="Please consult the LICENSE files included with the individual Wordnets. Note that all permit redistribution."
copyright="Please consult the copyright statements of the individual Wordnets"
webpage="http://compling.hss.ntu.edu.sg/omw/"
unzip="1"
/>
<package id="opinion_lexicon" name="Opinion Lexicon"
author="Bing Liu"
copyright="Copyright (C) 2011 Bing Liu"
license="Creative Commons Attribution 4.0 International"
licenseurl = "http://creativecommons.org/licenses/by/4.0/"
webpage="http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#datasets"
unzip="1"
/>
<package id="panlex_swadesh" name="PanLex Swadesh Corpora"
author="Jonathan Pool (editor)"
license="CC0 1.0 Universal"
webpage="http://panlex.org/"
unzip="0"
/>
<package id="paradigms" name="Paradigm Corpus"
author="Cathy Bow, University of Melbourne"
license="Distributed with the permission of the author"
unzip="1"
/>
<package id="pe08" name="Cross-Framework and Cross-Domain Parser Evaluation Shared Task"
version="Release 3 (20 April 2008)"
webpage="http://www-tsujii.is.s.u-tokyo.ac.jp/pe08-st/"
license="Distributed with permission"
unzip="1"
/>
\ No newline at end of file
<package id="pil" name="The Patient Information Leaflet (PIL) Corpus"
version="Version 2.0 (31 March 2006)"
webpage="http://mcs.open.ac.uk/nlg/old_projects/pills/corpus/"
license="Distributed with permission"
unzip="1"
/>
<package id="pl196x" name="Polish language of the XX century sixties"
author="I. Kurcz, A. Lewicki, J. Sambor, K. Szafran, J. Woronczak"
license="GNU General Public License"
webpage="http://www.mimuw.edu.pl/polszczyzna/pl196x/index_en.htm"
unzip="1"
/>