diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a11ffffe6aa23396e71943b522dc17bbf975b5bb..2f2b2621d20ff9c89135bf94259f47d91f67c7d5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -5,6 +5,7 @@
 Course videos:

 + [Stanford CS224n Natural Language Processing with Deep Learning, 2019](https://www.bilibili.com/video/av46216519)
++ [Subtitles (BCC format)](https://github.com/apachecn/stanford-cs224n-notes-zh/tree/master/bcc-en)

 Maintainers:
@@ -12,6 +13,27 @@
 ## Chapter List

++ [Lecture 1](https://www.bilibili.com/video/av46216519/?p=1)
++ [Lecture 2](https://www.bilibili.com/video/av46216519/?p=2)
++ [Lecture 3](https://www.bilibili.com/video/av46216519/?p=3)
++ [Lecture 4](https://www.bilibili.com/video/av46216519/?p=4)
++ [Lecture 5](https://www.bilibili.com/video/av46216519/?p=5)
++ [Lecture 6](https://www.bilibili.com/video/av46216519/?p=6)
++ [Lecture 7](https://www.bilibili.com/video/av46216519/?p=7)
++ [Lecture 8](https://www.bilibili.com/video/av46216519/?p=8)
++ [Lecture 9](https://www.bilibili.com/video/av46216519/?p=9)
++ [Lecture 10](https://www.bilibili.com/video/av46216519/?p=10)
++ [Lecture 11](https://www.bilibili.com/video/av46216519/?p=11)
++ [Lecture 12](https://www.bilibili.com/video/av46216519/?p=12)
++ [Lecture 13](https://www.bilibili.com/video/av46216519/?p=13)
++ [Lecture 14](https://www.bilibili.com/video/av46216519/?p=14)
++ [Lecture 15](https://www.bilibili.com/video/av46216519/?p=15)
++ [Lecture 16](https://www.bilibili.com/video/av46216519/?p=16)
++ [Lecture 17](https://www.bilibili.com/video/av46216519/?p=17)
++ [Lecture 18](https://www.bilibili.com/video/av46216519/?p=18)
++ [Lecture 19](https://www.bilibili.com/video/av46216519/?p=19)
++ [Lecture 20](https://www.bilibili.com/video/av46216519/?p=20)
+
 ## Process

 ### 1. Claiming
@@ -22,7 +44,7 @@
 ### 2. Compiling Notes

-+ [Download the English subtitles](https://www.bilibili.com/video/av46216519)
++ [Download the English subtitles](https://github.com/apachecn/stanford-cs224n-notes-zh/tree/master/bcc-en)
 + Translate (you can use [Google Translate](https://translate.google.cn), but be sure to make the result readable)
 + Format into paragraphs and add video screenshots

diff --git a/bcc-en/1.bcc b/bcc-en/1.bcc
new file mode 100644
index 0000000000000000000000000000000000000000..cecd446c11fbc5b8f0eae9e2fefda2b6a1c806c9
--- /dev/null
+++ b/bcc-en/1.bcc
@@ -0,0 +1 @@
+{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.43,"to":7.41,"location":2,"content":"Okay. Hello everyone."},{"from":7.41,"to":11.27,"location":2,"content":"[LAUGHTER] Okay we should get started."},{"from":11.27,"to":14.64,"location":2,"content":"Um, there actually are still quite a few seats left."},{"from":14.64,"to":15.96,"location":2,"content":"If you wanna be really bold,"},{"from":15.96,"to":18.52,"location":2,"content":"there are a couple of seats right in front of me in the front row."},{"from":18.52,"to":20.45,"location":2,"content":"If you're less bold, a few over there."},{"from":20.45,"to":23.94,"location":2,"content":"Um, but there are also, on some of the rows, quite a few middle seats."},{"from":23.94,"to":28.08,"location":2,"content":"So if people wanted to be really civic-minded some people could sort of"},{"from":28.08,"to":32.28,"location":2,"content":"squeeze towards the edges and make more accessible um,"},{"from":32.28,"to":35.69,"location":2,"content":"some of the seats that still exist in the classroom."},{"from":35.69,"to":39.44,"location":2,"content":"Okay. 
Um, so, um,"},{"from":39.44,"to":42.89,"location":2,"content":"it's really exciting and great to see so many people here."},{"from":42.89,"to":47.39,"location":2,"content":"So, a hearty welcome to CS224N and occasionally also"},{"from":47.39,"to":52.63,"location":2,"content":"known as Ling 284 which is Natural Language Processing with Deep Learning."},{"from":52.63,"to":55.42,"location":2,"content":"Um, as just a sort of a personal anecdote,"},{"from":55.42,"to":59.72,"location":2,"content":"it still sort of blows my mind that so many people turn up to this class these days."},{"from":59.72,"to":63.98,"location":2,"content":"So, for about the first decade that I taught NLP here,"},{"from":63.98,"to":68.18,"location":2,"content":"you know the number of people I got each year was approximately 45."},{"from":68.18,"to":71.24,"location":2,"content":"[LAUGHTER] So it's an order of [LAUGHTER] magnitude smaller than"},{"from":71.24,"to":74.36,"location":2,"content":"it is now but I guess it says quite a lot"},{"from":74.36,"to":77.45,"location":2,"content":"about what a revolutionary impact"},{"from":77.45,"to":80.87,"location":2,"content":"that artificial intelligence in general and machine learning,"},{"from":80.87,"to":85.6,"location":2,"content":"deep learning, NLP are starting to have in modern society."},{"from":85.6,"to":88.86,"location":2,"content":"Okay. So this is our plan for today."},{"from":88.86,"to":92.75,"location":2,"content":"So, um, um, we're really gonna get straight down to business today."},{"from":92.75,"to":97.97,"location":2,"content":"So there'll be a brief, very brief introduction to some of the sort of course logistics,"},{"from":97.97,"to":102.14,"location":2,"content":"very brief discussion and talk about human language and"},{"from":102.14,"to":106.37,"location":2,"content":"word meaning and then we wanna get right into talking about um,"},{"from":106.37,"to":110.54,"location":2,"content":"the first thing that we're doing which is coming up with word vectors and looking"},{"from":110.54,"to":115.01,"location":2,"content":"at the word2vec algorithm and that will then sort of fill up the rest of the class."},{"from":115.01,"to":116.84,"location":2,"content":"There are still two seats right in"},{"from":116.84,"to":119.48,"location":2,"content":"the front row for someone who wants to sit right in front of me,"},{"from":119.48,"to":122.76,"location":2,"content":"just letting you know [LAUGHTER]."},{"from":122.76,"to":126.36,"location":2,"content":"Okay. Okay. 
So here are the course logistics in brief."},{"from":126.36,"to":128.34,"location":2,"content":"So I'm Christopher Manning,"},{"from":128.34,"to":135.29,"location":2,"content":"the person who bravely became the head TA, Abigail See, is right there."},{"from":135.29,"to":138.92,"location":2,"content":"And then we have quite a lot of wonderful TAs."},{"from":138.92,"to":142.7,"location":2,"content":"The people who are wonderful TAs, just sort of stand up for one moment."},{"from":142.7,"to":146.81,"location":2,"content":"So, um, [LAUGHTER] we have some sense for wonderful TAs."},{"from":146.81,"to":148.9,"location":2,"content":"[LAUGHTER] Okay great."},{"from":148.9,"to":151.32,"location":2,"content":"Um, okay."},{"from":151.32,"to":153.26,"location":2,"content":"So you know when the lecture is because you made it"},{"from":153.26,"to":157.1,"location":2,"content":"here and so welcome also to SCPD people."},{"from":157.1,"to":161.3,"location":2,"content":"This is also an SCPD class and you can watch it on video."},{"from":161.3,"to":164.3,"location":2,"content":"But we'd love for Stanford students to turn"},{"from":164.3,"to":167.3,"location":2,"content":"up and show their beautiful faces in the classroom."},{"from":167.3,"to":172.81,"location":2,"content":"Okay. So, um, the web-page has all the info about syllabus et cetera et cetera."},{"from":172.81,"to":176.18,"location":2,"content":"Okay. So this class what do we hope to teach?"},{"from":176.18,"to":179.24,"location":2,"content":"So, one thing that we wanna teach is, uh, you know,"},{"from":179.24,"to":182.5,"location":2,"content":"an understanding of effective modern methods for deep learning."},{"from":182.5,"to":185.09,"location":2,"content":"Starting off by reviewing some of the basics and then"},{"from":185.09,"to":188.78,"location":2,"content":"particularly talking about the kinds of techniques including um,"},{"from":188.78,"to":191.45,"location":2,"content":"recurrent networks and attention that are widely"},{"from":191.45,"to":194.57,"location":2,"content":"used for natural language processing models."},{"from":194.57,"to":198.77,"location":2,"content":"A second thing we wanna teach is a big picture understanding of"},{"from":198.77,"to":203.07,"location":2,"content":"human languages and some of the difficulties in understanding and producing them."},{"from":203.07,"to":205.49,"location":2,"content":"Of course if you wanna know a lot about human languages,"},{"from":205.49,"to":209.06,"location":2,"content":"there's a whole linguistics department and you can do a lot of courses on that."},{"from":209.06,"to":213.59,"location":2,"content":"Um, but so I wanna give at least some appreciation so you have some clue of what are"},{"from":213.59,"to":218.24,"location":2,"content":"the challenges and difficulties and varieties of human languages."},{"from":218.24,"to":221.31,"location":2,"content":"And then this is also kind of a practical class."},{"from":221.31,"to":224.96,"location":2,"content":"Like we actually wanna teach you how you can"},{"from":224.96,"to":229.67,"location":2,"content":"build practical systems that work for some of the major parts of NLP."},{"from":229.67,"to":233.75,"location":2,"content":"So if you go and get a job at one of those tech firms and they say \"Hey,"},{"from":233.75,"to":235.79,"location":2,"content":"could you build us a named entity recognizer?\""},{"from":235.79,"to":238.13,"location":2,"content":"You can say \"Sure, I can do that.\""},{"from":238.13,"to":240.53,"location":2,"content":"And 
so for a bunch of problems,"},{"from":240.53,"to":242.09,"location":2,"content":"obviously we can't do everything,"},{"from":242.09,"to":243.23,"location":2,"content":"we're gonna do word meaning,"},{"from":243.23,"to":247.58,"location":2,"content":"dependency parsing, machine translation and you have an option to do question answering,"},{"from":247.58,"to":250.34,"location":2,"content":"and actually building systems for those."},{"from":250.34,"to":255.08,"location":2,"content":"If you've been talking to friends who did the class in the last couple of years,"},{"from":255.08,"to":258.86,"location":2,"content":"um, here are the differences for this year just to get things straight."},{"from":258.86,"to":261.83,"location":2,"content":"Um, so we've updated some of the content of the course."},{"from":261.83,"to":266.29,"location":2,"content":"So, uh, between me and guest lectures there's new content."},{"from":266.29,"to":269.03,"location":2,"content":"Well that looked bad."},{"from":269.03,"to":272.5,"location":2,"content":"Wonder if that will keep happening, we'll find out."},{"from":272.5,"to":278.17,"location":2,"content":"There's new content on various topics that are sort of developing areas."},{"from":278.17,"to":281.3,"location":2,"content":"One of the problems with this course is this really big area of deep learning at"},{"from":281.3,"to":284.75,"location":2,"content":"the moment is still just developing really really quickly."},{"from":284.75,"to":287.48,"location":2,"content":"So, it sort of seems like one-year-old content is already"},{"from":287.48,"to":291.29,"location":2,"content":"kind of dated and we're trying to update things."},{"from":291.29,"to":294.14,"location":2,"content":"A big change that we're making this year is we're"},{"from":294.14,"to":296.93,"location":2,"content":"having five one-week assignments instead of"},{"from":296.93,"to":299.45,"location":2,"content":"three two-week assignments at the beginning of"},{"from":299.45,"to":302.8,"location":2,"content":"the course and I'll say a bit more about that in a minute."},{"from":302.8,"to":306.21,"location":2,"content":"Um, this year we're gonna use PyTorch instead of TensorFlow,"},{"from":306.21,"to":308.86,"location":2,"content":"and we'll talk about that more later too."},{"from":308.86,"to":313.88,"location":2,"content":"Um, we're having the assignments due before class on either Tuesday or Thursday."},{"from":313.88,"to":316.58,"location":2,"content":"So you're not distracted and can come to class."},{"from":316.58,"to":320.36,"location":2,"content":"So starting off, um, yeah."},{"from":320.36,"to":322.68,"location":2,"content":"So we're trying to give an easier,"},{"from":322.68,"to":326.51,"location":2,"content":"gentler ramp-up but on the other hand a fast ramp-up."},{"from":326.51,"to":329.56,"location":2,"content":"So we've got this first assignment which is sort of easy, uh,"},{"from":329.56,"to":334.04,"location":2,"content":"but it's available right now and is due next Tuesday."},{"from":334.04,"to":337.46,"location":2,"content":"And the final thing is we're not having a midterm this year."},{"from":337.46,"to":339.39,"location":2,"content":"Um, okay."},{"from":339.39,"to":340.79,"location":2,"content":"So this is what we're doing."},{"from":340.79,"to":344.3,"location":2,"content":"So there are five of these assignments that I just mentioned."},{"from":344.3,"to":346.34,"location":2,"content":"Um, so six percent for the first one,"},{"from":346.34,"to":349.09,"location":2,"content":"12 
percent for each of the other ones,"},{"from":349.09,"to":352.19,"location":2,"content":"um, and, I already said that."},{"from":352.19,"to":354.23,"location":2,"content":"We're gonna use Gradescope for grading."},{"from":354.23,"to":356.78,"location":2,"content":"It'll really help out the TAs if you could use"},{"from":356.78,"to":361.01,"location":2,"content":"your SUNet ID as your Gradescope account ID."},{"from":361.01,"to":364.2,"location":2,"content":"Um, so then for the second part of the course,"},{"from":364.2,"to":368.8,"location":2,"content":"people do a final project and there are two choices for the final project."},{"from":368.8,"to":372.08,"location":2,"content":"You can either do our default final project,"},{"from":372.08,"to":374.03,"location":2,"content":"which is a good option for many people,"},{"from":374.03,"to":375.89,"location":2,"content":"or you can do a custom final project and I'll"},{"from":375.89,"to":379.01,"location":2,"content":"talk about that more in a bit."},{"from":379.01,"to":381.2,"location":2,"content":"This is not working right."},{"from":381.2,"to":385.13,"location":2,"content":"Um, and so then at the end we have"},{"from":385.13,"to":390.43,"location":2,"content":"a final poster presentation session at which your attendance is expected,"},{"from":390.43,"to":394.58,"location":2,"content":"and we're gonna be having that Wednesday in the evening."},{"from":394.58,"to":397.46,"location":2,"content":"Probably not quite five hours but it'll be within that window,"},{"from":397.46,"to":399.49,"location":2,"content":"we'll work out the details in a bit."},{"from":399.49,"to":401.51,"location":2,"content":"Three percent for participation,"},{"from":401.51,"to":403.39,"location":2,"content":"see the website for details."},{"from":403.39,"to":405.88,"location":2,"content":"Six late days, um,"},{"from":405.88,"to":410.33,"location":2,"content":"collaboration, like always in computer science classes,"},{"from":410.33,"to":415.34,"location":2,"content":"we want you to do your own work and not borrow stuff from other people's GitHubs and"},{"from":415.34,"to":417.65,"location":2,"content":"so we really do emphasize that you should"},{"from":417.65,"to":421.15,"location":2,"content":"read and pay attention to collaboration policies."},{"from":421.15,"to":424.7,"location":2,"content":"Okay. 
So here's the high-level plan for the problem sets."},{"from":424.7,"to":427.79,"location":2,"content":"So, homework one available right now,"},{"from":427.79,"to":430.13,"location":2,"content":"is a hopefully easy on-ramp."},{"from":430.13,"to":431.72,"location":2,"content":"That's an iPython notebook,"},{"from":431.72,"to":433.56,"location":2,"content":"just to help get everyone up to speed."},{"from":433.56,"to":437.75,"location":2,"content":"Homework two is pure Python plus numpy but that"},{"from":437.75,"to":442.19,"location":2,"content":"will start to kind of teach you more about the sort of underlying,"},{"from":442.19,"to":444.26,"location":2,"content":"how do we do deep learning."},{"from":444.26,"to":449.43,"location":2,"content":"If you're not so good or a bit rusty or never seen um,"},{"from":449.43,"to":451.15,"location":2,"content":"Python or numpy, um,"},{"from":451.15,"to":454.73,"location":2,"content":"we're gonna have an extra section on Friday."},{"from":454.73,"to":458.21,"location":2,"content":"So Friday from 1:30 to 2:50 um,"},{"from":458.21,"to":462.71,"location":2,"content":"in Skilling Auditorium, we'll have a section that's a Python review."},{"from":462.71,"to":464.61,"location":2,"content":"That's our only planned section at the moment,"},{"from":464.61,"to":466.6,"location":2,"content":"we're not gonna have a regular section."},{"from":466.6,"to":469.55,"location":2,"content":"Um, so you're encouraged to go to that and that will also be"},{"from":469.55,"to":473.51,"location":2,"content":"recorded for SCPD and available for video as well."},{"from":473.51,"to":476.79,"location":2,"content":"Um, then Homework three um,"},{"from":476.79,"to":480.85,"location":2,"content":"will start us on using PyTorch."},{"from":480.85,"to":484.76,"location":2,"content":"And then homeworks four and five we're then gonna be using"},{"from":484.76,"to":488.72,"location":2,"content":"py- PyTorch on GPU and we're actually gonna be using"},{"from":488.72,"to":493.52,"location":2,"content":"Microsoft Azure with big thank yous to the kind Microsoft Azure people who have"},{"from":493.52,"to":499.17,"location":2,"content":"sponsored our GPU computing for the last um, three years."},{"from":499.17,"to":505,"location":2,"content":"Um, yes. So basically I mean all of modern deep learning has moved to the use"},{"from":505,"to":510.59,"location":2,"content":"of one or other of the large deep learning libraries like PyTorch, TensorFlow,"},{"from":510.59,"to":512.21,"location":2,"content":"Chainer or MXNet um,"},{"from":512.21,"to":516.44,"location":2,"content":"et cetera and then doing the computing on GPU."},{"from":516.44,"to":518.6,"location":2,"content":"So of course since we're in the one building,"},{"from":518.6,"to":520.45,"location":2,"content":"we should of course be using, um,"},{"from":520.45,"to":522.62,"location":2,"content":"GPUs [LAUGHTER] but I mean in general"},{"from":522.62,"to":528.83,"location":2,"content":"the sort of parallelism and scalability of GPUs is what's powered most of modern deep learning."},{"from":528.83,"to":530.72,"location":2,"content":"Okay. 
The final project."},{"from":530.72,"to":535.46,"location":2,"content":"So for the final project there are two things that you can do."},{"from":535.46,"to":540.66,"location":2,"content":"So we have a default final project which is essentially our final project in a box."},{"from":540.66,"to":546.22,"location":2,"content":"And so this is building a question answering system and we do it over the SQuAD dataset."},{"from":546.22,"to":551.45,"location":2,"content":"So what you build and how you can improve your performance is completely up to you."},{"from":551.45,"to":554.48,"location":2,"content":"It is open-ended but it has an easier start,"},{"from":554.48,"to":556.91,"location":2,"content":"a clearly defined objective and we can"},{"from":556.91,"to":559.77,"location":2,"content":"have a leaderboard for how well things are working."},{"from":559.77,"to":564.68,"location":2,"content":"Um, so if you don't have a clear research objective that can be a good choice for you"},{"from":564.68,"to":569.6,"location":2,"content":"or you can propose a custom final project and assuming it's sensible,"},{"from":569.6,"to":572.54,"location":2,"content":"we will approve your custom final project,"},{"from":572.54,"to":574.19,"location":2,"content":"we will give you feedback, um,"},{"from":574.19,"to":576.75,"location":2,"content":"from someone as a mentor, um,"},{"from":576.75,"to":582.41,"location":2,"content":"and either way, only for the final project, we allow teams of one, two or three."},{"from":582.41,"to":585.2,"location":2,"content":"For the homeworks you should expect to do them yourself."},{"from":585.2,"to":590.02,"location":2,"content":"Of course you can chat to people in a general way about the problems."},{"from":590.02,"to":593.01,"location":2,"content":"Okay. So that is the course."},{"from":593.01,"to":595.7,"location":2,"content":"All good, and not even behind schedule yet."},{"from":595.7,"to":601.73,"location":2,"content":"Okay. 
So the next section is human language and word meaning. Um."},{"from":601.73,"to":604.75,"location":2,"content":"You know, if I was um,"},{"from":604.75,"to":610.26,"location":2,"content":"really going to tell you a lot about human language that would take a lot of time um,"},{"from":610.26,"to":612.11,"location":2,"content":"which I don't really have here."},{"from":612.11,"to":614.01,"location":2,"content":"So I'm just going to tell you um,"},{"from":614.01,"to":616.65,"location":2,"content":"two anecdotes about human language."},{"from":616.65,"to":619.97,"location":2,"content":"And the first is this XKCD cartoon."},{"from":619.97,"to":622.52,"location":2,"content":"Um, and I mean this isn't,"},{"from":622.52,"to":626.05,"location":2,"content":"and I don't know why that's happening."},{"from":626.05,"to":628.25,"location":2,"content":"I'm not sure what to make of that."},{"from":628.25,"to":634.07,"location":2,"content":"Um, so, I actually really liked this XKCD cartoon."},{"from":634.07,"to":637.31,"location":2,"content":"It's not one of the classic ones that you see most often around the place,"},{"from":637.31,"to":642.14,"location":2,"content":"but I actually think it says a lot about language and is worth thinking about."},{"from":642.14,"to":645.65,"location":2,"content":"Like I think a lot of the time for the kind of people who come"},{"from":645.65,"to":649.38,"location":2,"content":"to this class who are mainly people like CS people,"},{"from":649.38,"to":651.95,"location":2,"content":"and EE people and random others."},{"from":651.95,"to":655.25,"location":2,"content":"There's some other people I know since these people linguists and so on around."},{"from":655.25,"to":657.05,"location":2,"content":"But for a lot of those people like,"},{"from":657.05,"to":661.61,"location":2,"content":"you've sort of spent your life looking at formal languages and the impression"},{"from":661.61,"to":666.18,"location":2,"content":"is that sort of human languages are somehow a little bit broken formal languages,"},{"from":666.18,"to":668.57,"location":2,"content":"but there's really a lot more to it than that, right?"},{"from":668.57,"to":671.16,"location":2,"content":"That language is this amazing um,"},{"from":671.16,"to":675.11,"location":2,"content":"human-created system that is used for"},{"from":675.11,"to":679.52,"location":2,"content":"all sorts of purposes and is adaptable to all sorts of purposes."},{"from":679.52,"to":683.75,"location":2,"content":"So you can do everything from describing mathematics in human language"},{"from":683.75,"to":688.52,"location":2,"content":"um to sort of nuzzling up to your best friend and getting them to understand you better."},{"from":688.52,"to":691.91,"location":2,"content":"So there's actually an amazing thing about human language. 
Anyway, I'll just read it."},{"from":691.91,"to":694.65,"location":2,"content":"Um, so it's the first person,"},{"from":694.65,"to":696.18,"location":2,"content":"the dark-haired person says,"},{"from":696.18,"to":698.11,"location":2,"content":"\"Anyway, I could care less.\""},{"from":698.11,"to":700.01,"location":2,"content":"And her friend says,"},{"from":700.01,"to":702.44,"location":2,"content":"\"I think you mean you couldn't care less.\""},{"from":702.44,"to":706.49,"location":2,"content":"Saying you could care less implies you care at least some amount."},{"from":706.49,"to":709.77,"location":2,"content":"And the dark-haired person says, \"I don't know,"},{"from":709.77,"to":714.59,"location":2,"content":"we're these unbelievably complicated brains drifting through a void trying"},{"from":714.59,"to":719.63,"location":2,"content":"in vain to connect with one another by blindly flinging words out into the darkness.\""},{"from":719.63,"to":722.72,"location":2,"content":"Every choice of phrasing and spelling, and tone,"},{"from":722.72,"to":727.77,"location":2,"content":"and timing carries countless signals and contexts and subtexts and more."},{"from":727.77,"to":731.43,"location":2,"content":"And every listener interprets those signals in their own way."},{"from":731.43,"to":733.57,"location":2,"content":"Language isn't a formal system,"},{"from":733.57,"to":736.24,"location":2,"content":"language is glorious chaos."},{"from":736.24,"to":740.75,"location":2,"content":"You can never know for sure what any words will mean to anyone."},{"from":740.75,"to":746.15,"location":2,"content":"All you can do is try to get better at guessing how your words affect people so"},{"from":746.15,"to":748.79,"location":2,"content":"you can have a chance of finding the ones that will make"},{"from":748.79,"to":751.79,"location":2,"content":"them feel something like what you want them to feel."},{"from":751.79,"to":754.24,"location":2,"content":"Everything else is pointless."},{"from":754.24,"to":757.39,"location":2,"content":"I assume you're giving me tips on how you interpret"},{"from":757.39,"to":761.07,"location":2,"content":"words because you want me to feel less alone."},{"from":761.07,"to":763.51,"location":2,"content":"If so, thank you."},{"from":763.51,"to":765.59,"location":2,"content":"That means a lot."},{"from":765.59,"to":768.44,"location":2,"content":"But if you're just running my sentences past"},{"from":768.44,"to":771.78,"location":2,"content":"some mental checklist so you can show off how well you know it,"},{"from":771.78,"to":773.18,"location":2,"content":"then I could care less."},{"from":773.18,"to":782.83,"location":2,"content":"[NOISE] Um, and so I think um,"},{"from":782.83,"to":787.79,"location":2,"content":"I think actually this has some nice messages about how language is this uncertain"},{"from":787.79,"to":793.34,"location":2,"content":"evolved system of communication but somehow we have enough agreed meaning that you know,"},{"from":793.34,"to":795.5,"location":2,"content":"we can kind of pretty much communicate."},{"from":795.5,"to":796.87,"location":2,"content":"But we're doing some kind of you know"},{"from":796.87,"to":800.54,"location":2,"content":"probabilistic inference of guessing what people mean and we're"},{"from":800.54,"to":802.07,"location":2,"content":"using language not just for"},{"from":802.07,"to":806.2,"location":2,"content":"the information functions but for the social functions et cetera et cetera."},{"from":806.2,"to":813.49,"location":2,"content":"Okay. 
And then here's the one other thought I had about language."},{"from":813.49,"to":820.57,"location":2,"content":"So, essentially if we want to have artificial intelligence that's intelligent,"},{"from":820.57,"to":823.94,"location":2,"content":"what we need is to somehow get to the point of having"},{"from":823.94,"to":828.56,"location":2,"content":"compu- computers that have the knowledge of human beings, right?"},{"from":828.56,"to":832.43,"location":2,"content":"Because human beings have knowledge that gives them intelligence."},{"from":832.43,"to":835.46,"location":2,"content":"And if you think about how we sort of"},{"from":835.46,"to":839.27,"location":2,"content":"convey knowledge around the place in our human world,"},{"from":839.27,"to":844.02,"location":2,"content":"mainly the way we do it is through human language."},{"from":844.02,"to":846.41,"location":2,"content":"You know, some kinds of knowledge you can sort of"},{"from":846.41,"to":849.26,"location":2,"content":"work out for yourself by doing physical stuff right,"},{"from":849.26,"to":851.9,"location":2,"content":"I can hold this and drop that and I've learnt something."},{"from":851.9,"to":853.76,"location":2,"content":"So I've learnt a bit of knowledge there."},{"from":853.76,"to":857.18,"location":2,"content":"But sort of most of the knowledge in your heads and why you're sitting in"},{"from":857.18,"to":861.98,"location":2,"content":"this classroom has come from people communicating in human language to you."},{"from":861.98,"to":864.26,"location":2,"content":"Um, so one of the famous,"},{"from":864.26,"to":866.99,"location":2,"content":"most famous deep learning people, Yann LeCun,"},{"from":866.99,"to":869.16,"location":2,"content":"he likes to say this line about,"},{"from":869.16,"to":873.38,"location":2,"content":"oh, you know really I think that you know there's not much difference"},{"from":873.38,"to":877.97,"location":2,"content":"between the intelligence of a human being and an orangutan."},{"from":877.97,"to":880.51,"location":2,"content":"And I actually think he's really wrong on that."},{"from":880.51,"to":882.79,"location":2,"content":"Like the sense in which he means that is,"},{"from":882.79,"to":885.84,"location":2,"content":"an orangutan has a really good vision system."},{"from":885.84,"to":888.61,"location":2,"content":"Orangutans have very good you know control of"},{"from":888.61,"to":892.06,"location":2,"content":"their arms just like human beings for picking things up."},{"from":892.06,"to":898.97,"location":2,"content":"Orangutans um can use tools um and orangutans can make plans so"},{"from":898.97,"to":902.27,"location":2,"content":"that if you sort of put the food somewhere where they have to sort of move"},{"from":902.27,"to":905.96,"location":2,"content":"the plank to get to the island with the food they can do a plan like that."},{"from":905.96,"to":909.89,"location":2,"content":"So yeah, in a sense they've got a fair bit of intelligence but you know,"},{"from":909.89,"to":913.38,"location":2,"content":"sort of orangutans just aren't like human beings."},{"from":913.38,"to":916.1,"location":2,"content":"And why aren't they like human beings?"},{"from":916.1,"to":921.61,"location":2,"content":"And I'd like to suggest to you the reason for that is what human beings have achieved is,"},{"from":921.61,"to":925.07,"location":2,"content":"we don't just have sort of one computer like"},{"from":925.07,"to":929.83,"location":2,"content":"a you know dusty old IBM PC in your mother's 
garage."},{"from":929.83,"to":933.74,"location":2,"content":"What we have is a human computer network."},{"from":933.74,"to":937.52,"location":2,"content":"And the way that we've achieved that human computer network is that,"},{"from":937.52,"to":941.28,"location":2,"content":"we use human languages as our networking language."},{"from":941.28,"to":944.69,"location":2,"content":"Um, and so, when you think about it um,"},{"from":944.69,"to":951.82,"location":2,"content":"so on any kind of evolutionary scale language is super super super super recent, right?"},{"from":951.82,"to":957.47,"location":2,"content":"That um, creatures have had vision for, people don't quite know, but you know,"},{"from":957.47,"to":960.98,"location":2,"content":"maybe it's 75 million years or maybe it's longer, right?"},{"from":960.98,"to":963.85,"location":2,"content":"A huge length of time."},{"from":963.85,"to":967.29,"location":2,"content":"How long have human beings had language?"},{"from":967.29,"to":969.86,"location":2,"content":"You know people don't know that either because it turns out you know,"},{"from":969.86,"to":971.01,"location":2,"content":"when you have fossils,"},{"from":971.01,"to":973.49,"location":2,"content":"you can't knock the skull on the side and say,"},{"from":973.49,"to":975.05,"location":2,"content":"do you not have language?"},{"from":975.05,"to":979.1,"location":2,"content":"Um, but you know, most people estimate that sort of language is"},{"from":979.1,"to":985.99,"location":2,"content":"a very recent invention before current human beings moved out of um, out of Africa."},{"from":985.99,"to":988.55,"location":2,"content":"So that many people think that we've only had language for"},{"from":988.55,"to":991.46,"location":2,"content":"something like 100,000 years or something like that."},{"from":991.46,"to":995.45,"location":2,"content":"So that's sort of you know blink of an eye on the evolutionary timescale."},{"from":995.45,"to":999.74,"location":2,"content":"But you know, it was the development of language [inaudible]"},{"from":999.74,"to":1003.97,"location":2,"content":"that sort of made human beings invisible- [NOISE] invincible, right?"},{"from":1003.97,"to":1006.48,"location":2,"content":"It wasn't that, human beings um,"},{"from":1006.48,"to":1011.41,"location":2,"content":"developed poison fangs or developed the ability to run"},{"from":1011.41,"to":1013.66,"location":2,"content":"faster than any other creature or"},{"from":1013.66,"to":1016.21,"location":2,"content":"put a big horn on their heads or something like that, right?"},{"from":1016.21,"to":1019.06,"location":2,"content":"You know, humans are basically pretty puny um,"},{"from":1019.06,"to":1021.19,"location":2,"content":"but they had this um,"},{"from":1021.19,"to":1024.31,"location":2,"content":"unbeatable advantage that they could communicate with"},{"from":1024.31,"to":1027.88,"location":2,"content":"each other and therefore work much more effectively in teams."},{"from":1027.88,"to":1031.49,"location":2,"content":"And that sort of basically made human beings invincible."},{"from":1031.49,"to":1035.58,"location":2,"content":"But you know, even then humans were kind of limited, right?"},{"from":1035.58,"to":1038.14,"location":2,"content":"That kind of got you to about the Stone Age right,"},{"from":1038.14,"to":1040.39,"location":2,"content":"where you could bang on your stones and with"},{"from":1040.39,"to":1043.24,"location":2,"content":"the right kind of stone make something sharp to cut 
with."},{"from":1043.24,"to":1045.68,"location":2,"content":"Um, what got humans beyond that,"},{"from":1045.68,"to":1048.1,"location":2,"content":"was that they invented writing."},{"from":1048.1,"to":1052.91,"location":2,"content":"So writing was then an ability where you could take knowledge"},{"from":1052.91,"to":1057.73,"location":2,"content":"not only communicate it um, mouth to mouth to people that you saw."},{"from":1057.73,"to":1061.66,"location":2,"content":"You could put it down on your piece of papyrus or your clay tablet or whatever"},{"from":1061.66,"to":1065.62,"location":2,"content":"it was at first and that knowledge could then be sent places."},{"from":1065.62,"to":1070.27,"location":2,"content":"It could be sent spatially around the world and it could then"},{"from":1070.27,"to":1075.43,"location":2,"content":"be sent temporally through time."},{"from":1075.43,"to":1077.29,"location":2,"content":"And well, how old is writing?"},{"from":1077.29,"to":1080.89,"location":2,"content":"I mean, we sort of basically know about how old writing is, right?"},{"from":1080.89,"to":1084.12,"location":2,"content":"That writing is about 5,000 years old."},{"from":1084.12,"to":1089.74,"location":2,"content":"It's incredibly incredibly recent on this scale of evolution but you know,"},{"from":1089.74,"to":1096.73,"location":2,"content":"essentially writing was so powerful as a way of having knowledge that then in those 5,000"},{"from":1096.73,"to":1104.04,"location":2,"content":"years that enabled human beings to go from a Stone Age sharp piece of flint to you know,"},{"from":1104.04,"to":1106.24,"location":2,"content":"having iPhones and all of these things,"},{"from":1106.24,"to":1108.79,"location":2,"content":"all these incredibly sophisticated devices."},{"from":1108.79,"to":1112.96,"location":2,"content":"So, language is a pretty special thing I'd like to suggest."},{"from":1112.96,"to":1117.91,"location":2,"content":"Um, but you know, if I go back to my analogy that sort of it's allowed humans to"},{"from":1117.91,"to":1123.28,"location":2,"content":"construct a networked computer that is way way more powerful than um,"},{"from":1123.28,"to":1127.6,"location":2,"content":"just having individual creatures that are sort of intelligent like an orangutan."},{"from":1127.6,"to":1130.53,"location":2,"content":"Um, and if you compare it to our computer networks,"},{"from":1130.53,"to":1133.05,"location":2,"content":"it's a really funny kind of network, right?"},{"from":1133.05,"to":1135.74,"location":2,"content":"You know that these days um,"},{"from":1135.74,"to":1141.81,"location":2,"content":"we have networks that run around where we have sort of large network bandwidth, right?"},{"from":1141.81,"to":1143.77,"location":2,"content":"You know, we might be frustrated sometimes with"},{"from":1143.77,"to":1146.53,"location":2,"content":"our Netflix downloads but by and large you know,"},{"from":1146.53,"to":1149.76,"location":2,"content":"we can download hundreds of megabytes really easily and quickly."},{"from":1149.76,"to":1151.57,"location":2,"content":"And we don't think that's fast enough,"},{"from":1151.57,"to":1153.67,"location":2,"content":"so we're going to be rolling out 5G networks."},{"from":1153.67,"to":1156.4,"location":2,"content":"So it's an order of magnitude faster again."},{"from":1156.4,"to":1158.8,"location":2,"content":"I mean, by comparison to that, I mean,"},{"from":1158.8,"to":1163.54,"location":2,"content":"human language is a pathetically slow network, 
right?"},{"from":1163.54,"to":1169.46,"location":2,"content":"That the amount of information you can convey by human language is very slow."},{"from":1169.46,"to":1173.95,"location":2,"content":"I mean you know, whatever it is I sort of speak at about 15 words a second right,"},{"from":1173.95,"to":1175.42,"location":2,"content":"you can start doing um,"},{"from":1175.42,"to":1177.55,"location":2,"content":"your information theory if you know some right?"},{"from":1177.55,"to":1181.06,"location":2,"content":"But um, you don't actually get much bandwidth at all."},{"from":1181.06,"to":1184.4,"location":2,"content":"And that then leads- so you can think of,"},{"from":1184.4,"to":1185.98,"location":2,"content":"how does it work then?"},{"from":1185.98,"to":1187.57,"location":2,"content":"So, humans have come up with"},{"from":1187.57,"to":1193.39,"location":2,"content":"this incredibly impressive system which is essentially a form of compression."},{"from":1193.39,"to":1196.12,"location":2,"content":"Sort of a very adaptive form of compression,"},{"from":1196.12,"to":1198.07,"location":2,"content":"so that when we're talking to people,"},{"from":1198.07,"to":1202.87,"location":2,"content":"we assume that they have an enormous amount of knowledge in their heads which"},{"from":1202.87,"to":1207.64,"location":2,"content":"isn't the same as but it's broadly similar to mine when I'm talking to you right?"},{"from":1207.64,"to":1210.57,"location":2,"content":"That you know what English words mean,"},{"from":1210.57,"to":1213.85,"location":2,"content":"and you know a lot about how the wor- world works."},{"from":1213.85,"to":1217.15,"location":2,"content":"And therefore, I can say a short message and communicate"},{"from":1217.15,"to":1222.82,"location":2,"content":"only a relatively short bit string and you can actually understand a lot. 
All right?"},{"from":1222.82,"to":1226.03,"location":2,"content":"So, I can say sort of whatever you know,"},{"from":1226.03,"to":1228.85,"location":2,"content":"imagine a busy shopping mall and that"},{"from":1228.85,"to":1231.63,"location":2,"content":"there are two guys standing in front of a makeup counter,"},{"from":1231.63,"to":1236.29,"location":2,"content":"and you know I've only said whatever that was sort of about 200 bits of"},{"from":1236.29,"to":1238.96,"location":2,"content":"information but that's enabled you to construct"},{"from":1238.96,"to":1242.34,"location":2,"content":"a whole visual scene that would take megabytes to um,"},{"from":1242.34,"to":1244.38,"location":2,"content":"represent as an image."},{"from":1244.38,"to":1246.63,"location":2,"content":"So, that's why language is good."},{"from":1246.63,"to":1249.1,"location":2,"content":"Um, so from that more authorial level,"},{"from":1249.1,"to":1251.42,"location":2,"content":"I'll now move back to the concrete stuff."},{"from":1251.42,"to":1255.92,"location":2,"content":"What we wanna do in this class is not solve the whole of language,"},{"from":1255.92,"to":1257.95,"location":2,"content":"but we want to represent, um,"},{"from":1257.95,"to":1260.38,"location":2,"content":"the meaning of words, right?"},{"from":1260.38,"to":1263.23,"location":2,"content":"So, a lot of language is bound up in words and their meanings"},{"from":1263.23,"to":1266.2,"location":2,"content":"and words can have really rich meanings, right?"},{"from":1266.2,"to":1267.97,"location":2,"content":"As soon as you say a word like teacher,"},{"from":1267.97,"to":1272.53,"location":2,"content":"that's kinda quite a lot of rich meaning or you can have actions that have rich meaning."},{"from":1272.53,"to":1277.22,"location":2,"content":"So, if I say a word like prognosticate or,"},{"from":1277.22,"to":1279.07,"location":2,"content":"um, total or something you know,"},{"from":1279.07,"to":1282.38,"location":2,"content":"these words that have rich meanings and a lot of nuance on them."},{"from":1282.38,"to":1284.39,"location":2,"content":"And so we wanna represent meaning."},{"from":1284.39,"to":1286.51,"location":2,"content":"And so, the question is what is meaning?"},{"from":1286.51,"to":1289.36,"location":2,"content":"So, you can of course you can- dictionaries are meant to tell you about meanings."},{"from":1289.36,"to":1291.49,"location":2,"content":"So, you can look up dictionaries um,"},{"from":1291.49,"to":1295.72,"location":2,"content":"and Webster's sort of tries to relate meaning to idea."},{"from":1295.72,"to":1299.52,"location":2,"content":"The idea that is represented by a word or a phrase."},{"from":1299.52,"to":1304.24,"location":2,"content":"The idea that a person wants to express by words, signs, et cetera."},{"from":1304.24,"to":1306.19,"location":2,"content":"I mean, you know,"},{"from":1306.19,"to":1309.73,"location":2,"content":"you could think that these definitions are kind of a cop-out because it seems"},{"from":1309.73,"to":1313.02,"location":2,"content":"like they're rewriting meaning in terms of the word idea,"},{"from":1313.02,"to":1315.04,"location":2,"content":"and has that really gotten you anywhere?"},{"from":1315.04,"to":1318.37,"location":2,"content":"Um, how do linguists think about meaning?"},{"from":1318.37,"to":1323.11,"location":2,"content":"I mean, the most common way that linguists have thought about"},{"from":1323.11,"to":1325.66,"location":2,"content":"meaning is an idea that's called 
denotational"},{"from":1325.66,"to":1328.42,"location":2,"content":"semantics which is also used in programming languages."},{"from":1328.42,"to":1334.81,"location":2,"content":"So, the idea of that is we think of meaning as what things represent."},{"from":1334.81,"to":1336.95,"location":2,"content":"So, if I say the word chair,"},{"from":1336.95,"to":1341.14,"location":2,"content":"the denotation of the word chair includes this one here and that one,"},{"from":1341.14,"to":1342.33,"location":2,"content":"that one, that one, that one."},{"from":1342.33,"to":1344.92,"location":2,"content":"And so, the word chair is sort of representing"},{"from":1344.92,"to":1348.58,"location":2,"content":"all the things that are chairs and you can sort of, um,"},{"from":1348.58,"to":1353.41,"location":2,"content":"you can then think of something like running as well that you know there's sort of sets"},{"from":1353.41,"to":1357.98,"location":2,"content":"of actions that people can partake that- that's their denotation."},{"from":1357.98,"to":1362.2,"location":2,"content":"And that's sort of what you most commonly see in philosophy or linguistics as denotation."},{"from":1362.2,"to":1367.13,"location":2,"content":"It's kind of a hard thing to get your hands on, um, computationally."},{"from":1367.13,"to":1370.48,"location":2,"content":"So, um, what people most commonly did,"},{"from":1370.48,"to":1374.02,"location":2,"content":"or used to most commonly do I guess I should say now,"},{"from":1374.02,"to":1377.53,"location":2,"content":"for working out the meaning of words on the computer,"},{"from":1377.53,"to":1381.12,"location":2,"content":"was commonly to turn to something that was a bit like a dictionary."},{"from":1381.12,"to":1386.2,"location":2,"content":"In particular the favorite online thing was this online thesaurus called WordNet which"},{"from":1386.2,"to":1391.51,"location":2,"content":"sort of tells you about word meanings and relationships between word meanings."},{"from":1391.51,"to":1396.44,"location":2,"content":"Um, so this is just giving you the very slightest sense of,"},{"from":1396.44,"to":1399.82,"location":2,"content":"um, of what's in WordNet."},{"from":1399.82,"to":1404.48,"location":2,"content":"Um, so this is an actual bit of Python code up there which you can,"},{"from":1404.48,"to":1408.37,"location":2,"content":"um, type into your computer and run and do this for yourself."},{"from":1408.37,"to":1411.04,"location":2,"content":"Um, so this uses a thing called NLTK."},{"from":1411.04,"to":1413.72,"location":2,"content":"Um, so NLTK is sort of like"},{"from":1413.72,"to":1419.36,"location":2,"content":"the \"Swiss Army Knife of NLP\" meaning that it's not terribly good for anything,"},{"from":1419.36,"to":1421.57,"location":2,"content":"but it has a lot of basic tools."},{"from":1421.57,"to":1426.46,"location":2,"content":"So, if you wanted to do something like just get some stuff out of WordNet and show it,"},{"from":1426.46,"to":1429.63,"location":2,"content":"it's the perfect thing to use. 
Um, okay."},{"from":1429.63,"to":1434.83,"location":2,"content":"So, um, from NLTK I'm importing WordNet and so then I can say,"},{"from":1434.83,"to":1441.36,"location":2,"content":"\"Okay, um, for the word good tell me about the synonym sets that good participates in.\""},{"from":1441.36,"to":1443.44,"location":2,"content":"And there's good, goodness as a noun."},{"from":1443.44,"to":1444.76,"location":2,"content":"There is an adjective good."},{"from":1444.76,"to":1448.33,"location":2,"content":"There's one, estimable, good, honorable, respectable."},{"from":1448.33,"to":1451.15,"location":2,"content":"Um, this looks really complex and hard to understand."},{"from":1451.15,"to":1453.7,"location":2,"content":"But the idea of word- WordNet makes"},{"from":1453.7,"to":1458.08,"location":2,"content":"these very fine-grained distinctions between senses of a word."},{"from":1458.08,"to":1460.67,"location":2,"content":"So, what it's sort of saying for good, um,"},{"from":1460.67,"to":1463.57,"location":2,"content":"there are some senses where it's a noun, right?"},{"from":1463.57,"to":1464.76,"location":2,"content":"That's where you sort of,"},{"from":1464.76,"to":1467.2,"location":2,"content":"I bought some goods for my trip, right?"},{"from":1467.2,"to":1468.88,"location":2,"content":"So, that's sort of, um,"},{"from":1468.88,"to":1472.78,"location":2,"content":"one of these noun senses like this one I guess."},{"from":1472.78,"to":1475.48,"location":2,"content":"Um, then there are adjective senses and it's trying to"},{"from":1475.48,"to":1478.84,"location":2,"content":"distinguish- there's a basic adjective sense of good being good,"},{"from":1478.84,"to":1481.27,"location":2,"content":"and then in certain, um, senses,"},{"from":1481.27,"to":1484.75,"location":2,"content":"there are these extended senses of good in different directions."},{"from":1484.75,"to":1488.52,"location":2,"content":"So, I guess this is good in the sense of beneficial, um,"},{"from":1488.52,"to":1492.92,"location":2,"content":"and this one is sort of a person who is respectable or something."},{"from":1492.92,"to":1495.58,"location":2,"content":"He's a good man or something like that, right?"},{"from":1495.58,"to":1496.86,"location":2,"content":"So, um, but you know,"},{"from":1496.86,"to":1499.66,"location":2,"content":"part of what kind of makes this"},{"from":1499.66,"to":1502.63,"location":2,"content":"thing very problematic in practice to use is it tries to make"},{"from":1502.63,"to":1506.85,"location":2,"content":"all these very fine-grained differences between senses that a human being can"},{"from":1506.85,"to":1511.41,"location":2,"content":"barely understand the difference between them um, and relate to."},{"from":1511.41,"to":1513.69,"location":2,"content":"Um, so you can then do other things with WordNet."},{"from":1513.69,"to":1518.46,"location":2,"content":"So, this bit of code you can sort of, well, walk up an is-a kind of hierarchy."},{"from":1518.46,"to":1521.63,"location":2,"content":"So, it's kinda like a traditional, um, database."},{"from":1521.63,"to":1529.03,"location":2,"content":"So, if I start with a panda and say- [NOISE] if I start with a panda."},{"from":1529.03,"to":1532.18,"location":2,"content":"Um, and walk up, um,"},{"from":1532.18,"to":1535.33,"location":2,"content":"the pandas are [inaudible]."},{"from":1535.33,"to":1537.64,"location":2,"content":"Maybe you'd guys to bio which are carnivores,"},{"from":1537.64,"to":1539.55,"location":2,"content":"placentals, mammals, blah, blah, 
blah."},{"from":1539.55,"to":1544.13,"location":2,"content":"Okay, so, um, that's the kind of stuff you can get out to- out of WordNet."},{"from":1544.13,"to":1547.11,"location":2,"content":"Um, you know, in practice WordNet has been-"},{"from":1547.11,"to":1549.58,"location":2,"content":"Everyone sort of used to use it because it gave"},{"from":1549.58,"to":1551.99,"location":2,"content":"you some sort of sense of the meaning of the word."},{"from":1551.99,"to":1554.13,"location":2,"content":"But you know it's also sort of well-known."},{"from":1554.13,"to":1556.54,"location":2,"content":"It never worked that well."},{"from":1556.54,"to":1562.72,"location":2,"content":"Um, so you know that sort of the synonym sets miss a lot of nuance."},{"from":1562.72,"to":1565.27,"location":2,"content":"So, you know one of the synonym sets for good has"},{"from":1565.27,"to":1568.24,"location":2,"content":"proficient in it and good is sort of like proficient"},{"from":1568.24,"to":1571.49,"location":2,"content":"but doesn't proficient have some more connotations and nuance?"},{"from":1571.49,"to":1573.25,"location":2,"content":"I think it does."},{"from":1573.25,"to":1578.08,"location":2,"content":"Um, WordNet like most hand-built resources is sort of very incomplete."},{"from":1578.08,"to":1581.29,"location":2,"content":"So, as soon as you're coming to new meanings of words,"},{"from":1581.29,"to":1583.7,"location":2,"content":"or new words and slang words,"},{"from":1583.7,"to":1585.31,"location":2,"content":"well then, that gives you nothing."},{"from":1585.31,"to":1588.98,"location":2,"content":"Um, it's sort of built with human labor,"},{"from":1588.98,"to":1595.03,"location":2,"content":"um, in ways that you know it's hard to sort of create and adapt."},{"from":1595.03,"to":1597.67,"location":2,"content":"And in particular, what we want to focus on is,"},{"from":1597.67,"to":1601.87,"location":2,"content":"seems like a basic thing you'd like to do with words and it's actually at least"},{"from":1601.87,"to":1605.92,"location":2,"content":"understand similarities and relations between the meaning of words."},{"from":1605.92,"to":1609.52,"location":2,"content":"And it turns out that you know WordNet doesn't actually do that that well"},{"from":1609.52,"to":1613.6,"location":2,"content":"because it just has these sort of fixed discrete synonym sets."},{"from":1613.6,"to":1616.09,"location":2,"content":"So, if you have words where there's"},{"from":1616.09,"to":1619.08,"location":2,"content":"sort of a synonym but maybe not exactly the same meaning,"},{"from":1619.08,"to":1620.8,"location":2,"content":"they're not in the same synonym set,"},{"from":1620.8,"to":1624.58,"location":2,"content":"you kind of can't really measure the partial resemblance of meaning for them."},{"from":1624.58,"to":1628.43,"location":2,"content":"So, if something like good and marvelous aren't in the same synonym set,"},{"from":1628.43,"to":1631.96,"location":2,"content":"but there's something that they share in common that you'd like to represent."},{"from":1631.96,"to":1636.88,"location":2,"content":"Okay. 
So, um, that kinda starts to lead into"},{"from":1636.88,"to":1641.93,"location":2,"content":"us wanting to do something different and better for word meaning."},{"from":1641.93,"to":1645.73,"location":2,"content":"And, um, before getting there I just sort of wanna again sort"},{"from":1645.73,"to":1649.49,"location":2,"content":"of build a little from traditional NLP."},{"from":1649.49,"to":1653.28,"location":2,"content":"So, traditional NLP in the context of this course sort of means"},{"from":1653.28,"to":1659.28,"location":2,"content":"Natural Language Processing up until approximately 2012."},{"from":1659.28,"to":1663.64,"location":2,"content":"There were some earlier antecedents but it's basically, um,"},{"from":1663.64,"to":1667.6,"location":2,"content":"in 2013 that things really began to change with"},{"from":1667.6,"to":1673.06,"location":2,"content":"people starting to use neural net style representations for natural language processing."},{"from":1673.06,"to":1675.43,"location":2,"content":"So, up until 2012,"},{"from":1675.43,"to":1678.06,"location":2,"content":"um, standardly you know we had words."},{"from":1678.06,"to":1682.21,"location":2,"content":"They are just words. So, we had hotel, conference, motel."},{"from":1682.21,"to":1686.65,"location":2,"content":"They were words, and we'd have you know lexicons and put words into our model."},{"from":1686.65,"to":1692.29,"location":2,"content":"Um, and in neural networks land this is referred to as a localist representation."},{"from":1692.29,"to":1694.96,"location":2,"content":"I'll come back to those terms again next time."},{"from":1694.96,"to":1700.02,"location":2,"content":"But that's sort of meaning that for any concept there's sort of one particular,"},{"from":1700.02,"to":1704.08,"location":2,"content":"um, place which is the word hotel or the word motel."},{"from":1704.08,"to":1706.46,"location":2,"content":"A way of thinking about that is to think"},{"from":1706.46,"to":1709.62,"location":2,"content":"about what happens when you build a machine learning model."},{"from":1709.62,"to":1714.76,"location":2,"content":"So, if you have a categorical variable like you have words with the choice of word"},{"from":1714.76,"to":1720.13,"location":2,"content":"and you want to stick that into some kind of classifier in a machine learning model,"},{"from":1720.13,"to":1722.9,"location":2,"content":"somehow you have to code that categorical variable,"},{"from":1722.9,"to":1726.55,"location":2,"content":"and the standard way of doing it is that you code it by having"},{"from":1726.55,"to":1731.28,"location":2,"content":"different levels of the variable which means that you have a vector,"},{"from":1731.28,"to":1733.84,"location":2,"content":"and you have, this is the word house."},{"from":1733.84,"to":1735.67,"location":2,"content":"This is the word cat. 
This is the word dog."},{"from":1735.67,"to":1737.02,"location":2,"content":"This is the word some chairs."},{"from":1737.02,"to":1738.19,"location":2,"content":"This is the word agreeable."},{"from":1738.19,"to":1739.46,"location":2,"content":"This is the word something else."},{"from":1739.46,"to":1741.41,"location":2,"content":"This is the word, um,"},{"from":1741.41,"to":1745.75,"location":2,"content":"hotel, um, and this is another word for something different, right?"},{"from":1745.75,"to":1748.08,"location":2,"content":"So that you put a one at the position"},{"from":1748.08,"to":1751.12,"location":2,"content":"and in neural net land we call these one-hot vectors,"},{"from":1751.12,"to":1752.47,"location":2,"content":"and so these might be, ah,"},{"from":1752.47,"to":1756.25,"location":2,"content":"one-hot vectors for hotel and motel."},{"from":1756.25,"to":1759.04,"location":2,"content":"So, there are a couple of things that are bad here."},{"from":1759.04,"to":1761.01,"location":2,"content":"Um, the one that's sort of, ah,"},{"from":1761.01,"to":1767.14,"location":2,"content":"a practical nuisance is you know languages have a lot of words."},{"from":1767.14,"to":1770.59,"location":2,"content":"Ah, so, it's sort of one of those dictionaries that you might have still had in"},{"from":1770.59,"to":1775.45,"location":2,"content":"school, they probably have about 250,000 words in them."},{"from":1775.45,"to":1777.4,"location":2,"content":"But you know, if you start getting into"},{"from":1777.4,"to":1781.86,"location":2,"content":"more technical and scientific English it's easy to get to a million words."},{"from":1781.86,"to":1785.69,"location":2,"content":"I mean, actually the number of words that you have in a language, um,"},{"from":1785.69,"to":1788.62,"location":2,"content":"like English is actually infinite because we have"},{"from":1788.62,"to":1792.22,"location":2,"content":"these processes which are called derivational morphology,"},{"from":1792.22,"to":1796.93,"location":2,"content":"um, where you can make more words by adding endings onto existing words."},{"from":1796.93,"to":1799.66,"location":2,"content":"So, you know you can start with something like paternal,"},{"from":1799.66,"to":1803.47,"location":2,"content":"fatherly, and then you can sort of say from paternal,"},{"from":1803.47,"to":1806.28,"location":2,"content":"you can say paternalist, or paternalistic,"},{"from":1806.28,"to":1810.07,"location":2,"content":"paternalism and pa- I did it paternalistically."},{"from":1810.07,"to":1814.26,"location":2,"content":"Right? Now all of these ways that you can make bigger words by adding more stuff into it."},{"from":1814.26,"to":1818.9,"location":2,"content":"Um, and so really you end up with an infinite space of words."},{"from":1818.9,"to":1822.88,"location":2,"content":"Um, yeah. 
So that's a minor problem, right?"},{"from":1822.88,"to":1828.28,"location":2,"content":"We have very big vectors if we want to represent a sensible size vocabulary."},{"from":1828.28,"to":1831.99,"location":2,"content":"Um, but there's a much bigger problem than that, which is, well,"},{"from":1831.99,"to":1835.2,"location":2,"content":"precisely what we want to do all the time, is we want to,"},{"from":1835.2,"to":1838.59,"location":2,"content":"sort of, understand relationships and the meaning of words."},{"from":1838.59,"to":1842.38,"location":2,"content":"So, you know, an obvious example of this is web search."},{"from":1842.38,"to":1845.35,"location":2,"content":"So, if I do a search for Seattle motel,"},{"from":1845.35,"to":1848.71,"location":2,"content":"it'd be useful if it also showed me results that had"},{"from":1848.71,"to":1852.65,"location":2,"content":"Seattle hotel on the page and vice versa because,"},{"from":1852.65,"to":1855.41,"location":2,"content":"you know, hotels and motels are pretty much the same thing."},{"from":1855.41,"to":1859.9,"location":2,"content":"Um, but, you know, if we have these one-hot vectors like we had before, they have"},{"from":1859.9,"to":1864.25,"location":2,"content":"no s- similarity relationship between them, right?"},{"from":1864.25,"to":1865.67,"location":2,"content":"So, in math terms,"},{"from":1865.67,"to":1867.78,"location":2,"content":"these two vectors are orthogonal."},{"from":1867.78,"to":1870.87,"location":2,"content":"No similarity relationship between them."},{"from":1870.87,"to":1872.65,"location":2,"content":"Um, and so you,"},{"from":1872.65,"to":1874.7,"location":2,"content":"kind of, get nowhere."},{"from":1874.7,"to":1876.88,"location":2,"content":"Now, you know, there are things that you could do,"},{"from":1876.88,"to":1878.71,"location":2,"content":"I- I just showed you WordNet."},{"from":1878.71,"to":1880.84,"location":2,"content":"WordNet shows you some synonyms and stuff."},{"from":1880.84,"to":1882.61,"location":2,"content":"So that might help a bit."},{"from":1882.61,"to":1884.04,"location":2,"content":"There are other things you could do."},{"from":1884.04,"to":1885.41,"location":2,"content":"You could sort of say, well wait,"},{"from":1885.41,"to":1889.64,"location":2,"content":"why don't we just build up a big table where we have a big table of,"},{"from":1889.64,"to":1892.67,"location":2,"content":"um, word similarities, and we could work with that."},{"from":1892.67,"to":1894.91,"location":2,"content":"And, you know, people used to try and do that, right?"},{"from":1894.91,"to":1899.77,"location":2,"content":"You know, that's sort of what Google did in 2005 or something."},{"from":1899.77,"to":1902.08,"location":2,"content":"You know, it had word similarity tables."},{"from":1902.08,"to":1904.51,"location":2,"content":"The problem with doing that is, you know,"},{"from":1904.51,"to":1908.29,"location":2,"content":"we were talking about how maybe we want 500,000 words."},{"from":1908.29,"to":1912.04,"location":2,"content":"And if you want to build up then a word similarity table out"},{"from":1912.04,"to":1916.06,"location":2,"content":"of our pairs of words from one-hot representations,"},{"from":1916.06,"to":1918.64,"location":2,"content":"um, you- that means that the size of that table,"},{"from":1918.64,"to":1920.38,"location":2,"content":"and my math is pretty bad,"},{"from":1920.38,"to":1922.32,"location":2,"content":"is it 2.5 trillion?"},{"from":1922.32,"to":1927.13,"location":2,"content":"It's some very 
big number of cells in your similarity, um, matrix."},{"from":1927.13,"to":1929.23,"location":2,"content":"So that's almost impossible to do."},{"from":1929.23,"to":1933.71,"location":2,"content":"So, what we're gonna instead do is explore a method in which,"},{"from":1933.71,"to":1936.67,"location":2,"content":"um, we are going to represent words as vectors,"},{"from":1936.67,"to":1938.14,"location":2,"content":"in a way I'll show you in just, um,"},{"from":1938.14,"to":1941.77,"location":2,"content":"a minute, in such a way that just the representation of"},{"from":1941.77,"to":1946.48,"location":2,"content":"a word gives you their similarity with no further work."},{"from":1946.48,"to":1950.63,"location":2,"content":"Okay. And so that's gonna lead into these different ideas."},{"from":1950.63,"to":1954.17,"location":2,"content":"So, I mentioned before denotational semantics."},{"from":1954.17,"to":1959.12,"location":2,"content":"Here's another idea for representing the meaning of words,"},{"from":1959.12,"to":1961.98,"location":2,"content":"um, which is called distributional semantics."},{"from":1961.98,"to":1965.14,"location":2,"content":"And so the idea of distributional semantics is, well,"},{"from":1965.14,"to":1970.9,"location":2,"content":"how we are going to represent the meaning of a word is by looking at the contexts,"},{"from":1970.9,"to":1972.92,"location":2,"content":"um, in which it appears."},{"from":1972.92,"to":1976.51,"location":2,"content":"So, this is a picture of JR Firth who was a British linguist."},{"from":1976.51,"to":1978.4,"location":2,"content":"Um, he's famous for this saying,"},{"from":1978.4,"to":1981.54,"location":2,"content":"\"You shall know a word by the company it keeps.\""},{"from":1981.54,"to":1986.95,"location":2,"content":"Um, but another person who's very famous for developing this notion of meaning is, um,"},{"from":1986.95,"to":1990.67,"location":2,"content":"the philosopher Ludwig- Ludwig Wittgenstein in his later writings,"},{"from":1990.67,"to":1993.44,"location":2,"content":"which he referred to as a use theory of meeting- meaning."},{"from":1993.44,"to":1996.07,"location":2,"content":"Well, actually he- he used some big German word that I don't know,"},{"from":1996.07,"to":1998.53,"location":2,"content":"but, um, we'll call it a use theory of meaning."},{"from":1998.53,"to":2002.54,"location":2,"content":"And, you know, essentially the point was, well, you know,"},{"from":2002.54,"to":2006.78,"location":2,"content":"if you can explain every- if- if you can"},{"from":2006.78,"to":2011.16,"location":2,"content":"explain in what contexts it's correct to use a certain word,"},{"from":2011.16,"to":2014.6,"location":2,"content":"versus in what contexts it would be the wrong word to use,"},{"from":2014.6,"to":2018.13,"location":2,"content":"this maybe gives you bad memories of doing English in high school,"},{"from":2018.13,"to":2020.49,"location":2,"content":"when people said, ah, that's the wrong word to use there,"},{"from":2020.49,"to":2023.2,"location":2,"content":"um, well, then you understand the meaning of the word, right?"},{"from":2023.2,"to":2027.05,"location":2,"content":"Um, and so that's the idea of distributional semantics."},{"from":2027.05,"to":2029.79,"location":2,"content":"And it's been- so one of the most successful ideas in"},{"from":2029.79,"to":2034.01,"location":2,"content":"modern statistical NLP because it gives you a great way to learn about word meaning."},{"from":2034.01,"to":2036.62,"location":2,"content":"And so what we're 
gonna do is we're going to say,"},{"from":2036.62,"to":2038.92,"location":2,"content":"haha, I want to know what the word banking means."},{"from":2038.92,"to":2041.73,"location":2,"content":"So, I'm gonna grab a lot of texts,"},{"from":2041.73,"to":2044.52,"location":2,"content":"which is easy to do now when we have the World Wide Web,"},{"from":2044.52,"to":2047.95,"location":2,"content":"I'll find lots of sentences where the word banking is used,"},{"from":2047.95,"to":2052.77,"location":2,"content":"Government debt problems turning into banking crises as happened in 2009."},{"from":2052.77,"to":2055.84,"location":2,"content":"And both these- I'm just going to say all of"},{"from":2055.84,"to":2059.11,"location":2,"content":"this stuff is the meaning of the word banking."},{"from":2059.11,"to":2063.75,"location":2,"content":"Um, that those are the contexts in which the word banking is used."},{"from":2063.75,"to":2069.49,"location":2,"content":"And that seems like a very simple and perhaps even not quite right idea,"},{"from":2069.49,"to":2074.88,"location":2,"content":"but it turns out to be a very usable idea that does a great job at capturing meaning."},{"from":2074.88,"to":2078.3,"location":2,"content":"And so what we're gonna do is say rather than"},{"from":2078.3,"to":2082.95,"location":2,"content":"our old localist representation we're now gonna"},{"from":2082.95,"to":2088.22,"location":2,"content":"represent words in what we call a distributed representation."},{"from":2088.22,"to":2091.83,"location":2,"content":"And so, for the distributed representation we're still going"},{"from":2091.83,"to":2095.66,"location":2,"content":"to [NOISE] represent the meaning of a word as a numeric vector."},{"from":2095.66,"to":2099.48,"location":2,"content":"But now we're going to say that the meaning of each word is,"},{"from":2099.48,"to":2101.52,"location":2,"content":"ah, a smallish vector, um,"},{"from":2101.52,"to":2107.76,"location":2,"content":"but it's going to be a dense vector whereby all of the numbers are non-zero."},{"from":2107.76,"to":2110.01,"location":2,"content":"So the meaning of banking is going to be"},{"from":2110.01,"to":2113.34,"location":2,"content":"distributed over the dim- dimensions of this vector."},{"from":2113.34,"to":2119.19,"location":2,"content":"Um, now, my vector here is of dimension nine because I want to keep the slide, um, nice."},{"from":2119.19,"to":2123.2,"location":2,"content":"Um, life isn't quite that good in practice."},{"from":2123.2,"to":2125.97,"location":2,"content":"When we do this we use a larger dimensionality;"},{"from":2125.97,"to":2129.07,"location":2,"content":"kinda, sort of the minimum that people use is 50."},{"from":2129.07,"to":2132.33,"location":2,"content":"Um, a typical number that you might use on your laptop is"},{"from":2132.33,"to":2135.95,"location":2,"content":"300; if you want to really max out performance,"},{"from":2135.95,"to":2138.89,"location":2,"content":"um, maybe 1,000, 2,000, 4,000."},{"from":2138.89,"to":2142.02,"location":2,"content":"But, you know, nevertheless [NOISE] it's orders of magnitude"},{"from":2142.02,"to":2146.81,"location":2,"content":"smaller compared to a length 500,000 vector."},{"from":2146.81,"to":2151.89,"location":2,"content":"Okay. 
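A minimal sketch of the contrast just described: one-hot vectors are mutually orthogonal, so they carry no notion of similarity at all, while dense vectors can encode it. The vocabulary and the dense values below are invented toys; real dense vectors are learned, as the lecture goes on to show.

```python
import numpy as np

vocab = ["house", "cat", "hotel", "motel"]

def one_hot(word):
    # Localist representation: a 1 at the word's position, 0 everywhere else.
    v = np.zeros(len(vocab))
    v[vocab.index(word)] = 1.0
    return v

# Any two distinct one-hot vectors are orthogonal: the dot product is 0,
# so "hotel" looks no more like "motel" than it does like "cat".
print(one_hot("hotel") @ one_hot("motel"))  # 0.0

# Dense (distributed) vectors: toy hand-picked values; in practice these are
# learned, have dimension 50-300 or more, and every entry is non-zero.
dense = {
    "hotel": np.array([0.29, 0.79, -0.40, 0.11]),
    "motel": np.array([0.31, 0.77, -0.35, 0.14]),
    "cat":   np.array([-0.60, 0.12, 0.55, -0.70]),
}

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine(dense["hotel"], dense["motel"]))  # close to 1.0: similar
print(cosine(dense["hotel"], dense["cat"]))    # much lower
```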
So we have words with their vector representations."},{"from":2151.89,"to":2155.79,"location":2,"content":"And so since each word is going to have a vector, um,"},{"from":2155.79,"to":2161.16,"location":2,"content":"representation we then have a vector space in which we can place all of the words."},{"from":2161.16,"to":2163.98,"location":2,"content":"Um, and that's completely unreadable, um,"},{"from":2163.98,"to":2168.14,"location":2,"content":"but if you zoom into the vector space it's still completely unreadable."},{"from":2168.14,"to":2170.11,"location":2,"content":"But if you zoom in a bit further,"},{"from":2170.11,"to":2173.1,"location":2,"content":"um, you can find different parts of this space."},{"from":2173.1,"to":2176.82,"location":2,"content":"So here's the part where countries tend to,"},{"from":2176.82,"to":2178.95,"location":2,"content":"um, exist: Japanese, German,"},{"from":2178.95,"to":2181.95,"location":2,"content":"French, Russian, British, Australian, American,"},{"from":2181.95,"to":2185.13,"location":2,"content":"um, France, Britain, Germany, et cetera."},{"from":2185.13,"to":2187.77,"location":2,"content":"And you can shift over to a different part of the space."},{"from":2187.77,"to":2191.04,"location":2,"content":"So here's a part of the space where various verbs are,"},{"from":2191.04,"to":2193.49,"location":2,"content":"so has, have, had, been, be."},{"from":2193.49,"to":2200.88,"location":2,"content":"Oops. Um, um, [inaudible] be, always, was, were."},{"from":2200.88,"to":2203.97,"location":2,"content":"You can even see that some morphological forms are grouping together,"},{"from":2203.97,"to":2206.1,"location":2,"content":"and things that sort of go together, like say,"},{"from":2206.1,"to":2208.77,"location":2,"content":"think, expect- things that take those kinds of complements."},{"from":2208.77,"to":2210.8,"location":2,"content":"He said or thought something."},{"from":2210.8,"to":2212.41,"location":2,"content":"Um, they group together."},{"from":2212.41,"to":2215.01,"location":2,"content":"Now, what am I actually showing you here?"},{"from":2215.01,"to":2217.76,"location":2,"content":"Um, you know, really this was built from,"},{"from":2217.76,"to":2220.57,"location":2,"content":"ah, 100 dimensional word vectors."},{"from":2220.57,"to":2225.63,"location":2,"content":"And there is this problem that it's really hard to visualize 100 dimensional word vectors."},{"from":2225.63,"to":2229.86,"location":2,"content":"So, what is actually happening here is these, um,"},{"from":2229.86,"to":2235.11,"location":2,"content":"100 dimensional word vectors are being projected down into two dimensions,"},{"from":2235.11,"to":2237.99,"location":2,"content":"and you're so- seeing the two-dimensional view,"},{"from":2237.99,"to":2239.79,"location":2,"content":"which I'll get back to later."},{"from":2239.79,"to":2242.4,"location":2,"content":"Um, so, on the one hand, um,"},{"from":2242.4,"to":2244.41,"location":2,"content":"whenever you see these pictures you should hold on to"},{"from":2244.41,"to":2246.84,"location":2,"content":"your wallet because there's a huge amount of"},{"from":2246.84,"to":2251.53,"location":2,"content":"detail in the original vector space that got completely killed and went away, um,"},{"from":2251.53,"to":2252.84,"location":2,"content":"in the 2D projection,"},{"from":2252.84,"to":2257.07,"location":2,"content":"and indeed some of what pushes things together in the 2D,"},{"from":2257.07,"to":2259.88,"location":2,"content":"um, projection may 
really, really,"},{"from":2259.88,"to":2262.59,"location":2,"content":"really misrepresent what's in the original space."},{"from":2262.59,"to":2265.74,"location":2,"content":"Um, but even looking at these 2D representations,"},{"from":2265.74,"to":2266.85,"location":2,"content":"the overall feeling is,"},{"from":2266.85,"to":2268.92,"location":2,"content":"my gosh, this actually sort of works, doesn't it?"},{"from":2268.92,"to":2274.36,"location":2,"content":"Um, we can sort of see similarities, um, between words."},{"from":2274.36,"to":2282.38,"location":2,"content":"Okay. So, um, ha- so that was the idea of what we want to do."},{"from":2282.38,"to":2284.31,"location":2,"content":"Um, the next part, um,"},{"from":2284.31,"to":2287.94,"location":2,"content":"is then how do we actually go about doing it?"},{"from":2287.94,"to":2290.45,"location":2,"content":"I'll pause for breath for half a minute."},{"from":2290.45,"to":2292.71,"location":2,"content":"Has anyone got a question they're dying to ask?"},{"from":2292.71,"to":2300.3,"location":2,"content":"[NOISE] Yeah."},{"from":2300.3,"to":2306.72,"location":2,"content":"For the- the vectors, does each, um,"},{"from":2306.72,"to":2308.46,"location":2,"content":"dimension have a different order in each context,"},{"from":2308.46,"to":2310.53,"location":2,"content":"like, say the first element of the vector,"},{"from":2310.53,"to":2312.84,"location":2,"content":"second element of the vector- are those standard"},{"from":2312.84,"to":2315.47,"location":2,"content":"across all the theory or do people choose them themselves?"},{"from":2315.47,"to":2322.34,"location":2,"content":"Um, they're not standard across NLP, um, and they're not chosen at all."},{"from":2322.34,"to":2325.05,"location":2,"content":"So what we're gonna present is a learning algorithm."},{"from":2325.05,"to":2328.43,"location":2,"content":"So where we just sort of shovel in lots of text"},{"from":2328.43,"to":2331.97,"location":2,"content":"and miraculously these word vectors come out."},{"from":2331.97,"to":2337.76,"location":2,"content":"And so the l- learning algorithm itself decides the dimensions."},{"from":2337.76,"to":2343.09,"location":2,"content":"But um, that actually reminds me of something I sort of meant to say, which was, yeah,"},{"from":2343.09,"to":2345.43,"location":2,"content":"I mean, since this is a vector space,"},{"from":2345.43,"to":2349.58,"location":2,"content":"in some sense the dimensions are sort of arbitrary, right,"},{"from":2349.58,"to":2352.57,"location":2,"content":"because you can, you know, just have your basis vectors in"},{"from":2352.57,"to":2355.95,"location":2,"content":"any different direction and you could sort of re-represent,"},{"from":2355.95,"to":2359.72,"location":2,"content":"um, the words in the vector space with a different set of basis-"},{"from":2359.72,"to":2362.93,"location":2,"content":"basis vectors, and it'd be exactly the same vector space,"},{"from":2362.93,"to":2366.38,"location":2,"content":"just sort of rotated around to your new, um, vectors."},{"from":2366.38,"to":2370.58,"location":2,"content":"So, you know, you shouldn't read too much into the sort of elements."},{"from":2370.58,"to":2372.86,"location":2,"content":"So, it actually turns out that because of the way a lot of"},{"from":2372.86,"to":2376.07,"location":2,"content":"deep learning, um, operations work,"},{"from":2376.07,"to":2378.17,"location":2,"content":"some things they do, they do element-wise."},{"from":2378.17,"to":2382.78,"location":2,"content":"So the dimensions do actually tend to get 
some meaning to them, it turns out."},{"from":2382.78,"to":2386.9,"location":2,"content":"But um, what I think I really wanted to say was,"},{"from":2386.9,"to":2392.24,"location":2,"content":"that you know one thing we can just think of is how close things"},{"from":2392.24,"to":2394.25,"location":2,"content":"are in the vector space and that's"},{"from":2394.25,"to":2397.8,"location":2,"content":"a notion of meaning similarity that we are going to exploit."},{"from":2397.8,"to":2400.64,"location":2,"content":"But you might hope that you get more than that,"},{"from":2400.64,"to":2403.01,"location":2,"content":"and you might actually think that there's meaning in"},{"from":2403.01,"to":2406.93,"location":2,"content":"different dimensions and directions in the word vector space."},{"from":2406.93,"to":2411.34,"location":2,"content":"And the answer to that is there is, and I'll come back to that a bit later."},{"from":2411.34,"to":2417.77,"location":2,"content":"Okay. Um, so in some sense the thing that had"},{"from":2417.77,"to":2422.24,"location":2,"content":"the biggest impact um in sort of turning the world of"},{"from":2422.24,"to":2427.63,"location":2,"content":"NLP in a neural networks direction was that picture."},{"from":2427.63,"to":2432.26,"location":2,"content":"Um, it was this um algorithm that um"},{"from":2432.26,"to":2437.34,"location":2,"content":"Tomas Mikolov came up with in 2013 called the word2vec algorithm."},{"from":2437.34,"to":2443.21,"location":2,"content":"So it wasn't the first work on having distributed representations of words."},{"from":2443.21,"to":2445.73,"location":2,"content":"So there was older work from Yoshua Bengio that went"},{"from":2445.73,"to":2448.37,"location":2,"content":"back to about the sort of turn of the millennium,"},{"from":2448.37,"to":2452.78,"location":2,"content":"that somehow sort of hadn't really hit the world over the head and had"},{"from":2452.78,"to":2457.73,"location":2,"content":"a huge impact, and it was really sort of Tomas Mikolov who showed this very simple,"},{"from":2457.73,"to":2460.07,"location":2,"content":"very scalable way of learning"},{"from":2460.07,"to":2465.01,"location":2,"content":"vector representations of um words, and that sort of really opened the floodgates."},{"from":2465.01,"to":2468.65,"location":2,"content":"And so that's the algorithm that I'm going to um show now."},{"from":2468.65,"to":2475.78,"location":2,"content":"Okay. 
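As an aside on the 2D pictures mentioned above, here is a sketch of the kind of projection behind them. The word list and the vectors are random stand-ins (real plots typically apply PCA or t-SNE to trained vectors), and the lecture's caveat applies: the projection discards most of the detail in the original space.

```python
import numpy as np

rng = np.random.default_rng(0)
words = ["has", "have", "had", "France", "Germany", "Britain"]
vecs = rng.normal(size=(len(words), 100))  # stand-ins for 100-d word vectors

# PCA via SVD: center the data, then project onto the top-2 principal axes.
centered = vecs - vecs.mean(axis=0)
_, _, Vt = np.linalg.svd(centered, full_matrices=False)
coords = centered @ Vt[:2].T  # shape (n_words, 2): what you would plot

for w, (x, y) in zip(words, coords):
    print(f"{w:>8}: ({x:+.2f}, {y:+.2f})")
```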
So the idea of this algorithm is you start with a big pile of text."},{"from":2475.78,"to":2480.65,"location":2,"content":"Um, so wherever you find, you know, web pages or newspaper articles or something,"},{"from":2480.65,"to":2482.48,"location":2,"content":"a lot of continuous text, right?"},{"from":2482.48,"to":2486.35,"location":2,"content":"Actual sentences, because we want to learn wo- word meaning in context."},{"from":2486.35,"to":2492.47,"location":2,"content":"Um, NLP people call a large pile of text a corpus."},{"from":2492.47,"to":2495.89,"location":2,"content":"And I mean that's just the Latin word for body, right?"},{"from":2495.89,"to":2497.91,"location":2,"content":"It's a body of text."},{"from":2497.91,"to":2503.22,"location":2,"content":"An important thing to note if you want to seem really educated is that in Latin,"},{"from":2503.22,"to":2506.69,"location":2,"content":"this is a fourth declension noun."},{"from":2506.69,"to":2509.9,"location":2,"content":"So the plural of corpus is corpora."},{"from":2509.9,"to":2511.19,"location":2,"content":"And whereas if you say"},{"from":2511.19,"to":2515.39,"location":2,"content":"corpi, everyone will know that you didn't study Latin in high school."},{"from":2515.39,"to":2520.49,"location":2,"content":"[LAUGHTER] Um, okay."},{"from":2520.49,"to":2526.46,"location":2,"content":"Um, so right- so we then want to say that every word um"},{"from":2526.46,"to":2528.89,"location":2,"content":"in a- in a fixed vocabulary, which would just be"},{"from":2528.89,"to":2532.45,"location":2,"content":"the vocabulary of the corpus, is um represented by a vector."},{"from":2532.45,"to":2536.61,"location":2,"content":"And we just start those vectors off as random vectors."},{"from":2536.61,"to":2538.34,"location":2,"content":"And so then what we're going to do is do"},{"from":2538.34,"to":2542.59,"location":2,"content":"this big iterative algorithm where we go through each position in the text."},{"from":2542.59,"to":2544.72,"location":2,"content":"We say, here's a word in the text."},{"from":2544.72,"to":2550.52,"location":2,"content":"Let's look at the words around it and what we're going to want to do is say, well,"},{"from":2550.52,"to":2552.89,"location":2,"content":"the meaning of a word is its contexts of use."},{"from":2552.89,"to":2555.29,"location":2,"content":"So we want the representation of the word"},{"from":2555.29,"to":2557.87,"location":2,"content":"in the middle to be able to predict the words that are"},{"from":2557.87,"to":2563.72,"location":2,"content":"around it and so we're gonna achieve that by moving the position of the word vector."},{"from":2563.72,"to":2567.5,"location":2,"content":"And we just repeat that a billion times and"},{"from":2567.5,"to":2571.19,"location":2,"content":"somehow a miracle occurs and out comes at the end- we have"},{"from":2571.19,"to":2574.79,"location":2,"content":"a word vector space that looks like the picture I showed, where it has"},{"from":2574.79,"to":2579.53,"location":2,"content":"a good representation of word meaning."},{"from":2579.53,"to":2583.09,"location":2,"content":"So slightly more, um,"},{"from":2583.09,"to":2587.24,"location":2,"content":"um, slightly more um graphically, right."},{"from":2587.24,"to":2588.44,"location":2,"content":"So here's the situation."},{"from":2588.44,"to":2592.84,"location":2,"content":"So we've got part of our corpus: problems turning into banking crises,"},{"from":2592.84,"to":2594.29,"location":2,"content":"and so what we want to say is 
well,"},{"from":2594.29,"to":2597.72,"location":2,"content":"we want to know the meaning of the word into and so we're going to hope that"},{"from":2597.72,"to":2601.4,"location":2,"content":"its representation can be used in a way that I'll"},{"from":2601.4,"to":2604.82,"location":2,"content":"make precise to predict what words appear in"},{"from":2604.82,"to":2608.6,"location":2,"content":"the context of into, because that's the meaning of into."},{"from":2608.6,"to":2611.53,"location":2,"content":"And so we're going to try and make those predictions,"},{"from":2611.53,"to":2614.86,"location":2,"content":"see how well we can predict and then change"},{"from":2614.86,"to":2619.47,"location":2,"content":"the vector representations of words in a way that we can do that prediction better."},{"from":2619.47,"to":2621.32,"location":2,"content":"And then once we've dealt with into,"},{"from":2621.32,"to":2623.76,"location":2,"content":"we just go onto the next word and we say,"},{"from":2623.76,"to":2626.06,"location":2,"content":"okay, let's take banking as the word."},{"from":2626.06,"to":2629.8,"location":2,"content":"The meaning of banking is predicting the contexts in which banking occurs."},{"from":2629.8,"to":2631.26,"location":2,"content":"Here's one context."},{"from":2631.26,"to":2634.55,"location":2,"content":"Let's try and predict these words that occur around banking and"},{"from":2634.55,"to":2638.74,"location":2,"content":"see how we do and then we'll move on again from there."},{"from":2638.74,"to":2642.47,"location":2,"content":"Okay. Um, sounds easy so far."},{"from":2642.47,"to":2646.1,"location":2,"content":"Um, [NOISE] now we go on and sort of do a bit more stuff."},{"from":2646.1,"to":2652.46,"location":2,"content":"Okay. So overall, we have a big long corpus of capital T words."},{"from":2652.46,"to":2657.13,"location":2,"content":"So if we have a whole lot of documents, we just concatenate them all together and we say,"},{"from":2657.13,"to":2659.01,"location":2,"content":"okay, here's a billion words,"},{"from":2659.01,"to":2661.74,"location":2,"content":"and so a big long list of words."},{"from":2661.74,"to":2663.3,"location":2,"content":"And so what we're gonna do,"},{"from":2663.3,"to":2666.88,"location":2,"content":"is for the first um product we're going to sort of"},{"from":2666.88,"to":2670.95,"location":2,"content":"go through all the words and then for the second product,"},{"from":2670.95,"to":2674.63,"location":2,"content":"we're gonna say- we're gonna choose some fixed size window, you know,"},{"from":2674.63,"to":2677.99,"location":2,"content":"it might be five words on each side or something, and we're going to try and"},{"from":2677.99,"to":2682.01,"location":2,"content":"predict the 10 words that are around that center word."},{"from":2682.01,"to":2684.2,"location":2,"content":"And we're going to predict in the sense of trying to"},{"from":2684.2,"to":2686.78,"location":2,"content":"predict that word given the center word."},{"from":2686.78,"to":2688.46,"location":2,"content":"That's our probability model."},{"from":2688.46,"to":2691.18,"location":2,"content":"And so if we multiply all those things together,"},{"from":2691.18,"to":2694.61,"location":2,"content":"that's our model likelihood- how good a job it"},{"from":2694.61,"to":2698.38,"location":2,"content":"does at predicting the words around every word."},{"from":2698.38,"to":2701.6,"location":2,"content":"And that model likelihood is going to 
depend"},{"from":2701.6,"to":2705.18,"location":2,"content":"on the parameters of our model, which we write as theta."},{"from":2705.18,"to":2707.86,"location":2,"content":"And in this particular model,"},{"from":2707.86,"to":2710.69,"location":2,"content":"the only parameters in it are actually"},{"from":2710.69,"to":2713.81,"location":2,"content":"going to be the vector representations we give the words."},{"from":2713.81,"to":2716.95,"location":2,"content":"The model has absolutely no other parameters to it."},{"from":2716.95,"to":2720.05,"location":2,"content":"So, we're just going to say we're representing"},{"from":2720.05,"to":2723.7,"location":2,"content":"a word with a vector in a vector space and that"},{"from":2723.7,"to":2727.88,"location":2,"content":"representation of it is its meaning and we're then going to be able to"},{"from":2727.88,"to":2732.34,"location":2,"content":"use that to predict what other words occur in a way I'm about to show you."},{"from":2732.34,"to":2737.24,"location":2,"content":"Okay. So, um, that's our likelihood and so what we do in all of"},{"from":2737.24,"to":2742.28,"location":2,"content":"these models is we sort of define an objective function and then we're going to,"},{"from":2742.28,"to":2745.88,"location":2,"content":"um, want to come up with vector representations of words in"},{"from":2745.88,"to":2750.74,"location":2,"content":"such a way as to minimize our objective function."},{"from":2750.74,"to":2756.38,"location":2,"content":"Um, so the objective function is basically the same as what's on the top half of the slide,"},{"from":2756.38,"to":2758.05,"location":2,"content":"but we change a couple of things."},{"from":2758.05,"to":2763.04,"location":2,"content":"We stick a minus sign in front of it so we can do minimization rather than maximization."},{"from":2763.04,"to":2765.51,"location":2,"content":"Completely arbitrary, makes no difference."},{"from":2765.51,"to":2768.13,"location":2,"content":"Um, we stick a one on T in front of it,"},{"from":2768.13,"to":2771.8,"location":2,"content":"so that we're working out the sort of average"},{"from":2771.8,"to":2776.15,"location":2,"content":"goodness of predicting for each choice of center word."},{"from":2776.15,"to":2779.36,"location":2,"content":"Again, that sort of makes no difference but it kinda keeps the scale of"},{"from":2779.36,"to":2783.09,"location":2,"content":"things ah not dependent on the size of the corpus."},{"from":2783.09,"to":2787.24,"location":2,"content":"Um, the bit that's actually important is we stick a log in front of"},{"from":2787.24,"to":2791.69,"location":2,"content":"the function that was up there um because it turns out that everything always gets nicer"},{"from":2791.69,"to":2793.8,"location":2,"content":"when you stick logs in front of products,"},{"from":2793.8,"to":2796.37,"location":2,"content":"um, when you're doing things like optimization."},{"from":2796.37,"to":2798.86,"location":2,"content":"So, when we do that we've then got a log of"},{"from":2798.86,"to":2802.43,"location":2,"content":"all these products, which will allow us to turn things, you know,"},{"from":2802.43,"to":2806.3,"location":2,"content":"into sums of the log of this probability,"},{"from":2806.3,"to":2810.76,"location":2,"content":"and we'll go through that again um in just a minute."},{"from":2810.76,"to":2815.42,"location":2,"content":"Okay. 
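Spelled out in code, the objective just described looks roughly like the toy sketch below. Assumed details that are not from the lecture: a tiny corpus (the lecture's example sentence), window radius 2, random stand-in vectors, and the softmax form of P(o | c) that the lecture defines in the next few minutes.

```python
import numpy as np

rng = np.random.default_rng(0)
corpus = "problems turning into banking crises as happened in 2009".split()
vocab = sorted(set(corpus))
idx = {w: i for i, w in enumerate(vocab)}
dim, m = 8, 2  # vector dimensionality, window radius

V = rng.normal(scale=0.1, size=(len(vocab), dim))  # center-word vectors v
U = rng.normal(scale=0.1, size=(len(vocab), dim))  # context-word vectors u

def log_p(o, c):
    # log P(o | c): softmax over the dot products u_w . v_c for every w
    # (this is the probability model the lecture introduces next).
    scores = U @ V[c]
    return scores[o] - np.log(np.exp(scores).sum())

# J(theta) = -(1/T) * sum over positions t and window offsets j != 0.
T = len(corpus)
total = 0.0
for t in range(T):
    for j in range(-m, m + 1):
        if j != 0 and 0 <= t + j < T:
            total += log_p(idx[corpus[t + j]], idx[corpus[t]])
print("J(theta) =", -total / T)  # training would adjust U and V to shrink this
```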
Um, and so if we can mi- if we can change"},{"from":2815.42,"to":2820.86,"location":2,"content":"our vector representations of these words so as to minimize this J of theta,"},{"from":2820.86,"to":2826.54,"location":2,"content":"that means we'll be good at predicting words in the context of another word."},{"from":2826.54,"to":2830.45,"location":2,"content":"So then, that all sounded good but it was all"},{"from":2830.45,"to":2833.96,"location":2,"content":"dependent on having this probability function where you wanna"},{"from":2833.96,"to":2837.02,"location":2,"content":"predict the probability of a word in"},{"from":2837.02,"to":2840.64,"location":2,"content":"the context given the center word, and the question is,"},{"from":2840.64,"to":2843.62,"location":2,"content":"how can you possibly do that?"},{"from":2843.62,"to":2848.39,"location":2,"content":"Um, well um, remember what I said is actually our model is just gonna"},{"from":2848.39,"to":2853.66,"location":2,"content":"have vector representations of words, and those are the only parameters of the model."},{"from":2853.66,"to":2855.65,"location":2,"content":"Now, that's, that's almost true."},{"from":2855.65,"to":2857.11,"location":2,"content":"It's not quite true."},{"from":2857.11,"to":2859.22,"location":2,"content":"Um, we actually cheat slightly."},{"from":2859.22,"to":2862.4,"location":2,"content":"Since we actually propose two vector representations for"},{"from":2862.4,"to":2866.6,"location":2,"content":"each word, and this makes it simpler to do this."},{"from":2866.6,"to":2868.07,"location":2,"content":"Um, you can not do this-"},{"from":2868.07,"to":2870.62,"location":2,"content":"there are ways to get around it- but this is the simplest way to do it."},{"from":2870.62,"to":2874.61,"location":2,"content":"So we have one vector for a word when it's the center word that's predicting"},{"from":2874.61,"to":2879.5,"location":2,"content":"other words, but we have a second vector for each word when it's a context word,"},{"from":2879.5,"to":2881.22,"location":2,"content":"so that's one of the words in context."},{"from":2881.22,"to":2882.68,"location":2,"content":"So for each word type,"},{"from":2882.68,"to":2886.85,"location":2,"content":"we have these two vectors: as center word, as context word."},{"from":2886.85,"to":2892.7,"location":2,"content":"Um, so then we're gonna work out this probability of a word in the context,"},{"from":2892.7,"to":2894.57,"location":2,"content":"given the center word,"},{"from":2894.57,"to":2902.11,"location":2,"content":"purely in terms of these vectors, and the way we do it is with this equation right here,"},{"from":2902.11,"to":2905.16,"location":2,"content":"which I'll explain more in just a moment."},{"from":2905.16,"to":2909.65,"location":2,"content":"So we're still in exactly the same situation, right?"},{"from":2909.65,"to":2912.05,"location":2,"content":"That we're wanting to work out probabilities of"},{"from":2912.05,"to":2915.66,"location":2,"content":"words occurring in the context of our center word."},{"from":2915.66,"to":2918.78,"location":2,"content":"So the center word is C and the context word is represented with"},{"from":2918.78,"to":2922.37,"location":2,"content":"O, and these [inaudible] slide notation, but sort of,"},{"from":2922.37,"to":2924.89,"location":2,"content":"we're basically saying there's one kind of"},{"from":2924.89,"to":2927.59,"location":2,"content":"vector for center words and a different kind of vector"},{"from":2927.59,"to":2933.66,"location":2,"content":"for context 
words and we're gonna work out this probabilistic prediction um,"},{"from":2933.66,"to":2936.47,"location":2,"content":"in terms of these word vectors."},{"from":2936.47,"to":2939.26,"location":2,"content":"Okay. So how can we do that?"},{"from":2939.26,"to":2942.95,"location":2,"content":"Well, the way we do it is with this um,"},{"from":2942.95,"to":2947.87,"location":2,"content":"formula here, which is the sort of shape that you see over and over again um,"},{"from":2947.87,"to":2950.3,"location":2,"content":"in deep learning with categorical stuff."},{"from":2950.3,"to":2952.67,"location":2,"content":"So for the very center bit of it,"},{"from":2952.67,"to":2957.82,"location":2,"content":"the bit in orange- more or less the same thing occurs in the um, denominator."},{"from":2957.82,"to":2961.13,"location":2,"content":"What we're doing there is calculating a dot product."},{"from":2961.13,"to":2964.46,"location":2,"content":"So, we're gonna go through the components of our vector and we're gonna"},{"from":2964.46,"to":2968.75,"location":2,"content":"multiply them together and that means if um,"},{"from":2968.75,"to":2972.8,"location":2,"content":"different words have big components of the same sign,"},{"from":2972.8,"to":2975.62,"location":2,"content":"plus or minus, in the same positions,"},{"from":2975.62,"to":2978.92,"location":2,"content":"the dot product will be big and if"},{"from":2978.92,"to":2982.46,"location":2,"content":"they have different signs or one is big and one is small,"},{"from":2982.46,"to":2984.41,"location":2,"content":"the dot product will be a lot smaller."},{"from":2984.41,"to":2988.1,"location":2,"content":"So that orange part directly calculates uh,"},{"from":2988.1,"to":2991.67,"location":2,"content":"sort of a similarity between words, where"},{"from":2991.67,"to":2995.34,"location":2,"content":"the similarity is the sort of vectors looking the same, right?"},{"from":2995.34,"to":2997.61,"location":2,"content":"Um, and so that's the heart of it, right?"},{"from":2997.61,"to":3000.13,"location":2,"content":"So we're gonna have words that have similar vectors,"},{"from":3000.13,"to":3004.24,"location":2,"content":"i.e. close together in the vector space, have similar meaning."},{"from":3004.24,"to":3006.58,"location":2,"content":"Um, so for the rest of it- um,"},{"from":3006.58,"to":3010.33,"location":2,"content":"so the next thing we do is take that number and put an exp around it."},{"from":3010.33,"to":3012.1,"location":2,"content":"So, um, the exponential has"},{"from":3012.1,"to":3015.3,"location":2,"content":"this nice property that no matter what number you stick into it,"},{"from":3015.3,"to":3017.84,"location":2,"content":"because the dot product might be positive or negative,"},{"from":3017.84,"to":3020.89,"location":2,"content":"it's gonna come out as a positive number, and if"},{"from":3020.89,"to":3024.16,"location":2,"content":"we eventually wanna get a probability, um, that's really good,"},{"from":3024.16,"to":3028.45,"location":2,"content":"if we have positive numbers and not negative numbers, um, so that's good."},{"from":3028.45,"to":3033.37,"location":2,"content":"Um, then the third part, which is the bit in blue, is we wanted to have"},{"from":3033.37,"to":3036.07,"location":2,"content":"probabilities and probabilities are meant to add up to"},{"from":3036.07,"to":3039.97,"location":2,"content":"one and so we do that in the standard, dumbest possible way."},{"from":3039.97,"to":3042.2,"location":2,"content":"We sum up what this quantity 
is,"},{"from":3042.2,"to":3047.08,"location":2,"content":"for every different word in our vocabulary and we divide through by"},{"from":3047.08,"to":3052.32,"location":2,"content":"it and so that normalizes things and turns them into a probability distribution."},{"from":3052.32,"to":3054.68,"location":2,"content":"Yeah, so, sort of in practice,"},{"from":3054.68,"to":3055.99,"location":2,"content":"there are two parts."},{"from":3055.99,"to":3059.11,"location":2,"content":"There's the orange part, which is this idea of using"},{"from":3059.11,"to":3063.58,"location":2,"content":"dot product in a vector space as our similarity measure between words,"},{"from":3063.58,"to":3067.48,"location":2,"content":"and then the second part is all the rest of it, where we feed it"},{"from":3067.48,"to":3071.66,"location":2,"content":"through what we refer to- and use all the time- as a softmax distribution."},{"from":3071.66,"to":3077.53,"location":2,"content":"So the two parts, the exp and normalizing, give you a softmax distribution."},{"from":3077.53,"to":3082.12,"location":2,"content":"Um, and softmax functions will sort of map any numbers into"},{"from":3082.12,"to":3086.95,"location":2,"content":"a probability distribution, always, for the two reasons that I gave, and so,"},{"from":3086.95,"to":3090,"location":2,"content":"it's referred to as a softmax um,"},{"from":3090,"to":3093.53,"location":2,"content":"because it works like a soft max, right?"},{"from":3093.53,"to":3095.04,"location":2,"content":"So if you have numbers,"},{"from":3095.04,"to":3099.74,"location":2,"content":"you could just say what's the max of these numbers, um,"},{"from":3099.74,"to":3106.81,"location":2,"content":"and you know, that's sort of a ha- if you sort of map your original numbers into,"},{"from":3106.81,"to":3109.39,"location":2,"content":"if it's the max, the max, and everything else is zero,"},{"from":3109.39,"to":3111.16,"location":2,"content":"that's sort of a hard max."},{"from":3111.16,"to":3116.93,"location":2,"content":"Um, soft- this is a softmax because the exponenti- you know,"},{"from":3116.93,"to":3120.31,"location":2,"content":"if you sort of imagine this but- if we just ignore the problem of"},{"from":3120.31,"to":3124.32,"location":2,"content":"negative numbers for a moment and you got rid of the exp, um,"},{"from":3124.32,"to":3126.22,"location":2,"content":"then you'd sort of come out with"},{"from":3126.22,"to":3129.64,"location":2,"content":"a probability distribution, but by and large it'd be fairly"},{"from":3129.64,"to":3132.07,"location":2,"content":"flat and wouldn't particularly pick out the max of"},{"from":3132.07,"to":3135.31,"location":2,"content":"the different x_i numbers, whereas when you exponentiate them,"},{"from":3135.31,"to":3138.67,"location":2,"content":"that sort of makes big numbers way bigger, and so, this,"},{"from":3138.67,"to":3145.99,"location":2,"content":"this softmax sort of mainly puts mass where the max or the couple of maxes are."},{"from":3145.99,"to":3149.92,"location":2,"content":"Um, so that's the max part, and the soft part is that this isn't"},{"from":3149.92,"to":3154.9,"location":2,"content":"a hard decision- it still spreads a little bit of probability mass everywhere else."},{"from":3154.9,"to":3160.54,"location":2,"content":"Okay, so now we have uh, a loss function."},{"from":3160.54,"to":3165.16,"location":2,"content":"We have a loss function with a probability model on the inside that we can"},{"from":3165.16,"to":3170.23,"location":2,"content":"build and 
so what we want to be able to do is then um,"},{"from":3170.23,"to":3175.69,"location":2,"content":"move our vector representations of words around"},{"from":3175.69,"to":3181.07,"location":2,"content":"so that they are good at predicting what words occur in the context of other words."},{"from":3181.07,"to":3186.4,"location":2,"content":"Um, and so, at this point what we're gonna do is optimization."},{"from":3186.4,"to":3190.47,"location":2,"content":"So, we have vector components of different words."},{"from":3190.47,"to":3193.18,"location":2,"content":"We have a very high-dimensional space again, but here,"},{"from":3193.18,"to":3196.27,"location":2,"content":"I've just got two for the picture, and we're gonna wanna"},{"from":3196.27,"to":3199.51,"location":2,"content":"say how- how can we minimize this function, and we're going to"},{"from":3199.51,"to":3203.92,"location":2,"content":"want to jiggle the numbers that are used in the word representations in"},{"from":3203.92,"to":3208.99,"location":2,"content":"such a way that we're walking down the slope of this space,"},{"from":3208.99,"to":3212.09,"location":2,"content":"i.e. walking down the gradient, and um,"},{"from":3212.09,"to":3217.33,"location":2,"content":"when we've minimized the function we've found good representations for words."},{"from":3217.33,"to":3219.78,"location":2,"content":"So doing this for this case,"},{"from":3219.78,"to":3222.07,"location":2,"content":"we want to make a very big vector in"},{"from":3222.07,"to":3225.4,"location":2,"content":"a very high-dimensional vector space of all the parameters of"},{"from":3225.4,"to":3228.73,"location":2,"content":"our model, and the only parameters that this model"},{"from":3228.73,"to":3233.09,"location":2,"content":"has are literally the vector space representations of words."},{"from":3233.09,"to":3236.17,"location":2,"content":"So if there are 100 dimensional word representations,"},{"from":3236.17,"to":3239.32,"location":2,"content":"there are sort of 100 parameters for aardvark as context,"},{"from":3239.32,"to":3243.4,"location":2,"content":"100 parameters for the word a- in context, et cetera, going through,"},{"from":3243.4,"to":3248.02,"location":2,"content":"100 parameters for the word aardvark [NOISE] as a center word, et cetera,"},{"from":3248.02,"to":3252.52,"location":2,"content":"et cetera- that gives us a big vector of parameters to"},{"from":3252.52,"to":3258.26,"location":2,"content":"optimize, and we're gonna run this optimization and then um, move them down."},{"from":3258.26,"to":3263.74,"location":2,"content":"Um, [NOISE] yeah so that's essentially what you do."},{"from":3263.74,"to":3266.36,"location":2,"content":"Um, I sort of wanted to go through um,"},{"from":3266.36,"to":3268.99,"location":2,"content":"the details of this um,"},{"from":3268.99,"to":3272.44,"location":2,"content":"just so we've kind of gone through things concretely to"},{"from":3272.44,"to":3276.07,"location":2,"content":"make sure everyone is on the same page."},{"from":3276.07,"to":3279.47,"location":2,"content":"Um, so I suspect that, you know,"},{"from":3279.47,"to":3283.51,"location":2,"content":"if I try and do this concretely,"},{"from":3283.51,"to":3285.86,"location":2,"content":"um, there are a lot of people um,"},{"from":3285.86,"to":3290.83,"location":2,"content":"that this will bore, and some people that it will bore very badly,"},{"from":3290.83,"to":3294.41,"location":2,"content":"um, so I apologize to 
you,"},{"from":3294.41,"to":3295.81,"location":2,"content":"um, but you know,"},{"from":3295.81,"to":3299.14,"location":2,"content":"I'm hoping and thinking that there's probably"},{"from":3299.14,"to":3302.65,"location":2,"content":"some people who haven't done as much of this stuff recently"},{"from":3302.65,"to":3305.74,"location":2,"content":"and it might just actually be good to do it concretely"},{"from":3305.74,"to":3309.76,"location":2,"content":"and get everyone up to speed right at the beginning. Yeah?"},{"from":3309.76,"to":3314.68,"location":2,"content":"[inaudible] how do we calculate [inaudible] specifically?"},{"from":3314.68,"to":3320.28,"location":2,"content":"Well, so, we- so the way we calculate the,"},{"from":3320.28,"to":3326.05,"location":2,"content":"the U and V vectors is we're literally going to start with a random vector for"},{"from":3326.05,"to":3333.01,"location":2,"content":"each word and then we're iteratively going to change those vectors a little bit as we learn."},{"from":3333.01,"to":3337.14,"location":2,"content":"And the way we're going to work out how to change them is we're gonna say,"},{"from":3337.14,"to":3342.4,"location":2,"content":"\"I want to do optimization,\" and that is going to be implemented as, okay."},{"from":3342.4,"to":3344.83,"location":2,"content":"We have the current vectors for each word."},{"from":3344.83,"to":3351.55,"location":2,"content":"Let me do some calculus to work out how I could change the word vectors, um, so"},{"from":3351.55,"to":3355.78,"location":2,"content":"that the word vectors would calculate a higher probability for"},{"from":3355.78,"to":3360.16,"location":2,"content":"the words that actually occur in the context of this center word."},{"from":3360.16,"to":3361.86,"location":2,"content":"And we will do that,"},{"from":3361.86,"to":3363.93,"location":2,"content":"and we'll do it again and again and again,"},{"from":3363.93,"to":3366.76,"location":2,"content":"and then we'll eventually end up with good word vectors."},{"from":3366.76,"to":3368.26,"location":2,"content":"Thank you for that question,"},{"from":3368.26,"to":3370.78,"location":2,"content":"cause that's a concept that you're meant to have understood."},{"from":3370.78,"to":3373.33,"location":2,"content":"That's how this works- and maybe I didn't"},{"from":3373.33,"to":3376.64,"location":2,"content":"explain that high-level recipe well enough, yeah."},{"from":3376.64,"to":3380.41,"location":2,"content":"Okay, so yeah, so let's just go through it. So, we've seen it, right?"},{"from":3380.41,"to":3384.07,"location":2,"content":"So, we had this formula that we wanted to maximize, you know,"},{"from":3384.07,"to":3392.41,"location":2,"content":"our original function, which was the product of T equals one to T,"},{"from":3392.41,"to":3395.99,"location":2,"content":"and then the product of the words at, uh,"},{"from":3395.99,"to":3400.72,"location":2,"content":"positions minus M less than or equal to J,"},{"from":3400.72,"to":3402.46,"location":2,"content":"less than or equal to M,"},{"from":3402.46,"to":3406,"location":2,"content":"J not equal to zero of, um,"},{"from":3406,"to":3411.64,"location":2,"content":"the probability of W 
T"},{"from":3411.64,"to":3417.7,"location":2,"content":"plus J given W T according to the parameters of our model."},{"from":3417.7,"to":3421.33,"location":2,"content":"Okay, and then we'd already seen that we were gonna convert that"},{"from":3421.33,"to":3425.51,"location":2,"content":"into the function that we're going to use, where we have J of Theta,"},{"from":3425.51,"to":3435.49,"location":2,"content":"where we had the minus one on T, of the sum of T equals one to T, of the sum of minus M"},{"from":3435.49,"to":3437.77,"location":2,"content":"less than or equal to J less than or equal to M,"},{"from":3437.77,"to":3447.4,"location":2,"content":"J not equal to zero, of the log of the probability of W T plus J given W"},{"from":3447.4,"to":3451.84,"location":2,"content":"T. Okay, so we had that, and then we had"},{"from":3451.84,"to":3456.49,"location":2,"content":"this formula that the probability of the outside word given"},{"from":3456.49,"to":3466.36,"location":2,"content":"the center word is this formula we just went through, of exp of U_o T V_c over"},{"from":3466.36,"to":3476.77,"location":2,"content":"the sum of W equals one to the vocabulary size of exp of U_w T V_c."},{"from":3476.77,"to":3479.53,"location":2,"content":"Okay, so that's sort of our model."},{"from":3479.53,"to":3483.84,"location":2,"content":"We want to min- minimize this."},{"from":3483.84,"to":3491.23,"location":2,"content":"So, we wanna minimize this, and we want to minimize that by changing these parameters."},{"from":3491.23,"to":3495.41,"location":2,"content":"And these parameters are the contents of these vectors."},{"from":3495.41,"to":3497.64,"location":2,"content":"And so, what we want to do now,"},{"from":3497.64,"to":3503.56,"location":2,"content":"is do calculus, and we wanna say, let's work out, in terms of these parameters, which are,"},{"from":3503.56,"to":3505.96,"location":2,"content":"u and v vectors, um,"},{"from":3505.96,"to":3510.11,"location":2,"content":"for the current values of the parameters, which we initialized randomly,"},{"from":3510.11,"to":3512.05,"location":2,"content":"like, what's the slope of the space?"},{"from":3512.05,"to":3513.49,"location":2,"content":"Where is downhill?"},{"from":3513.49,"to":3515.77,"location":2,"content":"Because if we can work out where downhill is,"},{"from":3515.77,"to":3519.11,"location":2,"content":"we just gotta walk downhill and our model gets better."},{"from":3519.11,"to":3522.01,"location":2,"content":"So, we're gonna take derivatives and work out what"},{"from":3522.01,"to":3525.61,"location":2,"content":"direction downhill is and then we wanna walk that way, yeah."},{"from":3525.61,"to":3530.23,"location":2,"content":"So, why do we wanna maximize that probability and, like,"},{"from":3530.23,"to":3531.8,"location":2,"content":"like going through every word,"},{"from":3531.8,"to":3537.64,"location":2,"content":"it's like [inaudible] given the [inaudible]"},{"from":3537.64,"to":3539.66,"location":2,"content":"So, well, so, so,"},{"from":3539.66,"to":3542.91,"location":2,"content":"I'm wanting to achieve this, um,"},{"from":3542.91,"to":3548.39,"location":2,"content":"what I want to achieve for my distributional notion of meaning is,"},{"from":3548.39,"to":3551.5,"location":2,"content":"I have a meaning for a word- a vector."},{"from":3551.5,"to":3556.81,"location":2,"content":"And that vector knows what words occur in the context of,"},{"from":3556.81,"to":3559.53,"location":2,"content":"um, a word- of 
itself."},{"from":3559.53,"to":3563.02,"location":2,"content":"And knowing what words occur in its context means"},{"from":3563.02,"to":3564.79,"location":2,"content":"it can accurately give"},{"from":3564.79,"to":3568.95,"location":2,"content":"a high probability estimate to those words that occur in the context,"},{"from":3568.95,"to":3572.32,"location":2,"content":"and it will give low probability estimates"},{"from":3572.32,"to":3575.05,"location":2,"content":"to words that don't typically occur in the context."},{"from":3575.05,"to":3577.24,"location":2,"content":"So, you know, if the word is bank,"},{"from":3577.24,"to":3579.55,"location":2,"content":"I'm hoping that words like branch,"},{"from":3579.55,"to":3581.57,"location":2,"content":"and open, and withdrawal,"},{"from":3581.57,"to":3583.36,"location":2,"content":"will be given high probability,"},{"from":3583.36,"to":3585.45,"location":2,"content":"cause they tend to occur with the word bank."},{"from":3585.45,"to":3589.95,"location":2,"content":"And I'm hoping that some other words, um,"},{"from":3589.95,"to":3592.74,"location":2,"content":"like neural network or something, have"},{"from":3592.74,"to":3598.29,"location":2,"content":"a lower probability because they don't tend to occur with the word bank."},{"from":3598.29,"to":3601.53,"location":2,"content":"Okay, um, does that make sense?"},{"from":3601.53,"to":3601.78,"location":2,"content":"Yeah."},{"from":3601.78,"to":3603.73,"location":2,"content":"Yeah. And the other thing I was-"},{"from":3603.73,"to":3606.86,"location":2,"content":"I'd forgotten- meant to comment on was, you know, obviously,"},{"from":3606.86,"to":3610.48,"location":2,"content":"we're not gonna be able to do this super well- it's just not gonna be possible"},{"from":3610.48,"to":3613.18,"location":2,"content":"that we can say all the words in the context are going to"},{"from":3613.18,"to":3615.88,"location":2,"content":"be this word with probability 0.97, right?"},{"from":3615.88,"to":3619.75,"location":2,"content":"Because we're using this one simple probability distribution"},{"from":3619.75,"to":3623.23,"location":2,"content":"to predict all words in our context."},{"from":3623.23,"to":3627.88,"location":2,"content":"So, in particular, we're using it to predict 10 different words generally, right?"},{"from":3627.88,"to":3632.43,"location":2,"content":"So, at best, we can kind of be giving sort of five percent chance to one of them, right?"},{"from":3632.43,"to":3633.82,"location":2,"content":"We can't possibly be,"},{"from":3633.82,"to":3635.95,"location":2,"content":"sort of, guessing right every time."},{"from":3635.95,"to":3637.39,"location":2,"content":"Um, and well, you know,"},{"from":3637.39,"to":3640.26,"location":2,"content":"there are gonna be different contexts with different words in them."},{"from":3640.26,"to":3644.61,"location":2,"content":"So, you know, it's gonna be a very loose model,"},{"from":3644.61,"to":3648.66,"location":2,"content":"but nevertheless, we wanna capture the fact that, you know,"},{"from":3648.66,"to":3651.33,"location":2,"content":"withdrawal is much more likely, um,"},{"from":3651.33,"to":3657.58,"location":2,"content":"to occur near the word bank than something like football."},{"from":3657.58,"to":3661.03,"location":2,"content":"That's, you know, basically what our goal is."},{"from":3661.03,"to":3667.36,"location":2,"content":"Okay, um, yes, so we want to maximize this"},{"from":3667.36,"to":3672.61,"location":2,"content":"by minimizing this, which means we then want to 
do some calculus to work this out."},{"from":3672.61,"to":3674.74,"location":2,"content":"So, what we're then gonna do is,"},{"from":3674.74,"to":3676.72,"location":2,"content":"that we're going to say, well,"},{"from":3676.72,"to":3679.49,"location":2,"content":"these parameters are our word vectors"},{"from":3679.49,"to":3682.63,"location":2,"content":"and we're gonna sort of want to move these word vectors,"},{"from":3682.63,"to":3688.18,"location":2,"content":"um, to, um, work things out as to how to, um, walk downhill."},{"from":3688.18,"to":3692.44,"location":2,"content":"So, the case that I'm going to do now is gonna look at the parameters of"},{"from":3692.44,"to":3698.28,"location":2,"content":"this center word vc and work out how to do things with respect to it."},{"from":3698.28,"to":3700.75,"location":2,"content":"Um, now, that's not the only thing that you wanna do,"},{"from":3700.75,"to":3704.91,"location":2,"content":"you also want to work out the slope with respect to the uo vector."},{"from":3704.91,"to":3707.97,"location":2,"content":"Um, but I'm not gonna do that because time in class is going to run out."},{"from":3707.97,"to":3709.75,"location":2,"content":"So, it'd be really good if you did that one at"},{"from":3709.75,"to":3711.72,"location":2,"content":"home and then you'd feel much more competent."},{"from":3711.72,"to":3717.13,"location":2,"content":"Right, so then, um, so what I'm wanting to do is work out the partial derivative with"},{"from":3717.13,"to":3723.2,"location":2,"content":"respect to my vc vector representation of this quantity,"},{"from":3723.2,"to":3724.81,"location":2,"content":"that we were just looking at."},{"from":3724.81,"to":3728.29,"location":2,"content":"Which is, um, the quantity in here,"},{"from":3728.29,"to":3731.98,"location":2,"content":"um, where we're taking the log of that quantity."},{"from":3731.98,"to":3737.56,"location":2,"content":"Right, the log of the exp of u,"},{"from":3737.56,"to":3740.14,"location":2,"content":"o, T, v, c,"},{"from":3740.14,"to":3746.83,"location":2,"content":"over the sum of W equals one to V of the exp of u,"},{"from":3746.83,"to":3750.22,"location":2,"content":"w, T, v, c. Okay,"},{"from":3750.22,"to":3753.22,"location":2,"content":"so this, um, so now we have a log of a division,"},{"from":3753.22,"to":3755.7,"location":2,"content":"so that's easy to rewrite, um,"},{"from":3755.7,"to":3759.59,"location":2,"content":"so that we have the partial derivative of the log of"},{"from":3759.59,"to":3767.56,"location":2,"content":"the numerator, minus- and"},{"from":3767.56,"to":3769.69,"location":2,"content":"I can distribute the partial derivative-"},{"from":3769.69,"to":3773.39,"location":2,"content":"so I can have minus the partial derivative,"},{"from":3773.39,"to":3776.68,"location":2,"content":"um, of the denominator,"},{"from":3776.68,"to":3779.71,"location":2,"content":"um, which is log of this thing."},{"from":3779.71,"to":3788.86,"location":2,"content":"[NOISE]"},{"from":3788.86,"to":3799.19,"location":2,"content":"Okay. Um, so this is sort of what was the numerator and this is what was the denominator."},{"from":3799.19,"to":3807.06,"location":2,"content":"Okay. 
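Written out on paper, the split just performed is the following (a reconstruction in the lecture's u and v notation, not a verbatim slide):

```latex
\frac{\partial}{\partial v_c}\log p(o \mid c)
  = \frac{\partial}{\partial v_c}\log\frac{\exp(u_o^{\top} v_c)}{\sum_{w=1}^{V}\exp(u_w^{\top} v_c)}
  = \underbrace{\frac{\partial}{\partial v_c}\log\exp(u_o^{\top} v_c)}_{\text{numerator part}}
  - \underbrace{\frac{\partial}{\partial v_c}\log\sum_{w=1}^{V}\exp(u_w^{\top} v_c)}_{\text{denominator part}}
```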
So, um, the part that was the numerator is really easy."},{"from":3807.06,"to":3809.13,"location":2,"content":"In fact maybe I can fit it in here."},{"from":3809.13,"to":3813.45,"location":2,"content":"Um, so log and exp are just inverses of each other,"},{"from":3813.45,"to":3814.8,"location":2,"content":"so they cancel out."},{"from":3814.8,"to":3823.65,"location":2,"content":"So, we've got the partial derivative of U_o T V_c."},{"from":3823.65,"to":3827.46,"location":2,"content":"Okay, so at this point I should, um, just, um,"},{"from":3827.46,"to":3831.63,"location":2,"content":"remind people right that this V_c here's a vector of- um,"},{"from":3831.63,"to":3836.13,"location":2,"content":"it's still a vector right because we had a 100 dimensional representation of a word."},{"from":3836.13,"to":3840.33,"location":2,"content":"Um, so this is doing multivariate calculus."},{"from":3840.33,"to":3842.79,"location":2,"content":"Um, so you know, if you're,"},{"from":3842.79,"to":3844.53,"location":2,"content":"if you at all, um,"},{"from":3844.53,"to":3846.11,"location":2,"content":"remember any of this stuff,"},{"from":3846.11,"to":3848.18,"location":2,"content":"you can say, \"Ha this is trivial\"."},{"from":3848.18,"to":3852.39,"location":2,"content":"The answer to that is you are done, um and that's great."},{"from":3852.39,"to":3854.95,"location":2,"content":"But you know, if you're, um, feeling, um,"},{"from":3854.95,"to":3857.55,"location":2,"content":"not so good on all of this stuff, um,"},{"from":3857.55,"to":3859.13,"location":2,"content":"and you wanna sort of, um,"},{"from":3859.13,"to":3862.44,"location":2,"content":"cheat a little on the side and try and work out what it is,"},{"from":3862.44,"to":3864.18,"location":2,"content":"um, you can sort of say,"},{"from":3864.18,"to":3865.98,"location":2,"content":"\"Well, let me um,"},{"from":3865.98,"to":3868.38,"location":2,"content":"work out the partial derivative,"},{"from":3868.38,"to":3874.2,"location":2,"content":"um with respect to one element of this vector like the first element of this vector\"."},{"from":3874.2,"to":3882.87,"location":2,"content":"Well, what I actually got here for this dot product is I have U_o one times V_c one,"},{"from":3882.87,"to":3889.56,"location":2,"content":"plus U_o two times V_c two plus dot, dot,"},{"from":3889.56,"to":3896.91,"location":2,"content":"dot plus U_o 100 times V_c 100, right,"},{"from":3896.91,"to":3902.53,"location":2,"content":"and I'm finding the partial derivative of this with respect to V_c one,"},{"from":3902.53,"to":3905.49,"location":2,"content":"and you hopefully remember that much calculus from high school,"},{"from":3905.49,"to":3909.14,"location":2,"content":"that none of these terms involve V_c one."},{"from":3909.14,"to":3912.66,"location":2,"content":"So, the only thing that's left is this U_o one,"},{"from":3912.66,"to":3915.96,"location":2,"content":"and that's what I've got there for this dimension."},{"from":3915.96,"to":3917.85,"location":2,"content":"So, this particular parameter."},{"from":3917.85,"to":3923.26,"location":2,"content":"But I don't only want to do the first component of the V_c vector,"},{"from":3923.26,"to":3926.74,"location":2,"content":"I also want to do the second component of the V_c vector et cetera,"},{"from":3926.74,"to":3930.63,"location":2,"content":"which means I'm going to end up with all of them"},{"from":3930.63,"to":3935.68,"location":2,"content":"turning up in precisely one of these 
things."},{"from":3935.68,"to":3941.19,"location":2,"content":"Um, and so the end result is I get the vector U_o."},{"from":3941.19,"to":3943.62,"location":2,"content":"Okay. Um, but you know,"},{"from":3943.62,"to":3947.22,"location":2,"content":"if you're sort of getting confused and your brain is falling apart,"},{"from":3947.22,"to":3952.05,"location":2,"content":"I think it can be sort of kind of useful to re- reduce things to sort of um,"},{"from":3952.05,"to":3958.28,"location":2,"content":"single dimensional calculus and actually sort of play out what's actually happening."},{"from":3958.28,"to":3960.84,"location":2,"content":"Um, anyway, this part was easy."},{"from":3960.84,"to":3963.54,"location":2,"content":"The numerator, we get um, U_o."},{"from":3963.54,"to":3968.09,"location":2,"content":"Um, so things aren't quite so nice when we do the denominator."},{"from":3968.09,"to":3971.64,"location":2,"content":"So we now want to have this, um, B_d,"},{"from":3971.64,"to":3977.01,"location":2,"content":"V_c of the log of the sum of W equals"},{"from":3977.01,"to":3982.84,"location":2,"content":"one to the P_x of U_o T V_c."},{"from":3982.84,"to":3985.8,"location":2,"content":"Okay. So, now at this point,"},{"from":3985.8,"to":3987.45,"location":2,"content":"I'm not quite so pretty."},{"from":3987.45,"to":3991.03,"location":2,"content":"We've got this log sum X combination that you see a lot,"},{"from":3991.03,"to":3995.64,"location":2,"content":"and so at this point you have to remember that there was E, the chain rule."},{"from":3995.64,"to":3998.52,"location":2,"content":"Okay. So, what we can say is here's you know,"},{"from":3998.52,"to":4002.54,"location":2,"content":"our function F and here is the body of the function,"},{"from":4002.54,"to":4006.24,"location":2,"content":"and so what we want to do is um,"},{"from":4006.24,"to":4008.63,"location":2,"content":"do it in two stages."},{"from":4008.63,"to":4011.57,"location":2,"content":"Um, so that at the end of the day,"},{"from":4011.57,"to":4013.43,"location":2,"content":"we've got this V_c at the end."},{"from":4013.43,"to":4017.11,"location":2,"content":"So, we have sort of some function here."},{"from":4017.11,"to":4019.91,"location":2,"content":"There's ultimately a function of V_c,"},{"from":4019.91,"to":4022.22,"location":2,"content":"and so we gonna do with a chain rule."},{"from":4022.22,"to":4025.04,"location":2,"content":"We'll say the chain rule is we first take"},{"from":4025.04,"to":4029.14,"location":2,"content":"the derivative of this outside thing putting in this body,"},{"from":4029.14,"to":4033.68,"location":2,"content":"and then we remember that the derivative of log is one on X."},{"from":4033.68,"to":4042.92,"location":2,"content":"So, we have one over the sum of W equals one to V of the exp of U_o T V_c"},{"from":4042.92,"to":4046.64,"location":2,"content":"and then we need to multiply that by then taking"},{"from":4046.64,"to":4052.61,"location":2,"content":"the derivative of the inside part which is um,"},{"from":4052.61,"to":4060.49,"location":2,"content":"what we have here."},{"from":4060.49,"to":4064.85,"location":2,"content":"Okay. Times the derivative of the inside part with"},{"from":4064.85,"to":4068.6,"location":2,"content":"the important reminder that you need to do a change of variables,"},{"from":4068.6,"to":4073.46,"location":2,"content":"and for the inside part use a different variable that you're summing over."},{"from":4073.46,"to":4080.81,"location":2,"content":"Okay. 
So, now we're trying to find the derivative of a sum of exps."},{"from":4080.81,"to":4085.05,"location":2,"content":"The first thing that we can do is v-very easy."},{"from":4085.05,"to":4088.86,"location":2,"content":"We can move the derivative inside a sum."},{"from":4088.86,"to":4094.43,"location":2,"content":"So, we can rewrite that and have the sum first of the X equals one to"},{"from":4094.43,"to":4100.43,"location":2,"content":"V of the partial derivatives with respect to V_c of the [inaudible]."},{"from":4100.43,"to":4102.57,"location":2,"content":"Um, so that's a little bit of progress."},{"from":4102.57,"to":4106.73,"location":2,"content":"Um and that point we have to sort of do the chain rule again, right."},{"from":4106.73,"to":4113.21,"location":2,"content":"So, here is our function and here's the thing in it again which is some function of V_c."},{"from":4113.21,"to":4117.6,"location":2,"content":"So, we again want to do um, the chain rule."},{"from":4117.6,"to":4121.34,"location":2,"content":"So, [NOISE] we then have well,"},{"from":4121.34,"to":4125.72,"location":2,"content":"the derivative of exp um, is exp."},{"from":4125.72,"to":4134.63,"location":2,"content":"So, we gonna have the sum of X equals one to V of exp of U_x T V_c,"},{"from":4134.63,"to":4140.15,"location":2,"content":"and then we're going to multiply that by the partial derivative with"},{"from":4140.15,"to":4145.7,"location":2,"content":"respect to V_c of the inside U_x T V_c."},{"from":4145.7,"to":4148.16,"location":2,"content":"Well, we saw that one before, so,"},{"from":4148.16,"to":4153.2,"location":2,"content":"the derivative of that is U- well,"},{"from":4153.2,"to":4156.32,"location":2,"content":"yeah, U_x because we're doing it with a different X, right."},{"from":4156.32,"to":4158.78,"location":2,"content":"This then comes out as U_x,"},{"from":4158.78,"to":4163.85,"location":2,"content":"and so we have the sum of the X equals one to"},{"from":4163.85,"to":4170.03,"location":2,"content":"V of this exp U_x T V_c times the U_x."},{"from":4170.03,"to":4174.99,"location":2,"content":"Okay. 
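The second application of the chain rule, as just described, gives for the inner sum:

\[
\frac{\partial}{\partial v_c} \sum_{x=1}^{V} \exp(u_x^\top v_c) = \sum_{x=1}^{V} \exp(u_x^\top v_c)\, u_x
\]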
So, by doing the chain rule twice, we've got that."},{"from":4174.99,"to":4178.19,"location":2,"content":"So, now if we put it together, you know,"},{"from":4178.19,"to":4183.05,"location":2,"content":"the derivative with respect to V_c of the whole thing,"},{"from":4183.05,"to":4186.5,"location":2,"content":"this log of the probability of O given C, right."},{"from":4186.5,"to":4191.21,"location":2,"content":"That for the numerator it was just U_o,"},{"from":4191.21,"to":4194.03,"location":2,"content":"and then we're subtracting,"},{"from":4194.03,"to":4197.65,"location":2,"content":"we had this term here, um,"},{"from":4197.65,"to":4199.73,"location":2,"content":"which is sort of a denominator,"},{"from":4199.73,"to":4203.87,"location":2,"content":"and then we have this term here which is the numerator."},{"from":4203.87,"to":4207.73,"location":2,"content":"So, we're subtracting: in the numerator,"},{"from":4207.73,"to":4212.27,"location":2,"content":"we have the sum of X equals one to V of"},{"from":4212.27,"to":4218.77,"location":2,"content":"the exp of U_x T V_c times U_x,"},{"from":4218.77,"to":4225.4,"location":2,"content":"and then in the denominator, we have um,"},{"from":4225.4,"to":4236.35,"location":2,"content":"the sum of W equals one to V of exp of U_w T V_c."},{"from":4236.35,"to":4240.03,"location":2,"content":"Um, okay, so we kind of get that."},{"from":4240.03,"to":4244.02,"location":2,"content":"Um, oh wait. Yeah. Yeah, I've got it."},{"from":4244.02,"to":4245.9,"location":2,"content":"Yeah, that's right. Um, okay."},{"from":4245.9,"to":4252.17,"location":2,"content":"We kind of get that and then we can sort of just re-arrange this a little."},{"from":4252.17,"to":4256.48,"location":2,"content":"So, we can have this sum right out front,"},{"from":4256.48,"to":4263.28,"location":2,"content":"and we can say that this is sort of a big sum of X equals one to V,"},{"from":4263.28,"to":4269.87,"location":2,"content":"and we can sort of take that U_x out the end and say, okay."},{"from":4269.87,"to":4273.15,"location":2,"content":"Let's call that put over here a U_x,"},{"from":4273.15,"to":4275.09,"location":2,"content":"and if we do that,"},{"from":4275.09,"to":4280.3,"location":2,"content":"sort of an interesting thing has happened because look right here,"},{"from":4280.3,"to":4285.83,"location":2,"content":"we've rediscovered exactly the same form"},{"from":4285.83,"to":4291.43,"location":2,"content":"that we use as our probability distribution for predicting the probability of words."},{"from":4291.43,"to":4297.86,"location":2,"content":"So, this is now simply the probability of X given C according to our model."},{"from":4297.86,"to":4306.15,"location":2,"content":"Um, so we can rewrite this and say that what we're getting is U_o minus the sum of"},{"from":4306.15,"to":4314.8,"location":2,"content":"X equals one to V of the probability of X given C times U_x."},{"from":4314.8,"to":4318.76,"location":2,"content":"This has a kind of an interesting meaning if you think about it."},{"from":4318.76,"to":4321.36,"location":2,"content":"So, this is actually giving us, you know,"},{"from":4321.36,"to":4324.2,"location":2,"content":"our slope in this multi-dimensional space"},{"from":4324.2,"to":4327.22,"location":2,"content":"and how we're getting that slope is we're taking"},{"from":4327.22,"to":4331.28,"location":2,"content":"the observed representation of"},{"from":4331.28,"to":4338.45,"location":2,"content":"the context word and we're subtracting from that what our model 
thinks um,"},{"from":4338.45,"to":4340.95,"location":2,"content":"the context should look like."},{"from":4340.95,"to":4344.47,"location":2,"content":"What does the model think that the context should look like?"},{"from":4344.47,"to":4347.33,"location":2,"content":"This part here is formally an expectation."},{"from":4347.33,"to":4351.4,"location":2,"content":"So, what you're doing is you're finding the weighted average"},{"from":4351.4,"to":4356.38,"location":2,"content":"of the model's representations of each word,"},{"from":4356.38,"to":4359.99,"location":2,"content":"multiplied by the probability of it in the current model."},{"from":4359.99,"to":4365.31,"location":2,"content":"So, this is sort of the expected context word according to our current model,"},{"from":4365.31,"to":4367.46,"location":2,"content":"and so we're taking the difference between"},{"from":4367.46,"to":4372.17,"location":2,"content":"the expected context word and the actual context word that showed up,"},{"from":4372.17,"to":4375.56,"location":2,"content":"and that difference then turns out to exactly give"},{"from":4375.56,"to":4378.89,"location":2,"content":"us the slope as to which direction we should be"},{"from":4378.89,"to":4381.05,"location":2,"content":"walking, changing the word's"},{"from":4381.05,"to":4386.72,"location":2,"content":"representation in order to improve our model's ability to predict."},{"from":4386.72,"to":4391.56,"location":2,"content":"Okay. Um, so we'll,"},{"from":4391.56,"to":4394.1,"location":2,"content":"um, assignment two, um, yeah."},{"from":4394.1,"to":4398.06,"location":2,"content":"So, um, it'll be a great exercise for you guys,"},{"from":4398.06,"to":4400.11,"location":2,"content":"um, to in- um,"},{"from":4400.11,"to":4402.83,"location":2,"content":"to try and do that for the cen-, wait,"},{"from":4402.83,"to":4406.64,"location":2,"content":"um, I did the center words, try to do the context words as well"},{"from":4406.64,"to":4411.13,"location":2,"content":"and show you that you can do the same kind of piece of math and have it work out."},{"from":4411.13,"to":4415.65,"location":2,"content":"Um, if I've just got a few minutes left at the end."},{"from":4415.65,"to":4423.32,"location":2,"content":"Um, what I just wanted to show you if I can get all of this to work right."},{"from":4423.32,"to":4429.95,"location":2,"content":"Um, let's go [inaudible] this way."},{"from":4429.95,"to":4434.2,"location":2,"content":"Okay, find my."},{"from":4434.2,"to":4440.07,"location":2,"content":"Okay. Um, so I just wanted to just show you a quick example."},{"from":4440.07,"to":4441.94,"location":2,"content":"So, for the first assignment,"},{"from":4441.94,"to":4444.17,"location":2,"content":"um, again it's an iPython Notebook."},{"from":4444.17,"to":4449.02,"location":2,"content":"So, if you're all set up you sort of can do Jupyter Notebook."},{"from":4449.02,"to":4452.94,"location":2,"content":"Um, and you have some notebook."},{"from":4452.94,"to":4457.18,"location":2,"content":"Um, here's my little notebook I'm gonna show you,"},{"from":4457.18,"to":4470.94,"location":2,"content":"um, and the trick will be to make this big enough that people can see it."},{"from":4470.94,"to":4475.53,"location":2,"content":"That readable? 
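Assembling the pieces of the derivation gives the "observed minus expected" result just discussed:

\[
\frac{\partial}{\partial v_c} \log P(o \mid c) = u_o - \sum_{x=1}^{V} P(x \mid c)\, u_x
\]

And a minimal numpy sketch of that gradient, just to make the reading concrete; the names U, v_c, and o are illustrative, not from the course code:

    import numpy as np

    def grad_center_vector(U, v_c, o):
        """Gradient of log P(o|c) w.r.t. v_c: u_o minus the expected context vector.

        U   : (V, d) matrix of outside word vectors u_x, one row per vocab word
        v_c : (d,)   center word vector
        o   : index of the observed context word
        """
        scores = U @ v_c                   # u_x^T v_c for every word x
        p = np.exp(scores - scores.max())  # softmax, shifted for numerical stability
        p /= p.sum()                       # p[x] = P(x | c)
        expected = p @ U                   # sum_x P(x|c) u_x: the expected context word
        return U[o] - expected             # observed minus expected

To walk uphill on the log-likelihood you would then nudge v_c a small step in this direction.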
[LAUGHTER] Okay, um,"},{"from":4475.53,"to":4479.21,"location":2,"content":"so right so, so Numpy is the sort of,"},{"from":4479.21,"to":4481.93,"location":2,"content":"um, do math package in Python."},{"from":4481.93,"to":4483.12,"location":2,"content":"You'll want to know about that."},{"from":4483.12,"to":4484.44,"location":2,"content":"If you don't know about it."},{"from":4484.44,"to":4486.44,"location":2,"content":"Um, Matplotlib is sort of the,"},{"from":4486.44,"to":4489.04,"location":2,"content":"one of the most basic graphing packages"},{"from":4489.04,"to":4491.76,"location":2,"content":"if you don't know about that you're going to want to know about it."},{"from":4491.76,"to":4495.9,"location":2,"content":"This is sort of an IPython or Jupyter special that"},{"from":4495.9,"to":4499.76,"location":2,"content":"lets you have an interactive matplotlib um, inside."},{"from":4499.76,"to":4503.68,"location":2,"content":"And if you want to get fancy you can play it- play with your graphic styles."},{"from":4503.68,"to":4506.61,"location":2,"content":"Um, there's that."},{"from":4506.61,"to":4510.47,"location":2,"content":"Scikit-learn is kind of a general machine learning package."},{"from":4510.47,"to":4513.35,"location":2,"content":"Um, Gensim isn't a deep learning package."},{"from":4513.35,"to":4517.59,"location":2,"content":"Gensim is kind of a word similarity package which started off um,"},{"from":4517.59,"to":4520.76,"location":2,"content":"with um, methods like Latent Dirichlet Allocation."},{"from":4520.76,"to":4522.53,"location":2,"content":"If you know about that from modelling words"},{"from":4522.53,"to":4525.94,"location":2,"content":"similarities, that's sort of grown as a good package um,"},{"from":4525.94,"to":4528.57,"location":2,"content":"for doing um, word vectors as well."},{"from":4528.57,"to":4531.65,"location":2,"content":"So, it's quite often used for word vectors and"},{"from":4531.65,"to":4536.1,"location":2,"content":"word similarities that's sort of efficient for doing things at large-scale."},{"from":4536.1,"to":4537.72,"location":2,"content":"Um, yeah."},{"from":4537.72,"to":4541.36,"location":2,"content":"So, I haven't yet told you about, but we will next time,"},{"from":4541.36,"to":4546.4,"location":2,"content":"our own homegrown form of word vectors which are the GloVe word vectors."},{"from":4546.4,"to":4551.27,"location":2,"content":"I'm using them not because it really matters for what I'm showing but you know,"},{"from":4551.27,"to":4555.74,"location":2,"content":"these vectors are conveniently small."},{"from":4555.74,"to":4560.47,"location":2,"content":"It turns out that the vectors that Facebook and Google"},{"from":4560.47,"to":4565.94,"location":2,"content":"distribute are extremely large vocabulary and extremely high dimensional."},{"from":4565.94,"to":4568.94,"location":2,"content":"So it'd take me just too long to load them in"},{"from":4568.94,"to":4572.86,"location":2,"content":"the last five minutes of this class, whereas conveniently uh,"},{"from":4572.86,"to":4576.86,"location":2,"content":"in our Stanford vectors we have 100 dimensional vectors, um,"},{"from":4576.86,"to":4579.16,"location":2,"content":"and 50 dimensional vectors which are kinda"},{"from":4579.16,"to":4581.76,"location":2,"content":"good for doing small things on a laptop frankly."},{"from":4581.76,"to":4587.33,"location":2,"content":"Um, so, what I'm doing here is Gensim doesn't natively support"},{"from":4587.33,"to":4590.21,"location":2,"content":"GloVe vectors but they 
actually provide a utility that"},{"from":4590.21,"to":4593.39,"location":2,"content":"converts the GloVe file format to the word2vec file format."},{"from":4593.39,"to":4600.28,"location":2,"content":"So I've done that. And then I've loaded a pre-trained model of word vectors."},{"from":4600.28,"to":4604.43,"location":2,"content":"Um, and, so this is what they call a keyed vector."},{"from":4604.43,"to":4606.89,"location":2,"content":"And so, the keyed vector is nothing fancy."},{"from":4606.89,"to":4611.66,"location":2,"content":"It's just you have words like potato and there's a vector that hangs off each one."},{"from":4611.66,"to":4615.44,"location":2,"content":"So it's really just sort of a big dictionary with a vector for each thing."},{"from":4615.44,"to":4618.69,"location":2,"content":"But, so this model has been a trained model where"},{"from":4618.69,"to":4622.23,"location":2,"content":"we just use the kind of algorithm we looked at and,"},{"from":4622.23,"to":4626.73,"location":2,"content":"you know, trained it billions of times, fiddling our word vectors."},{"from":4626.73,"to":4631.26,"location":2,"content":"Um, and once we have one we can then, um,"},{"from":4631.26,"to":4634.27,"location":2,"content":"ask questions like, we can say,"},{"from":4634.27,"to":4637.11,"location":2,"content":"what is the most similar word to some other words?"},{"from":4637.11,"to":4639.65,"location":2,"content":"So we could take something like, um,"},{"from":4639.65,"to":4643.18,"location":2,"content":"what are the most similar words to Obama let's say?"},{"from":4643.18,"to":4645.77,"location":2,"content":"And we get back Barack, Bush, Clinton,"},{"from":4645.77,"to":4649.04,"location":2,"content":"McCain, Gore, Hillary, Dole, Martin, Henry."},{"from":4649.04,"to":4651.43,"location":2,"content":"That seems actually kind of interesting."},{"from":4651.43,"to":4654.05,"location":2,"content":"These vectors are from a few years ago."},{"from":4654.05,"to":4657.15,"location":2,"content":"So we don't have a post- post-Obama stuff."},{"from":4657.15,"to":4660.75,"location":2,"content":"I mean if you put in another word, um, you know,"},{"from":4660.75,"to":4664.1,"location":2,"content":"we can put in something like banana and we get coconut,"},{"from":4664.1,"to":4666.6,"location":2,"content":"mango, bananas, potato, pineapple."},{"from":4666.6,"to":4669.43,"location":2,"content":"We get kind of tropical food."},{"from":4669.43,"to":4674.07,"location":2,"content":"So, you can actually- you can actually ask uh,"},{"from":4674.07,"to":4676.99,"location":2,"content":"for being dissimilar to words."},{"from":4676.99,"to":4679.7,"location":2,"content":"By itself dissimilar isn't very useful."},{"from":4679.7,"to":4684.55,"location":2,"content":"So if I ask most similar and I say um,"},{"from":4684.55,"to":4689.29,"location":2,"content":"negative equals, um, banana,"},{"from":4689.29,"to":4694.72,"location":2,"content":"um, I'm not sure what your concept of what's most dissimilar to,"},{"from":4694.72,"to":4696.62,"location":2,"content":"um, banana is, but you know,"},{"from":4696.62,"to":4702.65,"location":2,"content":"actually by itself you don't get anything useful out of this, um,"},{"from":4702.65,"to":4708,"location":2,"content":"because, um, you just sort of get these weird really rare words um,"},{"from":4708,"to":4711.44,"location":2,"content":"which, um, [LAUGHTER] definitely weren't the ones you were thinking of."},{"from":4711.44,"to":4717.57,"location":2,"content":"Um, but it turns out you can do 
something really useful with this negative idea"},{"from":4717.57,"to":4719,"location":2,"content":"which was one of"},{"from":4719,"to":4724.18,"location":2,"content":"the highly celebrated results of word vectors when they first started off."},{"from":4724.18,"to":4730.2,"location":2,"content":"And that was this idea that there are actually dimensions of meaning in this space."},{"from":4730.2,"to":4734.82,"location":2,"content":"And so this was the most celebrated example um, which was look,"},{"from":4734.82,"to":4739.98,"location":2,"content":"what we could do is we could start with the word king and subtract"},{"from":4739.98,"to":4745.35,"location":2,"content":"from it the meaning of man and then we could add to it the meaning of woman."},{"from":4745.35,"to":4749.11,"location":2,"content":"And then we could say which word in our vector space is"},{"from":4749.11,"to":4753.06,"location":2,"content":"most similar in meaning to that word."},{"from":4753.06,"to":4755.81,"location":2,"content":"And that would be a way of sort of doing analogies."},{"from":4755.81,"to":4758.64,"location":2,"content":"We'd be able to do the, um, analogy,"},{"from":4758.64,"to":4762.05,"location":2,"content":"man is to king as woman is to what?"},{"from":4762.05,"to":4766.5,"location":2,"content":"And so, the way we're gonna do that is to say we want to be similar to king"},{"from":4766.5,"to":4771.22,"location":2,"content":"and woman because they're both positive ones and far away from man."},{"from":4771.22,"to":4775.19,"location":2,"content":"And so, we could do that manually,"},{"from":4775.19,"to":4776.95,"location":2,"content":"here I've done it manually,"},{"from":4776.95,"to":4781.05,"location":2,"content":"most similar positive woman king, negative man."},{"from":4781.05,"to":4785.41,"location":2,"content":"And we can run this and lo and behold it produces queen."},{"from":4785.41,"to":4788.57,"location":2,"content":"To make that a little bit easier I defined this analogy,"},{"from":4788.57,"to":4793.48,"location":2,"content":"um, analogy predicate so I can run other ones."},{"from":4793.48,"to":4799.1,"location":2,"content":"And so I can run another one like analogy Japan Japanese,"},{"from":4799.1,"to":4801.16,"location":2,"content":"Austria is to Austrian."},{"from":4801.16,"to":4803.13,"location":2,"content":"Um, and you know,"},{"from":4803.13,"to":4807.15,"location":2,"content":"I think it's fair to say that when people first"},{"from":4807.15,"to":4810.95,"location":2,"content":"saw that you could have this simple piece of math and run it,"},{"from":4810.95,"to":4812.95,"location":2,"content":"and learn meanings of words."},{"from":4812.95,"to":4818.47,"location":2,"content":"I mean it actually just sort of blew people's minds how effective this was."},{"from":4818.47,"to":4822.03,"location":2,"content":"You know. 
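For concreteness, a sketch of roughly what the notebook being shown is doing, assuming gensim 3.x's KeyedVectors API and the 100-dimensional GloVe file mentioned above (the file names here are illustrative):

    from gensim.models import KeyedVectors
    from gensim.scripts.glove2word2vec import glove2word2vec

    # Convert the GloVe file format to the word2vec file format, then load it.
    glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.word2vec.txt')
    model = KeyedVectors.load_word2vec_format('glove.6B.100d.word2vec.txt')

    model.most_similar('banana')   # coconut, mango, bananas, ...
    model.most_similar(positive=['woman', 'king'], negative=['man'])  # queen, ...

    # Roughly the "analogy" predicate defined in the notebook: a : b :: x : ?
    def analogy(a, b, x):
        return model.most_similar(positive=[b, x], negative=[a])[0][0]

    analogy('japan', 'japanese', 'australia')  # 'australian'

Note that these GloVe vocabularies are lowercased, so queries like 'obama' need to be lowercase too.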
Like there- there's no mirrors and strings here, right?"},{"from":4822.03,"to":4824.23,"location":2,"content":"You know it's not that I have a separate-"},{"from":4824.23,"to":4828.33,"location":2,"content":"a special sort of list in my Python where there's a dictionary I'm looking up,"},{"from":4828.33,"to":4830.24,"location":2,"content":"er, for Austria Austrian,"},{"from":4830.24,"to":4831.91,"location":2,"content":"uh, and things like that."},{"from":4831.91,"to":4835.31,"location":2,"content":"But somehow these vector representations are"},{"from":4835.31,"to":4838.76,"location":2,"content":"such that it is actually encoding these semantic relationships,"},{"from":4838.76,"to":4840.92,"location":2,"content":"you know, so you can try different ones,"},{"from":4840.92,"to":4843.36,"location":2,"content":"you know, like it's not that only this one works."},{"from":4843.36,"to":4846.19,"location":2,"content":"I can put in France, it says French."},{"from":4846.19,"to":4849.78,"location":2,"content":"I can put in Germany, it says German,"},{"from":4849.78,"to":4854.59,"location":2,"content":"I can put in Australia not Austria and it says Australian,"},{"from":4854.59,"to":4859.48,"location":2,"content":"you know that somehow these vector representations of words are such that"},{"from":4859.48,"to":4864.81,"location":2,"content":"for sort of these ideas like understanding the relationships between words,"},{"from":4864.81,"to":4870.6,"location":2,"content":"you're just doing this vector space manipulation on these 100 dimensional numbers,"},{"from":4870.6,"to":4875.83,"location":2,"content":"that it actually knows about them. This is not only the similarities of word meanings but"},{"from":4875.83,"to":4878.26,"location":2,"content":"actually different semantic relationships"},{"from":4878.26,"to":4881.64,"location":2,"content":"between words like country names and their peoples."},{"from":4881.64,"to":4883.85,"location":2,"content":"And yeah that's actually pretty amazing."},{"from":4883.85,"to":4891.34,"location":2,"content":"It really- you know, it's sort of surprising that running such a dumb algorithm on um,"},{"from":4891.34,"to":4895.1,"location":2,"content":"vectors of numbers could capture so well the meaning of words."},{"from":4895.1,"to":4898.16,"location":2,"content":"And so that sort of became the foundation of a lot of sort"},{"from":4898.16,"to":4901.35,"location":2,"content":"of modern distributed neural representations of words."},{"from":4901.35,"to":4902.63,"location":2,"content":"Okay I'll stop there."},{"from":4902.63,"to":4906.7,"location":2,"content":"Thanks a lot guys and see you on Thursday. [NOISE]"}]} \ No newline at end of file diff --git a/bcc-en/10.bcc b/bcc-en/10.bcc new file mode 100644 index 0000000000000000000000000000000000000000..4786c59ee269e6983f0d60fec2da560fc5d301ff --- /dev/null +++ b/bcc-en/10.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":5.48,"to":11.12,"location":2,"content":"Okay. Hi, everyone. 
Um, so let's get started again today."},{"from":11.12,"to":14.61,"location":2,"content":"So today's lecture what I'm going to do,"},{"from":14.61,"to":16.71,"location":2,"content":"is be talking about, um,"},{"from":16.71,"to":19.06,"location":2,"content":"question answering over text."},{"from":19.06,"to":22.02,"location":2,"content":"Um, this is another of the big successes"},{"from":22.02,"to":25.66,"location":2,"content":"in using deep learning inside natural language processing,"},{"from":25.66,"to":30.14,"location":2,"content":"and it's also a technology that has some really obvious commercial uses."},{"from":30.14,"to":32.66,"location":2,"content":"So it's an, it's an area that has attracted"},{"from":32.66,"to":36.27,"location":2,"content":"a lot of attention in the last couple of years."},{"from":36.27,"to":38.79,"location":2,"content":"So this is the overall plan."},{"from":38.79,"to":43.97,"location":2,"content":"Um, just a couple of reminders and things at the beginning about final project stuff,"},{"from":43.97,"to":48.88,"location":2,"content":"and then we'll, basically all of it is talking about question-answering starting with, um,"},{"from":48.88,"to":53,"location":2,"content":"motivation, history, um, talking about the SQuAD data,"},{"from":53,"to":56.39,"location":2,"content":"uh, a particularly simple model, our Stanford Attentive Reader."},{"from":56.39,"to":58.94,"location":2,"content":"Then talking about some other more complex,"},{"from":58.94,"to":62.46,"location":2,"content":"um, stuff into the most modern stuff."},{"from":62.46,"to":65.81,"location":2,"content":"Um, yeah, so in a sense, um, this"},{"from":65.81,"to":69.36,"location":2,"content":"lecture serves a double purpose because if you're going to do the,"},{"from":69.36,"to":71.39,"location":2,"content":"the default final project, well,"},{"from":71.39,"to":73.41,"location":2,"content":"it's about textual question-answering,"},{"from":73.41,"to":77.86,"location":2,"content":"and this is your chance to learn something about the area of textual question-answering,"},{"from":77.86,"to":81.41,"location":2,"content":"and the kinds of models you might want to be thinking about and building."},{"from":81.41,"to":84.89,"location":2,"content":"Um, but the content of this lecture pretty much is in"},{"from":84.89,"to":88.92,"location":2,"content":"no way specifically tied to the default final project,"},{"from":88.92,"to":92.72,"location":2,"content":"apart from by subject matter that really it's telling you about"},{"from":92.72,"to":97.58,"location":2,"content":"how people use neural nets to build question-answering systems."},{"from":97.58,"to":101.2,"location":2,"content":"Okay. 
So first just quickly on the reminders,"},{"from":101.2,"to":103.05,"location":2,"content":"um, mid-quarter survey."},{"from":103.05,"to":105.15,"location":2,"content":"I mean, a huge number of people,"},{"from":105.15,"to":107.33,"location":2,"content":"um, have actually filled this in already."},{"from":107.33,"to":111.14,"location":2,"content":"Uh, we already had over 60 percent, um, um,"},{"from":111.14,"to":114.17,"location":2,"content":"filling-it-in rate by which by the standards of people"},{"from":114.17,"to":117.25,"location":2,"content":"who do surveys they come as a huge success already."},{"from":117.25,"to":119.51,"location":2,"content":"But if you're not in that percent, um,"},{"from":119.51,"to":123.48,"location":2,"content":"we'd still love to have your feedback and now's the perfect time to do it."},{"from":123.48,"to":125.52,"location":2,"content":"Um, yeah."},{"from":125.52,"to":129.46,"location":2,"content":"I just wanted to sort of have a note on custom final projects."},{"from":129.46,"to":131.39,"location":2,"content":"Um, so in general, um,"},{"from":131.39,"to":134.65,"location":2,"content":"it's great to get feedback on custom final projects."},{"from":134.65,"to":136.91,"location":2,"content":"There's a formal mechanism for that which is"},{"from":136.91,"to":139.63,"location":2,"content":"the project proposal that I mentioned last time."},{"from":139.63,"to":142.33,"location":2,"content":"It's also great to chat to people,"},{"from":142.33,"to":145.93,"location":2,"content":"um, informally about, um, final projects."},{"from":145.93,"to":148.69,"location":2,"content":"And so I'm one of those people and I have"},{"from":148.69,"to":151.61,"location":2,"content":"been talking to lots of people about final projects,"},{"from":151.61,"to":153.46,"location":2,"content":"and, uh, very happy to do so."},{"from":153.46,"to":156.5,"location":2,"content":"But there's sort of a problem that there's only one of me."},{"from":156.5,"to":158.63,"location":2,"content":"Um, so I do also, um,"},{"from":158.63,"to":162.08,"location":2,"content":"encourage you to realize that among the various TAs that"},{"from":162.08,"to":166.07,"location":2,"content":"really lots of them have had experience of different deep learning projects,"},{"from":166.07,"to":168.62,"location":2,"content":"and in particular on the office hours page,"},{"from":168.62,"to":173.42,"location":2,"content":"there's a table that's like this but you can read it if you look at it on your own laptop,"},{"from":173.42,"to":177.13,"location":2,"content":"which talks about the experience of different TA's."},{"from":177.13,"to":179.93,"location":2,"content":"And many of them have experience in different areas,"},{"from":179.93,"to":184.93,"location":2,"content":"and many of them are also good people to talk to about final projects."},{"from":184.93,"to":190.86,"location":2,"content":"Okay. 
Um, so for the default final project, the textual question-answering."},{"from":190.86,"to":195.19,"location":2,"content":"So um, draft materials for that are up today,"},{"from":195.19,"to":197.34,"location":2,"content":"um, right now on the website actually."},{"from":197.34,"to":200.72,"location":2,"content":"Um, we're calling them draft because we think that there are still"},{"from":200.72,"to":204.23,"location":2,"content":"probably a few things that are gonna get changed over the next week,"},{"from":204.23,"to":209.84,"location":2,"content":"so um, don't regard it as completely final; in terms of the code,"},{"from":209.84,"to":212.12,"location":2,"content":"you know, it's sort of 90 percent final."},{"from":212.12,"to":215.41,"location":2,"content":"So in terms of deciding whether you're going to do, um,"},{"from":215.41,"to":218.5,"location":2,"content":"a custom final project or a default final project,"},{"from":218.5,"to":221.47,"location":2,"content":"and working out what you're putting into your project proposal."},{"from":221.47,"to":222.75,"location":2,"content":"Um, it should be, you know,"},{"from":222.75,"to":224.27,"location":2,"content":"well more than, um,"},{"from":224.27,"to":226.47,"location":2,"content":"what you need for this year."},{"from":226.47,"to":228.67,"location":2,"content":"Okay. The one other, um,"},{"from":228.67,"to":232.04,"location":2,"content":"final bit I just wanted to say that I didn't get to"},{"from":232.04,"to":235.52,"location":2,"content":"last time is so for the final projects,"},{"from":235.52,"to":238.06,"location":2,"content":"regardless of which kind you're doing,"},{"from":238.06,"to":240.75,"location":2,"content":"um, well, part of it is, um,"},{"from":240.75,"to":242.54,"location":2,"content":"doing some experiments, of"},{"from":242.54,"to":244.7,"location":2,"content":"doing stuff with data and code,"},{"from":244.7,"to":246.88,"location":2,"content":"and getting some numbers and things like that."},{"from":246.88,"to":248.48,"location":2,"content":"But I do really, um,"},{"from":248.48,"to":251.63,"location":2,"content":"encourage people to also remember that an important part of"},{"from":251.63,"to":255.51,"location":2,"content":"the final project is writing a final project report."},{"from":255.51,"to":260.9,"location":2,"content":"And this is no different to any research project of the kinds that,"},{"from":260.9,"to":265.59,"location":2,"content":"um, students do for conferences or journals and things like that, right?"},{"from":265.59,"to":270.02,"location":2,"content":"You spend months commonly working over your code and experiments."},{"from":270.02,"to":271.97,"location":2,"content":"But in most cases,"},{"from":271.97,"to":276.37,"location":2,"content":"the main evaluation of your work is from people reading,"},{"from":276.37,"to":279.2,"location":2,"content":"a written paper output version of things."},{"from":279.2,"to":281.42,"location":2,"content":"So it's really important that,"},{"from":281.42,"to":284.48,"location":2,"content":"that paper version sort of reflects the work"},{"from":284.48,"to":287.84,"location":2,"content":"that you did and the interesting ideas that you came up with,"},{"from":287.84,"to":290.72,"location":2,"content":"and explains them well and presents your experiments,"},{"from":290.72,"to":292.1,"location":2,"content":"and all of those things."},{"from":292.1,"to":296.67,"location":2,"content":"And so we encourage you to sort of do a good job at writing up your 
projects."},{"from":296.67,"to":299.68,"location":2,"content":"Um, here is just sort of a vague outline of, you know,"},{"from":299.68,"to":303.32,"location":2,"content":"what a typical project write-up is likely to look like."},{"from":303.32,"to":306.62,"location":2,"content":"Now, there isn't really one size completely fits all"},{"from":306.62,"to":309.95,"location":2,"content":"because depending on what you've done different things might be appropriate."},{"from":309.95,"to":311.99,"location":2,"content":"But, you know, typically the first page,"},{"from":311.99,"to":315.9,"location":2,"content":"you'll have an abstract for the paper and the introduction to the paper."},{"from":315.9,"to":319.22,"location":2,"content":"You'll spend some time talking about related prior work."},{"from":319.22,"to":323.62,"location":2,"content":"Um, you'll talk about what kind of models you built for a while."},{"from":323.62,"to":328.56,"location":2,"content":"Um, there's probably some discussion of what data you are using for your projects."},{"from":328.56,"to":334.92,"location":2,"content":"Um, experiments commonly with some tables and figures about the things that you're doing."},{"from":334.92,"to":339.74,"location":2,"content":"Um, more tables and figures talking about the results as to how well your systems work."},{"from":339.74,"to":343.01,"location":2,"content":"Um, it's great to have some error analysis to see"},{"from":343.01,"to":346.29,"location":2,"content":"what kind of things that you got right and wrong,"},{"from":346.29,"to":348.5,"location":2,"content":"and then maybe at the end there's sort of"},{"from":348.5,"to":351.96,"location":2,"content":"plans for the future, conclusions, or something like that."},{"from":351.96,"to":359.48,"location":2,"content":"Okay. Um, that's sort of it for my extra administrative reminders."},{"from":359.48,"to":363.47,"location":2,"content":"Um, are there any questions on final projects that people are dying to know?"},{"from":363.47,"to":369.8,"location":2,"content":"[NOISE] Okay. Good luck."},{"from":369.8,"to":370.93,"location":2,"content":"I just meant to say good luck."},{"from":370.93,"to":373.47,"location":2,"content":"Yeah. Good luck with your final projects. [LAUGHTER] Okay."},{"from":373.47,"to":375.38,"location":2,"content":"So now moving into,"},{"from":375.38,"to":378.55,"location":2,"content":"um, yeah, the question answering."},{"from":378.55,"to":383.17,"location":2,"content":"Okay. So, I mean- so question answering is"},{"from":383.17,"to":388.61,"location":2,"content":"a very direct application for something that human beings,"},{"from":388.61,"to":390.1,"location":2,"content":"um, want to do."},{"from":390.1,"to":393.62,"location":2,"content":"Um, well, maybe human beings don't in general want to know this."},{"from":393.62,"to":397.36,"location":2,"content":"Um, here's my query of \"Who was Australia's third prime minister?\"."},{"from":397.36,"to":399.5,"location":2,"content":"Um, maybe, yeah, that's not really the kind of"},{"from":399.5,"to":401.64,"location":2,"content":"thing you're gonna put into your queries but,"},{"from":401.64,"to":403.14,"location":2,"content":"you know, maybe you query,"},{"from":403.14,"to":405.11,"location":2,"content":"\"Who was the lead singer of Big Thief?\""},{"from":405.11,"to":406.75,"location":2,"content":"or something like that. 
I don't know."},{"from":406.75,"to":408.05,"location":2,"content":"Um, you're, uh, but you know,"},{"from":408.05,"to":411.77,"location":2,"content":"lots- a large percentage of stuff [NOISE] on the web"},{"from":411.77,"to":416.09,"location":2,"content":"is that people actually are asking for answers to questions."},{"from":416.09,"to":419.12,"location":2,"content":"And so, if I put in this query into Google,"},{"from":419.12,"to":420.53,"location":2,"content":"it actually just works."},{"from":420.53,"to":423.92,"location":2,"content":"It tells me the answer is John Christian Watson."},{"from":423.92,"to":428.92,"location":2,"content":"And, um, so that's sort of question answering working in the real world."},{"from":428.92,"to":431.54,"location":2,"content":"Um, if you try different kinds of questions in Google,"},{"from":431.54,"to":434.58,"location":2,"content":"you'll find that some of them work and lots of them don't work."},{"from":434.58,"to":435.77,"location":2,"content":"And when they don't work,"},{"from":435.77,"to":440.09,"location":2,"content":"you're just sort of getting whatever kind of information retrieval, web search results."},{"from":440.09,"to":443.31,"location":2,"content":"Um, there is one fine point that I just wanted,"},{"from":443.31,"to":445.13,"location":2,"content":"um, to mention down here."},{"from":445.13,"to":448.79,"location":2,"content":"So another thing that Google has is the Google Knowledge Graph,"},{"from":448.79,"to":452.23,"location":2,"content":"which is a structured graph representation of knowledge."},{"from":452.23,"to":455.4,"location":2,"content":"And some kinds of questions,"},{"from":455.4,"to":459.08,"location":2,"content":"um, being answered from that structured knowledge representation."},{"from":459.08,"to":460.43,"location":2,"content":"And so, I mean,"},{"from":460.43,"to":463.02,"location":2,"content":"quite a lot of the time for things like movies,"},{"from":463.02,"to":464.87,"location":2,"content":"it's coming from that structured graph."},{"from":464.87,"to":467.69,"location":2,"content":"If you're sort of saying, \"Who's the director of a movie?\""},{"from":467.69,"to":468.89,"location":2,"content":"or something like that."},{"from":468.89,"to":471.05,"location":2,"content":"But this answer isn't coming from that."},{"from":471.05,"to":473,"location":2,"content":"This answer is a genuine,"},{"from":473,"to":475.4,"location":2,"content":"the kind of stuff we're gonna talk about today."},{"from":475.4,"to":479.36,"location":2,"content":"It's textual question answering from a web page where"},{"from":479.36,"to":481.58,"location":2,"content":"Google's question and answering system has"},{"from":481.58,"to":484.5,"location":2,"content":"extracted the answer and is sticking it up there."},{"from":484.5,"to":486.37,"location":2,"content":"Um, if you're, um,"},{"from":486.37,"to":489.49,"location":2,"content":"wanting to explore these things, um,"},{"from":489.49,"to":494.74,"location":2,"content":"if you get one of these boxes sort of down here where I've cut it off,"},{"from":494.74,"to":496.34,"location":2,"content":"there's a little bit of gray that says,"},{"from":496.34,"to":497.99,"location":2,"content":"\"How did I get this result?\"."},{"from":497.99,"to":499.42,"location":2,"content":"And if you click on that,"},{"from":499.42,"to":503.3,"location":2,"content":"it actually tells you what source it's getting it from and you can see if it's doing it"},{"from":503.3,"to":508.13,"location":2,"content":"from the textual question 
answering system or from something like the Knowledge Graph."},{"from":508.13,"to":511.04,"location":2,"content":"Okay. Um, so the- in general,"},{"from":511.04,"to":515.6,"location":2,"content":"the motivation for question answering is that these days there's"},{"from":515.6,"to":520.36,"location":2,"content":"just these sort of massive collections of full text documents,"},{"from":520.36,"to":522.11,"location":2,"content":"i.e., there's the web."},{"from":522.11,"to":526.58,"location":2,"content":"Um, so that there are sort of billions of documents of information."},{"from":526.58,"to":529.73,"location":2,"content":"And traditionally, when people first started"},{"from":529.73,"to":533.33,"location":2,"content":"thinking about search information retrieval as a field,"},{"from":533.33,"to":539.02,"location":2,"content":"you know, nothing of that kind of quantity and size existed, right?"},{"from":539.02,"to":542.32,"location":2,"content":"That when people first started building search systems,"},{"from":542.32,"to":545.2,"location":2,"content":"it was sort of unthinkable to index"},{"from":545.2,"to":549.34,"location":2,"content":"whole documents because no one had hard disks big enough in those days, right?"},{"from":549.34,"to":555.34,"location":2,"content":"That really- they were indexing titles or titles and abstracts or something like that."},{"from":555.34,"to":559.92,"location":2,"content":"And so, it seemed perfectly adequate in those days to say, \"Okay."},{"from":559.92,"to":562.76,"location":2,"content":"We're just gonna send you- give you your results.\""},{"from":562.76,"to":564.68,"location":2,"content":"as to \"Here's a list of documents.\""},{"from":564.68,"to":567.44,"location":2,"content":"because the documents are only a hundred words long."},{"from":567.44,"to":571.01,"location":2,"content":"But that's clearly not the case now when we have the sort of, you know,"},{"from":571.01,"to":576.27,"location":2,"content":"ten minute read, Medium posts um, which might have the answer to a question."},{"from":576.27,"to":579.08,"location":2,"content":"And so, there's this need to sort of say, \"Well,"},{"from":579.08,"to":583.21,"location":2,"content":"can we just have systems that will give us answers to questions?\"."},{"from":583.21,"to":589.73,"location":2,"content":"And a lot of the recent changes in technology have hugely underlined that need."},{"from":589.73,"to":594.95,"location":2,"content":"So, returning documents works okay if you're sitting at your laptop,"},{"from":594.95,"to":599.15,"location":2,"content":"but it works really terribly if you're on your phone and it works even more"},{"from":599.15,"to":604.04,"location":2,"content":"terribly if you're trying to work with speech on a digital assistant device,"},{"from":604.04,"to":606.11,"location":2,"content":"something like an Alexa system."},{"from":606.11,"to":608.84,"location":2,"content":"And so, we really want to actually be able to produce"},{"from":608.84,"to":612.26,"location":2,"content":"systems that can give the answers to people's questions."},{"from":612.26,"to":616.87,"location":2,"content":"And so typically, doing that is factored into two parts."},{"from":616.87,"to":621.5,"location":2,"content":"That the first part of that is we still do information retrieval."},{"from":621.5,"to":626.27,"location":2,"content":"We use stand- normally quite standard information retrieval techniques to"},{"from":626.27,"to":632.15,"location":2,"content":"find documents that quite likely to con- maintain- contain an 
answer."},{"from":632.15,"to":636.2,"location":2,"content":"And the reason that this is normally done by quite traditional techniques is because"},{"from":636.2,"to":641.39,"location":2,"content":"the traditional techniques are extremely scalable over billions of documents,"},{"from":641.39,"to":643.79,"location":2,"content":"whereas current neural systems actually"},{"from":643.79,"to":646.23,"location":2,"content":"aren't really scalable over billions of documents."},{"from":646.23,"to":650.38,"location":2,"content":"But that's an area in sort of which research is ongoing."},{"from":650.38,"to":653.92,"location":2,"content":"But then once we have sort of some candidate likely documents,"},{"from":653.92,"to":655.64,"location":2,"content":"we want to find, uh,"},{"from":655.64,"to":657.37,"location":2,"content":"do they contain an answer,"},{"from":657.37,"to":659.3,"location":2,"content":"and if so, what is the answer?"},{"from":659.3,"to":660.52,"location":2,"content":"And so at that point,"},{"from":660.52,"to":663.27,"location":2,"content":"we have a document or a paragraph,"},{"from":663.27,"to":667.45,"location":2,"content":"and we're saying, \"Can we answer this question from there?\""},{"from":667.45,"to":671.35,"location":2,"content":"And then that problem is often referred to as the Reading Comprehension problem."},{"from":671.35,"to":674.71,"location":2,"content":"And so that's really what I'm gonna focus on today."},{"from":674.71,"to":679.53,"location":2,"content":"Um, Reading Comprehension isn't a new problem."},{"from":679.53,"to":686.35,"location":2,"content":"I mean it- you can trace it back into the early days of artificial intelligence and NLP."},{"from":686.35,"to":688.29,"location":2,"content":"So, back in the 70's,"},{"from":688.29,"to":691.52,"location":2,"content":"a lot of NLP work was trying to do Reading Comprehension."},{"from":691.52,"to":695.42,"location":2,"content":"I mean one of the famous strands of that, um, was, um,"},{"from":695.42,"to":698.43,"location":2,"content":"Sir Roger Shank was a famous,"},{"from":698.43,"to":701.03,"location":2,"content":"um, early NLP person."},{"from":701.03,"to":702.65,"location":2,"content":"Though not a terribly nice man."},{"from":702.65,"to":703.99,"location":2,"content":"I don't think, actually."},{"from":703.99,"to":708.44,"location":2,"content":"Um, but the Yale School of AI was a very well-known,"},{"from":708.44,"to":711.83,"location":2,"content":"um, NLP approach and really,"},{"from":711.83,"to":715.38,"location":2,"content":"it was very focused on Reading Comprehension."},{"from":715.38,"to":718.21,"location":2,"content":"Um, but it's sort of,"},{"from":718.21,"to":721.07,"location":2,"content":"you know, I think it was sort of the time, it was too early in any way."},{"from":721.07,"to":723.73,"location":2,"content":"It sort of died out. 
Nothing much came out of that."},{"from":723.73,"to":727.67,"location":2,"content":"Um, but then in- right just before the turn of the mil- millennium,"},{"from":727.67,"to":731.15,"location":2,"content":"Lynette Hirschman revived this idea and said, \"Well,"},{"from":731.15,"to":734,"location":2,"content":"maybe a good challenge would be to find the kind of"},{"from":734,"to":738.15,"location":2,"content":"Reading Comprehension questions that elementary school kids do,"},{"from":738.15,"to":739.7,"location":2,"content":"and let's see if we could get,"},{"from":739.7,"to":741.5,"location":2,"content":"um, computers to do that."},{"from":741.5,"to":744.53,"location":2,"content":"And some people tried that with fairly simple methods,"},{"from":744.53,"to":746.69,"location":2,"content":"which only work mediocrely."},{"from":746.69,"to":749.18,"location":2,"content":"Then sort of somewhat after that, um,"},{"from":749.18,"to":751.46,"location":2,"content":"Chris Burges who was a guy who was at"},{"from":751.46,"to":754.61,"location":2,"content":"Microsoft Research and he wasn't really an NLP person at all."},{"from":754.61,"to":756.34,"location":2,"content":"He was a machine learning person,"},{"from":756.34,"to":759.07,"location":2,"content":"but he got it into his head, um,"},{"from":759.07,"to":763.82,"location":2,"content":"that, well, really a big problem that should be being worked on is"},{"from":763.82,"to":769.12,"location":2,"content":"Machine Comprehension and he suggested that you sort of could codify it like this."},{"from":769.12,"to":772.72,"location":2,"content":"And this is a particularly clean codification"},{"from":772.72,"to":775.34,"location":2,"content":"that has lived on and we'll look at more today."},{"from":775.34,"to":778.88,"location":2,"content":"All right. So, a machine comprehends a passage of text,"},{"from":778.88,"to":781.64,"location":2,"content":"if, for any question regarding that text that can be"},{"from":781.64,"to":784.49,"location":2,"content":"answered correctly by a majority of native speakers,"},{"from":784.49,"to":786.89,"location":2,"content":"that machine can provide a string,"},{"from":786.89,"to":789.47,"location":2,"content":"which those speakers would agree both answers"},{"from":789.47,"to":793.57,"location":2,"content":"that question and does not contain information irrelevant to that question."},{"from":793.57,"to":797.75,"location":2,"content":"Um, and he sort of proposed this as sort of a challenge problem for"},{"from":797.75,"to":801.98,"location":2,"content":"artificial intelligence and set about collecting a corpus,"},{"from":801.98,"to":807.41,"location":2,"content":"the MCTest corpus, which was meant to be a simple Reading Comprehension challenge."},{"from":807.41,"to":809.86,"location":2,"content":"Um, so they collected, um,"},{"from":809.86,"to":812.84,"location":2,"content":"stories, um, which, um,"},{"from":812.84,"to":815.51,"location":2,"content":"were meant to be kids' stories, you know."},{"from":815.51,"to":817.79,"location":2,"content":"\"Alyssa got to the beach after a long trip."},{"from":817.79,"to":820.01,"location":2,"content":"She's from Charlotte. 
She traveled from Atlanta."},{"from":820.01,"to":821.57,"location":2,"content":"She's now in Miami\"."},{"from":821.57,"to":823.5,"location":2,"content":"Sort of pretty easy stuff."},{"from":823.5,"to":825.18,"location":2,"content":"And then there were questions."},{"from":825.18,"to":827.79,"location":2,"content":"\"Why did Alyssa go to Miami?\""},{"from":827.79,"to":829.89,"location":2,"content":"Um, and then the answer is,"},{"from":829.89,"to":831.32,"location":2,"content":"\"To visit some friends\"."},{"from":831.32,"to":835.13,"location":2,"content":"And so you've got there this string that is coming from the passage."},{"from":835.13,"to":837.51,"location":2,"content":"That's the answer to the question."},{"from":837.51,"to":840.95,"location":2,"content":"Um, so the MCTest is a corpus of"},{"from":840.95,"to":847.16,"location":2,"content":"about 600 such stories and that challenge existed, and a few people worked on it."},{"from":847.16,"to":851.24,"location":2,"content":"But that never really went very far either for the next couple of years."},{"from":851.24,"to":855.35,"location":2,"content":"But what really changed things was that in 2015,"},{"from":855.35,"to":858.51,"location":2,"content":"and then with more stuff in 2016,"},{"from":858.51,"to":863,"location":2,"content":"um, deep learning people got interested in this idea of,"},{"from":863,"to":867.62,"location":2,"content":"\"Could we perhaps build neural question answering systems?\""},{"from":867.62,"to":870.97,"location":2,"content":"And it seemed like if you wanted to do that, um,"},{"from":870.97,"to":873.98,"location":2,"content":"something like MCTest could only be a test set"},{"from":873.98,"to":878.24,"location":2,"content":"and the ways to make progress would be to do what had been done"},{"from":878.24,"to":885.68,"location":2,"content":"in other domains and to actually build just- hand build a large training set of passages,"},{"from":885.68,"to":890.87,"location":2,"content":"questions, and answers in such a way that would be able to train neural networks using"},{"from":890.87,"to":893.6,"location":2,"content":"the kind of supervised learning techniques that we've"},{"from":893.6,"to":896.54,"location":2,"content":"concentrated on so far in this class."},{"from":896.54,"to":900.45,"location":2,"content":"And indeed, the kind of supervised neural network learning techniques,"},{"from":900.45,"to":902.99,"location":2,"content":"which is [NOISE] actually the successful stuff that"},{"from":902.99,"to":906.5,"location":2,"content":"powers nearly all the applications of deep learning,"},{"from":906.5,"to":907.96,"location":2,"content":"not only in NLP,"},{"from":907.96,"to":910.2,"location":2,"content":"but also in other fields like vision."},{"from":910.2,"to":915.68,"location":2,"content":"Um, and so the first subs- the first such dataset was built by"},{"from":915.68,"to":920.99,"location":2,"content":"people at DeepMind over CNN and Daily Mail news stories."},{"from":920.99,"to":923.3,"location":2,"content":"Um, but then the next year, um,"},{"from":923.3,"to":926.27,"location":2,"content":"Pranav Rajpurkar is a Stanford PhD student"},{"from":926.27,"to":929.27,"location":2,"content":"working with Percy Liang and a couple of other students, um,"},{"from":929.27,"to":931.05,"location":2,"content":"produced the SQuAD dataset,"},{"from":931.05,"to":934.58,"location":2,"content":"which was actually a much better designed dataset and proved to be"},{"from":934.58,"to":938.12,"location":2,"content":"sort of much 
more successful at driving this forward."},{"from":938.12,"to":939.83,"location":2,"content":"And then following along from that,"},{"from":939.83,"to":942.77,"location":2,"content":"other people started to produce lots of other,"},{"from":942.77,"to":945.59,"location":2,"content":"um, question answering datasets which, you know,"},{"from":945.59,"to":948.22,"location":2,"content":"many of them have interesting advantages"},{"from":948.22,"to":951.32,"location":2,"content":"and disadvantages of their own including MS MARCO,"},{"from":951.32,"to":953.81,"location":2,"content":"TriviaQA, RACE, blah, blah, blah, lots of them."},{"from":953.81,"to":955.76,"location":2,"content":"Um, but for today's class,"},{"from":955.76,"to":958.1,"location":2,"content":"I'm gonna concentrate on SQuAD,"},{"from":958.1,"to":963.89,"location":2,"content":"because SQuAD is actually the one that has been by far the most widely used."},{"from":963.89,"to":970.14,"location":2,"content":"And because it - it was just a well-constructed clean dataset,"},{"from":970.14,"to":973.67,"location":2,"content":"that it sort of just proved a profitable one for people to work with."},{"from":973.67,"to":977.26,"location":2,"content":"[NOISE]"},{"from":977.26,"to":980.23,"location":2,"content":"Okay. Um, so, that was reading comprehension."},{"from":980.23,"to":983.05,"location":2,"content":"I'll also just quickly tell you the, um,"},{"from":983.05,"to":986.49,"location":2,"content":"the history of open domain question answering."},{"from":986.49,"to":989.08,"location":2,"content":"So, the difference here for the- the field of"},{"from":989.08,"to":993.3,"location":2,"content":"Open-domain Question Answering that we're saying, okay,"},{"from":993.3,"to":997.35,"location":2,"content":"there's an encyclopedia or there's a web crawl,"},{"from":997.35,"to":999.4,"location":2,"content":"I'm just going to ask a question,"},{"from":999.4,"to":1000.56,"location":2,"content":"can you answer it?"},{"from":1000.56,"to":1003.55,"location":2,"content":"So, it's this bigger task of question answering."},{"from":1003.55,"to":1006.57,"location":2,"content":"And, you know, that was something that again was thought about,"},{"from":1006.57,"to":1009,"location":2,"content":"um, very early on."},{"from":1009,"to":1011.46,"location":2,"content":"So, there's this kind of early, um,"},{"from":1011.46,"to":1016.17,"location":2,"content":"CACM paper by Simmons who sort of explores how you could"},{"from":1016.17,"to":1020.94,"location":2,"content":"do answering questions as textual question-answering, um, and yet, you know,"},{"from":1020.94,"to":1023.01,"location":2,"content":"he has the idea that what's going to"},{"from":1023.01,"to":1025.76,"location":2,"content":"happen is you're gonna dependency parse the question,"},{"from":1025.76,"to":1028.47,"location":2,"content":"and dependency parse sentences of the text,"},{"from":1028.47,"to":1031.83,"location":2,"content":"and then sort of do tree matching over the dependency parses,"},{"from":1031.83,"to":1033.66,"location":2,"content":"um, to get out the answers."},{"from":1033.66,"to":1035.87,"location":2,"content":"And, you know, that's in some sense"},{"from":1035.87,"to":1042.12,"location":2,"content":"actually prefigured work that people actually were then attempting to do 35 years later."},{"from":1042.12,"to":1045.57,"location":2,"content":"Um, getting a bit more modern, um, Julian Kupiec,"},{"from":1045.57,"to":1048,"location":2,"content":"she was working at Xerox PARC at the 
time,"},{"from":1048,"to":1051.24,"location":2,"content":"um, came up with this system called MURAX,"},{"from":1051.24,"to":1055.89,"location":2,"content":"and so at this stage in the 90s there started to be the first, um,"},{"from":1055.89,"to":1058.77,"location":2,"content":"digitally available encyclopedias available,"},{"from":1058.77,"to":1061.28,"location":2,"content":"so he was using the Grolier's Encyclopedia,"},{"from":1061.28,"to":1064.56,"location":2,"content":"and so he said about trying to build a system that could answer"},{"from":1064.56,"to":1067.98,"location":2,"content":"questions over that encyclopedia using,"},{"from":1067.98,"to":1070.59,"location":2,"content":"in general, fairly sort of shallow, um,"},{"from":1070.59,"to":1075.43,"location":2,"content":"linguistic processing methods, i.e, regular expressions."},{"from":1075.43,"to":1078.21,"location":2,"content":"Um, for, after [LAUGHTER] having, um,"},{"from":1078.21,"to":1081.56,"location":2,"content":"done information retrieval search over that."},{"from":1081.56,"to":1085.52,"location":2,"content":"But that started to evoke more interest from other people,"},{"from":1085.52,"to":1093.13,"location":2,"content":"and so in 1999 the US National Institutes of Standards and Technology, um,"},{"from":1093.13,"to":1097.17,"location":2,"content":"instituted a TREC question-answering track where the idea was,"},{"from":1097.17,"to":1101.14,"location":2,"content":"there was a large collection of News-wire documents,"},{"from":1101.14,"to":1105.09,"location":2,"content":"and you could be asked to provide the question of them,"},{"from":1105.09,"to":1108.39,"location":2,"content":"and lots of people started to build question answering systems."},{"from":1108.39,"to":1110.85,"location":2,"content":"Indeed, if in some sense that was"},{"from":1110.85,"to":1115.56,"location":2,"content":"this competition which was where people at IBM started,"},{"from":1115.56,"to":1118.32,"location":2,"content":"um, working on textual question-answering,"},{"from":1118.32,"to":1122.01,"location":2,"content":"and then, um, sort of a decade later, um,"},{"from":1122.01,"to":1127.31,"location":2,"content":"IBM rejigged things into the sexier format of,"},{"from":1127.31,"to":1132.97,"location":2,"content":"um, let's build a Jeopardy contestant rather than let's answer questions from the news,"},{"from":1132.97,"to":1136.62,"location":2,"content":"and that then led to their DeepQA system in 2011."},{"from":1136.62,"to":1139.15,"location":2,"content":"Which I presume quite a few of you saw,"},{"from":1139.15,"to":1142.55,"location":2,"content":"these people saw Jeopardy IBM?"},{"from":1142.55,"to":1144.12,"location":2,"content":"Yeah, some of you."},{"from":1144.12,"to":1147.19,"location":2,"content":"Okay. 
So, that they were able to successfully, um,"},{"from":1147.19,"to":1153.18,"location":2,"content":"build a question answering system that could compete at Jeopardy, um, and win."},{"from":1153.18,"to":1157.71,"location":2,"content":"Um, and, you know, like a lot of these demonstrations of"},{"from":1157.71,"to":1163.95,"location":2,"content":"technological success there are things you can quibble about the way it was set up,"},{"from":1163.95,"to":1167.25,"location":2,"content":"um, that really the kind of computer just had"},{"from":1167.25,"to":1172.26,"location":2,"content":"a speed advantage versus the human beings that had to buzz in to answer the question."},{"from":1172.26,"to":1174.94,"location":2,"content":"But, you know, nevertheless, fundamentally,"},{"from":1174.94,"to":1177.54,"location":2,"content":"the textual question-answering had to work,"},{"from":1177.54,"to":1182.89,"location":2,"content":"that this was a system that was answering questions mainly based on textual passages,"},{"from":1182.89,"to":1187.08,"location":2,"content":"and it had to be able to find the answers to those questions correctly,"},{"from":1187.08,"to":1188.79,"location":2,"content":"for the system to work."},{"from":1188.79,"to":1192.09,"location":2,"content":"Um, so then, more recently again, um,"},{"from":1192.09,"to":1195.99,"location":2,"content":"and really the first piece of work that did this with a neural system was,"},{"from":1195.99,"to":1198,"location":2,"content":"um, work that was, um,"},{"from":1198,"to":1199.65,"location":2,"content":"done by a Stanford PhD student,"},{"from":1199.65,"to":1200.92,"location":2,"content":"that I'll get to later,"},{"from":1200.92,"to":1202.35,"location":2,"content":"was then the idea of well,"},{"from":1202.35,"to":1206.94,"location":2,"content":"could we replace traditional complex question answering systems"},{"from":1206.94,"to":1209.95,"location":2,"content":"by using a neural reading comprehension system,"},{"from":1209.95,"to":1212.28,"location":2,"content":"and that's proved to be very successful."},{"from":1212.28,"to":1215.97,"location":2,"content":"So, to, to explain that a little bit more, um,"},{"from":1215.97,"to":1220.41,"location":2,"content":"if you look at the kind of systems that were built for TREC question-answering,"},{"from":1220.41,"to":1224.64,"location":2,"content":"um, they were very complex multi-part systems."},{"from":1224.64,"to":1227.57,"location":2,"content":"And really, if you then look at something like,"},{"from":1227.57,"to":1231.51,"location":2,"content":"IBM's Deep QA system it was sort of like this"},{"from":1231.51,"to":1235.95,"location":2,"content":"times 10 because it both had very complex systems like this,"},{"from":1235.95,"to":1240.46,"location":2,"content":"but it ensembled together sort of six different components in every place,"},{"from":1240.46,"to":1241.86,"location":2,"content":"and then did sort of,"},{"from":1241.86,"to":1245.22,"location":2,"content":"um, classify a combination on top of them."},{"from":1245.22,"to":1246.66,"location":2,"content":"But so far, the current-."},{"from":1246.66,"to":1251.85,"location":2,"content":"This is sort of around a sort of a 2003 question answering system,"},{"from":1251.85,"to":1255.12,"location":2,"content":"and so the kind of things that went through is,"},{"from":1255.12,"to":1256.98,"location":2,"content":"so when there was a question,"},{"from":1256.98,"to":1259.47,"location":2,"content":"it parsed the question with a 
parser"},{"from":1259.47,"to":1262.38,"location":2,"content":"kind of like the ones we saw with our dependency parsers."},{"from":1262.38,"to":1263.88,"location":2,"content":"It did some sort of"},{"from":1263.88,"to":1269.43,"location":2,"content":"handwritten semantic normalization rules to try and get them into a better semantic form."},{"from":1269.43,"to":1273.14,"location":2,"content":"It then had a question type classifier which tried to"},{"from":1273.14,"to":1276.89,"location":2,"content":"work out what kind of semantic type is this question looking for,"},{"from":1276.89,"to":1278.78,"location":2,"content":"is it looking for a person name,"},{"from":1278.78,"to":1279.89,"location":2,"content":"or a country name,"},{"from":1279.89,"to":1282.86,"location":2,"content":"or a temperature, or something like that."},{"from":1282.86,"to":1287.83,"location":2,"content":"Um, it would, um, then, um,"},{"from":1287.83,"to":1292.28,"location":2,"content":"have an information retrieval system out of the document collection,"},{"from":1292.28,"to":1297.57,"location":2,"content":"um, which would find paragraphs that were likely to contain the answers."},{"from":1297.57,"to":1300.51,"location":2,"content":"Um, and then it would have a method of ranking"},{"from":1300.51,"to":1305.17,"location":2,"content":"those paragraph choices to see which ones are likely to have the answers."},{"from":1305.17,"to":1307.74,"location":2,"content":"Um, it would then,"},{"from":1307.74,"to":1310.37,"location":2,"content":"um, over there somewhere, um,"},{"from":1310.37,"to":1316.32,"location":2,"content":"run Named Entity Recognition on those passages to find entities that were in them."},{"from":1316.32,"to":1319.52,"location":2,"content":"These systems depended strongly on the use of"},{"from":1319.52,"to":1322.35,"location":2,"content":"fine matching entities because then it could look for"},{"from":1322.35,"to":1325.76,"location":2,"content":"an entity which corresponded to the question type."},{"from":1325.76,"to":1329.97,"location":2,"content":"Um, then once it had candidate entities,"},{"from":1329.97,"to":1331.98,"location":2,"content":"it had to actually try and determine whether"},{"from":1331.98,"to":1334.98,"location":2,"content":"these entities did or didn't answer the question."},{"from":1334.98,"to":1338.74,"location":2,"content":"So, these people, this is the system from LCC by,"},{"from":1338.74,"to":1341.1,"location":2,"content":"um, Sanda Harabagiu and Dan Moldovan."},{"from":1341.1,"to":1343.61,"location":2,"content":"They actually had some quite interesting stuff here,"},{"from":1343.61,"to":1348.9,"location":2,"content":"where they had a kind of a loose theorem prover that would try and prove that, um,"},{"from":1348.9,"to":1351.51,"location":2,"content":"the semantic form of a piece of text,"},{"from":1351.51,"to":1354.12,"location":2,"content":"um, gave an answer to what the question was."},{"from":1354.12,"to":1358.41,"location":2,"content":"So, you know, that was kind of cool stuff with an Axiomatic Knowledge Base,"},{"from":1358.41,"to":1361.28,"location":2,"content":"um, and eventually out would come an answer."},{"from":1361.28,"to":1364.31,"location":2,"content":"Um, so, you know, something that is,"},{"from":1364.31,"to":1366.3,"location":2,"content":"I do just want to emphasize, you know,"},{"from":1366.3,"to":1370.05,"location":2,"content":"sometimes with these deep learning courses you get these days,"},{"from":1370.05,"to":1375.33,"location":2,"content":"the impression you have 
is that absolutely nothing worked before 2014,"},{"from":1375.33,"to":1377.44,"location":2,"content":"uh, when we got back to deep learning,"},{"from":1377.44,"to":1379.44,"location":2,"content":"and that's not actually true."},{"from":1379.44,"to":1381.57,"location":2,"content":"So, these kind of factoid question on,"},{"from":1381.57,"to":1383.97,"location":2,"content":"these kind of question answering systems within"},{"from":1383.97,"to":1387.13,"location":2,"content":"a certain domain actually really worked rather well."},{"from":1387.13,"to":1390.69,"location":2,"content":"Um, so, I started saying the word Factoid Question Answering,"},{"from":1390.69,"to":1393.12,"location":2,"content":"and so let me explain that because that's the secret."},{"from":1393.12,"to":1394.86,"location":2,"content":"So, people, at least in NLP,"},{"from":1394.86,"to":1397.96,"location":2,"content":"use the term \"Factoid Question Answering\" to mean"},{"from":1397.96,"to":1401.79,"location":2,"content":"the case that your answer is a named entity."},{"from":1401.79,"to":1403.89,"location":2,"content":"So, it's sort of something like, you know,"},{"from":1403.89,"to":1406.21,"location":2,"content":"what year was Elvis Presley born,"},{"from":1406.21,"to":1412.05,"location":2,"content":"or what is the name of Beyonce's husband, or, um,"},{"from":1412.05,"to":1415.32,"location":2,"content":"you know, which state,"},{"from":1415.32,"to":1418.74,"location":2,"content":"um, has the most pork or something, I don't know."},{"from":1418.74,"to":1420.24,"location":2,"content":"Right, anything that's got,"},{"from":1420.24,"to":1425.2,"location":2,"content":"anything that's sort of the answer is sort of some clear semantic type entity,"},{"from":1425.2,"to":1426.73,"location":2,"content":"and that's your answer."},{"from":1426.73,"to":1430.93,"location":2,"content":"I mean, so, within the space of those kind of questions,"},{"from":1430.93,"to":1435.19,"location":2,"content":"which actually is a significant part of the questions you get in web search, right?"},{"from":1435.19,"to":1438.63,"location":2,"content":"Lots of web search is just, you know,"},{"from":1438.63,"to":1441.12,"location":2,"content":"who was the star of this movie,"},{"from":1441.12,"to":1443.36,"location":2,"content":"or what year was somebody born, right?"},{"from":1443.36,"to":1445.79,"location":2,"content":"There's zillions of those all the time."},{"from":1445.79,"to":1448.71,"location":2,"content":"These systems actually really did work quite well"},{"from":1448.71,"to":1452.07,"location":2,"content":"that they could get about 70 percent of those questions right,"},{"from":1452.07,"to":1454.11,"location":2,"content":"um, which wasn't bad at all, um,"},{"from":1454.11,"to":1456.27,"location":2,"content":"though that they really sort of didn't really"},{"from":1456.27,"to":1459.38,"location":2,"content":"extend it out to other kinds of stuff beyond that."},{"from":1459.38,"to":1462.4,"location":2,"content":"But whatever virtues they had, um,"},{"from":1462.4,"to":1468.28,"location":2,"content":"they were extremely complex systems that people spent years put togeth- putting together,"},{"from":1468.28,"to":1472.88,"location":2,"content":"which had many components with a huge amount of hand-built stuff."},{"from":1472.88,"to":1479.04,"location":2,"content":"And most of the stuff was sort of built quite separately and tied together,"},{"from":1479.04,"to":1481.12,"location":2,"content":"and you just sort of hope that it 
worked,"},{"from":1481.12,"to":1484.05,"location":2,"content":"um, well, when put together in composite."},{"from":1484.05,"to":1487.69,"location":2,"content":"And so we can contrast that to what we then see later,"},{"from":1487.69,"to":1491.28,"location":2,"content":"um, for neural network-style systems."},{"from":1491.28,"to":1497.35,"location":2,"content":"Okay. Um, so let me now say some more stuff about, um,"},{"from":1497.35,"to":1502.87,"location":2,"content":"the Stanford Question Answering Dataset or SQuAD that I just mentioned a little bit ago,"},{"from":1502.87,"to":1507.06,"location":2,"content":"and as this is the data for the default final project as well."},{"from":1507.06,"to":1510.04,"location":2,"content":"Um, so what SQuAD has is,"},{"from":1510.04,"to":1513.49,"location":2,"content":"questions in SQuAD have a passage,"},{"from":1513.49,"to":1516.07,"location":2,"content":"which is a paragraph from Wikipedia."},{"from":1516.07,"to":1518.42,"location":2,"content":"And then there is a question,"},{"from":1518.42,"to":1521.76,"location":2,"content":"here it's, \"Which team won Super Bowl 50?\""},{"from":1521.76,"to":1527.27,"location":2,"content":"And the goal of the system is to come up with the answer to this question."},{"from":1527.27,"to":1530.43,"location":2,"content":"Um, human reading comprehension."},{"from":1530.43,"to":1532.35,"location":2,"content":"What is the answer to the question?"},{"from":1532.35,"to":1536.64,"location":2,"content":"[NOISE]"},{"from":1536.64,"to":1537.51,"location":2,"content":"Broncos."},{"from":1537.51,"to":1539.13,"location":2,"content":"Broncos. [LAUGHTER] Okay."},{"from":1539.13,"to":1542.73,"location":2,"content":"Yeah. Um, so that's the answer to the question."},{"from":1542.73,"to":1547.06,"location":2,"content":"Um, and so by construction for SQuAD,"},{"from":1547.06,"to":1553.57,"location":2,"content":"the answer to a question is always a sub-sequence of words from the passage which is,"},{"from":1553.57,"to":1556.35,"location":2,"content":"normally, it ends up being referred to as a span,"},{"from":1556.35,"to":1558.58,"location":2,"content":"a sub-sequence of words from the passage."},{"from":1558.58,"to":1561.67,"location":2,"content":"So that's the only kind of questions you can have."},{"from":1561.67,"to":1564.64,"location":2,"content":"You can't have questions that are counting questions,"},{"from":1564.64,"to":1567.13,"location":2,"content":"or yes, no questions, or anything like that."},{"from":1567.13,"to":1570.47,"location":2,"content":"You can just pick out a sub-sequence."},{"from":1570.47,"to":1572.26,"location":2,"content":"Um, okay."},{"from":1572.26,"to":1578.65,"location":2,"content":"But, um, so they created in the first version about 100,000 examples."},{"from":1578.65,"to":1582.04,"location":2,"content":"So there are a bunch of questions about each passage."},{"from":1582.04,"to":1584.2,"location":2,"content":"So it's sort of something like, um,"},{"from":1584.2,"to":1588.58,"location":2,"content":"I think it's maybe sort of about five questions per passage,"},{"from":1588.58,"to":1592.32,"location":2,"content":"and there are 20,000 different bits that Wikipedia uses, used."},{"from":1592.32,"to":1594.91,"location":2,"content":"Um, and this sort of must be a span form,"},{"from":1594.91,"to":1599.26,"location":2,"content":"as often referred to as extractive question answering."},{"from":1599.26,"to":1603.52,"location":2,"content":"Okay. 
Um, here's just one more example"},{"from":1603.52,"to":1607.54,"location":2,"content":"that can give you some more sense of some of the things that are there,"},{"from":1607.54,"to":1610.35,"location":2,"content":"and it illustrates a couple of other factors."},{"from":1610.35,"to":1612.76,"location":2,"content":"Um, so, you know,"},{"from":1612.76,"to":1616.36,"location":2,"content":"even this one, I guess the previous one wasn't, um,"},{"from":1616.36,"to":1619.6,"location":2,"content":"completely obvious what your answers should be because"},{"from":1619.6,"to":1622.9,"location":2,"content":"maybe you could say the answer should just have been Broncos,"},{"from":1622.9,"to":1625.72,"location":2,"content":"or you could have said it was Denver Broncos."},{"from":1625.72,"to":1627.34,"location":2,"content":"Um, and in general,"},{"from":1627.34,"to":1629.79,"location":2,"content":"even if you're answering with a span,"},{"from":1629.79,"to":1633.44,"location":2,"content":"there's gonna be variation as to how long a span you choose."},{"from":1633.44,"to":1636.04,"location":2,"content":"Um, so what they did, um,"},{"from":1636.04,"to":1638.68,"location":2,"content":"and so this was done with, on Mechanical Turk,"},{"from":1638.68,"to":1641.17,"location":2,"content":"gathering the data, or building questions,"},{"from":1641.17,"to":1645.79,"location":2,"content":"and getting answers, is that they got answers from three different people."},{"from":1645.79,"to":1646.9,"location":2,"content":"So here's this question,"},{"from":1646.9,"to":1649.81,"location":2,"content":"\"Along with non-governmental and non-state schools,"},{"from":1649.81,"to":1652.03,"location":2,"content":"what is another name for private schools?\""},{"from":1652.03,"to":1655.59,"location":2,"content":"And three human beings were asked the answer based on this passage."},{"from":1655.59,"to":1657.01,"location":2,"content":"And one said independent,"},{"from":1657.01,"to":1659.48,"location":2,"content":"and two said independent schools."},{"from":1659.48,"to":1662.95,"location":2,"content":"Um, this one, all three people gave the same answer."},{"from":1662.95,"to":1665.52,"location":2,"content":"This one, again, you get two different answers,"},{"from":1665.52,"to":1668.02,"location":2,"content":"so that they sample three answers."},{"from":1668.02,"to":1672.67,"location":2,"content":"And basically, then, you can be correct if you're going with any of the answers."},{"from":1672.67,"to":1679.33,"location":2,"content":"And so that sort of at least gives you a bit of robustness to variation in human answers."},{"from":1679.33,"to":1684.46,"location":2,"content":"Okay. And that starts me into the topic of evaluation."},{"from":1684.46,"to":1685.86,"location":2,"content":"Um, yeah."},{"from":1685.86,"to":1688.45,"location":2,"content":"And these slides here are entitled"},{"from":1688.45,"to":1692.14,"location":2,"content":"SQuAD version 1.1 because that means in five minutes time,"},{"from":1692.14,"to":1694.6,"location":2,"content":"I'm gonna tell you about SQuAD version 2,"},{"from":1694.6,"to":1696.64,"location":2,"content":"which adds a bit more stuff into it,"},{"from":1696.64,"to":1699.54,"location":2,"content":"but we'll just get 1.1 straight first."},{"from":1699.54,"to":1702.89,"location":2,"content":"All right. 
So there are three answers that col- were collected."},{"from":1702.89,"to":1705.28,"location":2,"content":"And so for evaluation metrics,"},{"from":1705.28,"to":1708.14,"location":2,"content":"they suggested two evaluation metrics."},{"from":1708.14,"to":1711.34,"location":2,"content":"The first one is exact match."},{"from":1711.34,"to":1714.25,"location":2,"content":"So you're going to return a span."},{"from":1714.25,"to":1717.97,"location":2,"content":"If the span is one of these three,"},{"from":1717.97,"to":1719.52,"location":2,"content":"you get one point,"},{"from":1719.52,"to":1720.82,"location":2,"content":"and if the scan,"},{"from":1720.82,"to":1722.98,"location":2,"content":"span is not one of these three,"},{"from":1722.98,"to":1725.18,"location":2,"content":"you get zero for that question."},{"from":1725.18,"to":1728.56,"location":2,"content":"And then your accuracy is just the percent correct,"},{"from":1728.56,"to":1730.35,"location":2,"content":"so that's extremely simple."},{"from":1730.35,"to":1732.91,"location":2,"content":"But the second metric, and actually,"},{"from":1732.91,"to":1735.98,"location":2,"content":"the one that was favored as the primary metric,"},{"from":1735.98,"to":1738.23,"location":2,"content":"was an F1 metric."},{"from":1738.23,"to":1741.5,"location":2,"content":"So what you do for this F1 metric"},{"from":1741.5,"to":1745.11,"location":2,"content":"is you're matching at the word level for the different answers."},{"from":1745.11,"to":1746.93,"location":2,"content":"So you've treat each,"},{"from":1746.93,"to":1752.28,"location":2,"content":"you treat the system span and each gold answer as a bag of words,"},{"from":1752.28,"to":1754.93,"location":2,"content":"and then you work out a precision, which is,"},{"from":1754.93,"to":1762.78,"location":2,"content":"um, the percent of words in the system's answer that are actually in a span,"},{"from":1762.78,"to":1765.77,"location":2,"content":"i- in a gold span, the recall,"},{"from":1765.77,"to":1771.62,"location":2,"content":"which is the percent of words in a gold span that are in the system's span."},{"from":1771.62,"to":1774.72,"location":2,"content":"And then you calculate the harmonic mean of those two numbers"},{"from":1774.72,"to":1777.76,"location":2,"content":"and the harmonic mean is sort of a very conservative average."},{"from":1777.76,"to":1780.46,"location":2,"content":"So it's close to the mean of those two numbers,"},{"from":1780.46,"to":1782.8,"location":2,"content":"and that gives you a score."},{"from":1782.8,"to":1787.38,"location":2,"content":"And what you then do is, for each question,"},{"from":1787.38,"to":1790.09,"location":2,"content":"you'd return, you say its score is"},{"from":1790.09,"to":1795.36,"location":2,"content":"the maximum F1 over the three different answers that were collected from human beings."},{"from":1795.36,"to":1798.85,"location":2,"content":"And then for the whole, um, dataset,"},{"from":1798.85,"to":1805.19,"location":2,"content":"you then average those F1 scores across questions and that's then your final F1 result."},{"from":1805.19,"to":1808.35,"location":2,"content":"So that's a more complicated thing to say."},{"from":1808.35,"to":1812.08,"location":2,"content":"Um, and we provide there sort of a val code,"},{"from":1812.08,"to":1813.97,"location":2,"content":"um, for you that does that."},{"from":1813.97,"to":1818.23,"location":2,"content":"Um, but it sort of seems that F1 is actually"},{"from":1818.23,"to":1824.2,"location":2,"content":"a more 
reliable and better measure because if you use exact match,"},{"from":1824.2,"to":1825.85,"location":2,"content":"you know, even though there's of,"},{"from":1825.85,"to":1829.53,"location":2,"content":"a bit of robustness that comes on three people's answers,"},{"from":1829.53,"to":1831.94,"location":2,"content":"three is not a very large sample,"},{"from":1831.94,"to":1834.31,"location":2,"content":"so there's sort of a bit of guessing as to whether you get"},{"from":1834.31,"to":1837.76,"location":2,"content":"exactly the same span some human being got,"},{"from":1837.76,"to":1841.18,"location":2,"content":"whereas you're sort of going to get a reasonable score"},{"from":1841.18,"to":1844.33,"location":2,"content":"in the F1 even if your boundaries are off by a little."},{"from":1844.33,"to":1847.35,"location":2,"content":"So the F1 metric sort of, um,"},{"from":1847.35,"to":1852.76,"location":2,"content":"is more reliable and avoids various kinds of artifacts as to how big"},{"from":1852.76,"to":1858.3,"location":2,"content":"or small an answer human beings tend to choose in some circumstances."},{"from":1858.3,"to":1860.65,"location":2,"content":"Um, and so that's sort of being used as"},{"from":1860.65,"to":1864.95,"location":2,"content":"the primary metric that people score people on in the leader boards."},{"from":1864.95,"to":1867.97,"location":2,"content":"Um, final detail, both metrics, um,"},{"from":1867.97,"to":1873.23,"location":2,"content":"ignore punctuation and the English articles a, an, the."},{"from":1873.23,"to":1877.39,"location":2,"content":"Okay. Um, so how did things work out?"},{"from":1877.39,"to":1881.17,"location":2,"content":"Um, so for SQuAD version 1.1, um."},{"from":1881.17,"to":1883.09,"location":2,"content":"A long time ago,"},{"from":1883.09,"to":1885.25,"location":2,"content":"at the end of 2016,"},{"from":1885.25,"to":1887.9,"location":2,"content":"um, this is how the leaderboard looked."},{"from":1887.9,"to":1890.68,"location":2,"content":"Um, this is the bottom of the leaderboard at this point in"},{"from":1890.68,"to":1894.14,"location":2,"content":"time because that allows me to show you a couple of things."},{"from":1894.14,"to":1896.89,"location":2,"content":"So down at the bottom of the leaderboard, um,"},{"from":1896.89,"to":1900.52,"location":2,"content":"so they tested how well human beings did, um,"},{"from":1900.52,"to":1902.83,"location":2,"content":"at answering these questions because you know,"},{"from":1902.83,"to":1905.88,"location":2,"content":"human beings aren't perfect at answering questions either."},{"from":1905.88,"to":1909.14,"location":2,"content":"Um, and so the human performance that they measured,"},{"from":1909.14,"to":1912.89,"location":2,"content":"um, had an F1 score of 91.2."},{"from":1912.89,"to":1916.29,"location":2,"content":"And I'll come back to that again in a minute."},{"from":1916.29,"to":1919.02,"location":2,"content":"Um, and so when they built the dataset,"},{"from":1919.02,"to":1924.79,"location":2,"content":"they built a logistic regression baseline which was sort of a conventional NLP system."},{"from":1924.79,"to":1929.32,"location":2,"content":"So, they dependency parsed the question and sentences of the answer."},{"from":1929.32,"to":1932.2,"location":2,"content":"They looked for dependency."},{"from":1932.2,"to":1934.78,"location":2,"content":"So dependency link matches,"},{"from":1934.78,"to":1938.35,"location":2,"content":"so a word at both ends with the dependency relation 
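The two metrics just described can be sketched in Python along the same lines as the official SQuAD evaluation script: normalize (lowercase, strip punctuation and the articles a/an/the), score a predicted span against each of the three collected answers, and take the max. Function names here are illustrative:

```python
import re
import string
from collections import Counter

def normalize(s):
    # Lowercase, drop punctuation and English articles, collapse whitespace,
    # matching the normalization described above.
    s = s.lower()
    s = "".join(ch for ch in s if ch not in string.punctuation)
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def exact_match(prediction, gold_answers):
    # 1 if the predicted span matches any collected gold answer, else 0.
    return float(any(normalize(prediction) == normalize(g) for g in gold_answers))

def f1(prediction, gold_answers):
    # Bag-of-words F1 against each gold answer; take the max over answers.
    def f1_single(pred, gold):
        pred_toks, gold_toks = normalize(pred).split(), normalize(gold).split()
        overlap = sum((Counter(pred_toks) & Counter(gold_toks)).values())
        if overlap == 0:
            return 0.0
        precision = overlap / len(pred_toks)   # pred words that are in the gold span
        recall = overlap / len(gold_toks)      # gold words that are in the pred span
        return 2 * precision * recall / (precision + recall)  # harmonic mean
    return max(f1_single(prediction, g) for g in gold_answers)
```

For example, with the independent-schools question above, `f1("independent schools", ["independent", "independent schools", "independent schools"])` is 1.0 (the max over the three answers), and the dataset-level score is just these per-question scores averaged across all questions.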
in"},{"from":1938.35,"to":1943.62,"location":2,"content":"between and count and matches of those and sort of pointing to a likely answer."},{"from":1943.62,"to":1949.8,"location":2,"content":"Um, so as sort of a fairly competently built traditional NLP system of it's"},{"from":1949.8,"to":1952.15,"location":2,"content":"not as complex as but it's sort of in"},{"from":1952.15,"to":1956.11,"location":2,"content":"the same vein of that early question answering system I mentioned."},{"from":1956.11,"to":1959.41,"location":2,"content":"And it got an F1 of about 51."},{"from":1959.41,"to":1961.22,"location":2,"content":"So not hopeless, um,"},{"from":1961.22,"to":1963.98,"location":2,"content":"but not that great compared to human beings."},{"from":1963.98,"to":1966.52,"location":2,"content":"And so, very shortly after that, um,"},{"from":1966.52,"to":1968.63,"location":2,"content":"people then started building"},{"from":1968.63,"to":1973.75,"location":2,"content":"neural network systems to try and do better at this task on this dataset."},{"from":1973.75,"to":1978.04,"location":2,"content":"And so, one of the first people to do this quite successfully,"},{"from":1978.04,"to":1981.58,"location":2,"content":"um, were these people from Singapore Management University,"},{"from":1981.58,"to":1985.15,"location":2,"content":"maybe not the first place you would have thought of but, um,"},{"from":1985.15,"to":1988.87,"location":2,"content":"they were really sort of the first people who showed that, yes,"},{"from":1988.87,"to":1992.32,"location":2,"content":"you could build an end-to-end trained neural network"},{"from":1992.32,"to":1995.32,"location":2,"content":"for this task and do rather better."},{"from":1995.32,"to":1998.93,"location":2,"content":"And so, they got up to 67 F1."},{"from":1998.93,"to":2002.1,"location":2,"content":"Um, and well, then they had a second system."},{"from":2002.1,"to":2004.99,"location":2,"content":"They got 70 and then things started,"},{"from":2004.99,"to":2008.14,"location":2,"content":"um, to, um, go on."},{"from":2008.14,"to":2009.67,"location":2,"content":"So that even by,"},{"from":2009.67,"to":2012.57,"location":2,"content":"um, the end of 2016,"},{"from":2012.57,"to":2018.18,"location":2,"content":"um, there started to be systems that really worked rather well on this task."},{"from":2018.18,"to":2020.98,"location":2,"content":"Um, so here, this time was the,"},{"from":2020.98,"to":2022.82,"location":2,"content":"um, top of the leaderboard."},{"from":2022.82,"to":2026.45,"location":2,"content":"So I'll talk later about this BiDAF system from, uh,"},{"from":2026.45,"to":2028.38,"location":2,"content":"the AI to,"},{"from":2028.38,"to":2031.8,"location":2,"content":"Allen Institute for Artificial Intelligence and the University of Washington."},{"from":2031.8,"to":2033.81,"location":2,"content":"So, it was getting to 77 as"},{"from":2033.81,"to":2037.77,"location":2,"content":"a single system that like in just about all machine learning,"},{"from":2037.77,"to":2040.26,"location":2,"content":"people pretty soon noticed that if you made"},{"from":2040.26,"to":2043.44,"location":2,"content":"an ensemble of identically structured systems,"},{"from":2043.44,"to":2046.83,"location":2,"content":"you could push the number higher and so if you ensemble those,"},{"from":2046.83,"to":2051.09,"location":2,"content":"you could then get another sort of whatever it is about four points"},{"from":2051.09,"to":2055.8,"location":2,"content":"and get up to 81, um, 
F1."},{"from":2055.8,"to":2062.45,"location":2,"content":"And so this was sort of around the situation when in the, uh, 2017, um,"},{"from":2062.45,"to":2070.44,"location":2,"content":"224N class, we first used SQuAD version one as jus- as a default final project."},{"from":2070.44,"to":2072.24,"location":2,"content":"And at that point, you know,"},{"from":2072.24,"to":2076.47,"location":2,"content":"actually the best students got almost to the top of this leaderboard."},{"from":2076.47,"to":2078.18,"location":2,"content":"So our best, um,"},{"from":2078.18,"to":2084.24,"location":2,"content":"CS224N Final Project in winter 2017 made it into,"},{"from":2084.24,"to":2087.69,"location":2,"content":"um, the equivalent of fourth place on this leaderboard,"},{"from":2087.69,"to":2091.08,"location":2,"content":"um, with 77.5 as their score."},{"from":2091.08,"to":2092.79,"location":2,"content":"So that was really rather cool."},{"from":2092.79,"to":2096.11,"location":2,"content":"Um, but that's a couple of years ago and since then,"},{"from":2096.11,"to":2098.1,"location":2,"content":"people have started building, um,"},{"from":2098.1,"to":2102.78,"location":2,"content":"bigger and bigger and more and more complex, um, systems."},{"from":2102.78,"to":2106.14,"location":2,"content":"And, um, so essentially,"},{"from":2106.14,"to":2110.79,"location":2,"content":"you could sort of say that SQuAD version one is basically solved."},{"from":2110.79,"to":2113.97,"location":2,"content":"So the very best systems are now getting"},{"from":2113.97,"to":2118.47,"location":2,"content":"F1 scores that are in the low 90s and in particular,"},{"from":2118.47,"to":2122.91,"location":2,"content":"you can see that the best couple of, um,"},{"from":2122.91,"to":2125.89,"location":2,"content":"systems have higher F1s and"},{"from":2125.89,"to":2131.25,"location":2,"content":"well higher exact matches than what was measured for human beings."},{"from":2131.25,"to":2134.14,"location":2,"content":"Uh, but like a lot of the claims of"},{"from":2134.14,"to":2137.31,"location":2,"content":"deep learning being better and performing from human being,"},{"from":2137.31,"to":2141,"location":2,"content":"than human beings, there's sort of some asterisks you can put after that."},{"from":2141,"to":2143.52,"location":2,"content":"I mean, in particular for this dataset,"},{"from":2143.52,"to":2148.13,"location":2,"content":"the way they measured human performance was a little bit"},{"from":2148.13,"to":2153.87,"location":2,"content":"unfair because they only actually collected three human beings' answers."},{"from":2153.87,"to":2158.34,"location":2,"content":"So, to judge, um, the human performance,"},{"from":2158.34,"to":2165.78,"location":2,"content":"the hu- those hu- each of those humans was being scored versus only two other humans."},{"from":2165.78,"to":2168.78,"location":2,"content":"And so, that means you only had two chances to match instead of three."},{"from":2168.78,"to":2173.82,"location":2,"content":"So, there's actually sort of a systematic underscoring of the human performance."},{"from":2173.82,"to":2177.74,"location":2,"content":"But whatever, systems got very good at doing this."},{"from":2177.74,"to":2180.96,"location":2,"content":"Um, so the next step, um,"},{"from":2180.96,"to":2182.52,"location":2,"content":"was then to introduce, uh,"},{"from":2182.52,"to":2185.45,"location":2,"content":"the SQuAD vers- version 2 task."},{"from":2185.45,"to":2189.99,"location":2,"content":"And so many people felt that a 
defect of SQuAD version"},{"from":2189.99,"to":2194.99,"location":2,"content":"1 was that in all cases, questions had answers."},{"from":2194.99,"to":2200.45,"location":2,"content":"So, that you just had to find the answer in the paragraph,"},{"from":2200.45,"to":2204.12,"location":2,"content":"um, and so that's sort of turned into a kind of a ranking task."},{"from":2204.12,"to":2208.36,"location":2,"content":"You just had to work out what seems the most likely answer."},{"from":2208.36,"to":2210.5,"location":2,"content":"I'll return that without really having"},{"from":2210.5,"to":2213.91,"location":2,"content":"any idea whether it was an answer to the question or not."},{"from":2213.91,"to":2216.53,"location":2,"content":"And so, for SQuAD version two,"},{"from":2216.53,"to":2218.79,"location":2,"content":"for the dev and test sets,"},{"from":2218.79,"to":2221.76,"location":2,"content":"half of the questions have answers and half of"},{"from":2221.76,"to":2224.95,"location":2,"content":"the questions just don't have an answer in the passage,"},{"from":2224.95,"to":2228.01,"location":2,"content":"um, it's slightly different distribution, the training data."},{"from":2228.01,"to":2232.78,"location":2,"content":"Um, and the way it works for scoring is the sort of, like,"},{"from":2232.78,"to":2238.92,"location":2,"content":"the no answer kind of counts as like one word as a sort of a special token."},{"from":2238.92,"to":2243.69,"location":2,"content":"So, if it's, if it should be a no answer and you say no answer,"},{"from":2243.69,"to":2248.58,"location":2,"content":"you get a score of one on the either exact match or the F-measure."},{"from":2248.58,"to":2250.56,"location":2,"content":"And if you don't do that,"},{"from":2250.56,"to":2252.21,"location":2,"content":"you get a score of zero."},{"from":2252.21,"to":2258.69,"location":2,"content":"Um, and so, the simplest way of approaching SQuAD 2.0 would be to say, well,"},{"from":2258.69,"to":2262.27,"location":2,"content":"rather than just always returning the best match in my system,"},{"from":2262.27,"to":2267.07,"location":2,"content":"I'll use some kind of threshold and only if the score is above a threshold,"},{"from":2267.07,"to":2268.78,"location":2,"content":"our counters and answer."},{"from":2268.78,"to":2271.05,"location":2,"content":"You could do more sophisticated things."},{"from":2271.05,"to":2274.08,"location":2,"content":"So another area that we've worked on quite a bit at Stanford is"},{"from":2274.08,"to":2278.52,"location":2,"content":"this natural language inference task that I'll talk about later in the course."},{"from":2278.52,"to":2282.84,"location":2,"content":"Um, but that's really about saying whether one piece of,"},{"from":2282.84,"to":2285.63,"location":2,"content":"um, text is the conclusion of another,"},{"from":2285.63,"to":2286.89,"location":2,"content":"um, piece of text."},{"from":2286.89,"to":2290.67,"location":2,"content":"And so that's sort of a way that you can try and see whether, uh,"},{"from":2290.67,"to":2297.12,"location":2,"content":"a piece of text actually gives you a justification and answer to what the question was."},{"from":2297.12,"to":2301.53,"location":2,"content":"But at any rate, this trying to decide whether"},{"from":2301.53,"to":2307.01,"location":2,"content":"you've actually got an answer or not is a quite difficult problem in many cases."},{"from":2307.01,"to":2311.88,"location":2,"content":"So here's an example from SQuAD, um, 
2.0."},{"from":2311.88,"to":2315.12,"location":2,"content":"So Genghis Khan united the Mongol and Turkic tribes of"},{"from":2315.12,"to":2318.86,"location":2,"content":"the steppes and became Great Khan in 1206."},{"from":2318.86,"to":2322.29,"location":2,"content":"He and his successors expanded the Mongol Empire across Asia,"},{"from":2322.29,"to":2323.94,"location":2,"content":"blah, blah, blah, blah."},{"from":2323.94,"to":2325.64,"location":2,"content":"And the question is,"},{"from":2325.64,"to":2328.26,"location":2,"content":"when did Genghis Khan kill Great Khan?"},{"from":2328.26,"to":2330.48,"location":2,"content":"And the answer to that is,"},{"from":2330.48,"to":2333.53,"location":2,"content":"you know, uh, there isn't an answer because actually,"},{"from":2333.53,"to":2339.15,"location":2,"content":"Genghis Khan was a person named Great Khan and he didn't kill a Great Khan."},{"from":2339.15,"to":2341.84,"location":2,"content":"It's just not a question with an answer."},{"from":2341.84,"to":2347.99,"location":2,"content":"Um, but it's precisely what happens with systems is, you know,"},{"from":2347.99,"to":2351.64,"location":2,"content":"even though these systems get high scores in terms of points,"},{"from":2351.64,"to":2355.98,"location":2,"content":"they don't actually understand human language that well."},{"from":2355.98,"to":2357.61,"location":2,"content":"So they look at something that says,"},{"from":2357.61,"to":2360.86,"location":2,"content":"when did Genghis Khan kill Great Khan?"},{"from":2360.86,"to":2363.93,"location":2,"content":"Well, this is something that's looking for a date and there are"},{"from":2363.93,"to":2367.74,"location":2,"content":"some obvious dates in this passage there's 1206, 1234,"},{"from":2367.74,"to":2371.84,"location":2,"content":"1251 and well, there's kill,"},{"from":2371.84,"to":2376.56,"location":2,"content":"and kill looks a little bit similar to destroyed."},{"from":2376.56,"to":2378.64,"location":2,"content":"I can see the word destroyed."},{"from":2378.64,"to":2381.34,"location":2,"content":"So that probably kind of matches."},{"from":2381.34,"to":2383.4,"location":2,"content":"And then we're talking about, um,"},{"from":2383.4,"to":2385.56,"location":2,"content":"Genghis Khan and there,"},{"from":2385.56,"to":2388.39,"location":2,"content":"I can see Genghis and Khan in this passage."},{"from":2388.39,"to":2390.96,"location":2,"content":"And so it sort of puts that together and says"},{"from":2390.96,"to":2395.18,"location":2,"content":"1234 is the answer when that isn't the answer at all."},{"from":2395.18,"to":2399.87,"location":2,"content":"And that's actually kind of pretty typical of the behavior of these systems."},{"from":2399.87,"to":2403.56,"location":2,"content":"And so that, on the one hand, they work great."},{"from":2403.56,"to":2406.16,"location":2,"content":"On the other hand, they don't actually understand that much,"},{"from":2406.16,"to":2410.03,"location":2,"content":"and effectively asking whether there's,"},{"from":2410.03,"to":2414.93,"location":2,"content":"this question is actually answered in the passage is a way of"},{"from":2414.93,"to":2417.36,"location":2,"content":"revealing the extent to which these models"},{"from":2417.36,"to":2420.95,"location":2,"content":"do or don't understand what's actually going on."},{"from":2420.95,"to":2423.91,"location":2,"content":"Okay. 
So, at the time, um,"},{"from":2423.91,"to":2427.09,"location":2,"content":"they built SQuAD version 2.0."},{"from":2427.09,"to":2428.84,"location":2,"content":"They took some of, um,"},{"from":2428.84,"to":2432.09,"location":2,"content":"the existing SQuAD version one's systems,"},{"from":2432.09,"to":2436.72,"location":2,"content":"and, um, modified them in a very simple way."},{"from":2436.72,"to":2439.28,"location":2,"content":"I put in a threshold, um,"},{"from":2439.28,"to":2443.18,"location":2,"content":"score as to how good the final match was deemed to be,"},{"from":2443.18,"to":2447.64,"location":2,"content":"and said, Well, how well do you do on SQuAD 2.0?"},{"from":2447.64,"to":2450.82,"location":2,"content":"And the kind of systems that we saw doing well before,"},{"from":2450.82,"to":2452.37,"location":2,"content":"now didn't do that well,"},{"from":2452.37,"to":2458.82,"location":2,"content":"so something like the BiDAF system that we mentioned before was now scoring about 62 F1,"},{"from":2458.82,"to":2461.37,"location":2,"content":"so that that was sort of hugely lowering"},{"from":2461.37,"to":2465.21,"location":2,"content":"its performance and reflecting the limits of understanding."},{"from":2465.21,"to":2469.65,"location":2,"content":"Um, but it turned out actually that this problem didn't prove to"},{"from":2469.65,"to":2474.24,"location":2,"content":"be q- quite as difficult as the dataset authors,"},{"from":2474.24,"to":2476.82,"location":2,"content":"um, maybe thought either."},{"from":2476.82,"to":2479.78,"location":2,"content":"Um, because it turns out that um,"},{"from":2479.78,"to":2483.38,"location":2,"content":"here we are now in February 2019,"},{"from":2483.38,"to":2486.28,"location":2,"content":"and if you look at the top of the leaderboard,"},{"from":2486.28,"to":2489.47,"location":2,"content":"we're kind of getting close again to the point"},{"from":2489.47,"to":2492.78,"location":2,"content":"where the best systems are almost as good as human beings."},{"from":2492.78,"to":2499.08,"location":2,"content":"So, um, the current top rate system there you can see is getting 87.6 F1,"},{"from":2499.08,"to":2503.22,"location":2,"content":"which is less than two points behind where the human beings are."},{"from":2503.22,"to":2507.51,"location":2,"content":"Um, the SQuAD version 2 they also co- corrected the,"},{"from":2507.51,"to":2509.4,"location":2,"content":"um, scoring of human beings,"},{"from":2509.4,"to":2512.8,"location":2,"content":"so it's more of a fair evaluation this time, um,"},{"from":2512.8,"to":2514.92,"location":2,"content":"so there's still a bit of a gap but, you know,"},{"from":2514.92,"to":2518.01,"location":2,"content":"the systems are actually doing, um, really well."},{"from":2518.01,"to":2521.04,"location":2,"content":"And the interesting thing there is,"},{"from":2521.04,"to":2524.63,"location":2,"content":"you know, on the one hand these systems are impressively good."},{"from":2524.63,"to":2526.89,"location":2,"content":"Um, you can go on the SQuAD website and look"},{"from":2526.89,"to":2529.28,"location":2,"content":"at the output of several of the good systems,"},{"from":2529.28,"to":2532.34,"location":2,"content":"and you can see that there are just a ton of things that they get right."},{"from":2532.34,"to":2534.33,"location":2,"content":"They're absolutely not bad systems."},{"from":2534.33,"to":2538.98,"location":2,"content":"You have to be a good system to be getting five out of six of the questions 
right."},{"from":2538.98,"to":2541.86,"location":2,"content":"Um, but, you know, on the other hand they still"},{"from":2541.86,"to":2545.13,"location":2,"content":"make quite elementary Natural Language Understanding Errors."},{"from":2545.13,"to":2548.3,"location":2,"content":"And so here's an example of one of those."},{"from":2548.3,"to":2549.72,"location":2,"content":"Okay, so this one,"},{"from":2549.72,"to":2552.54,"location":2,"content":"the Yuan dynasty is considered both a successor to"},{"from":2552.54,"to":2556.16,"location":2,"content":"the Mongol Empire and an imperial Chinese dynasty."},{"from":2556.16,"to":2558.84,"location":2,"content":"It was the khanate ruled by the successors of"},{"from":2558.84,"to":2562.66,"location":2,"content":"Mongke Khan after the division of the Mongol Empire."},{"from":2562.66,"to":2566.73,"location":2,"content":"In official Chinese histories the Yuan dynasty bore the Mandate of Heaven,"},{"from":2566.73,"to":2570.48,"location":2,"content":"following the Song dynasty and preceding the Ming dynasty."},{"from":2570.48,"to":2572.66,"location":2,"content":"Okay. And then the question is,"},{"from":2572.66,"to":2575.76,"location":2,"content":"what dynasty came before the Yuan?"},{"from":2575.76,"to":2578.49,"location":2,"content":"And that's a pretty easy question,"},{"from":2578.49,"to":2579.99,"location":2,"content":"I'd hope, for a human being."},{"from":2579.99,"to":2582.83,"location":2,"content":"Everyone can answer that question?"},{"from":2582.83,"to":2588.48,"location":2,"content":"Okay, um, yeah, so it says in official Chinese histories Yuan Dynast- uh,"},{"from":2588.48,"to":2589.92,"location":2,"content":"sorry the next sentence."},{"from":2589.92,"to":2592.56,"location":2,"content":"Um, yeah followed- right the Yuan Dynasty following"},{"from":2592.56,"to":2595.24,"location":2,"content":"the Song dynasty and preceding the Ming dynasty."},{"from":2595.24,"to":2597.55,"location":2,"content":"But, you know actually um,"},{"from":2597.55,"to":2600.96,"location":2,"content":"this sort of the leading um,"},{"from":2600.96,"to":2605.31,"location":2,"content":"Google BERT model says that it was the Ming dynasty that came before"},{"from":2605.31,"to":2609.45,"location":2,"content":"the Yuan Dynasty which you know is sort of elementarily"},{"from":2609.45,"to":2613.32,"location":2,"content":"wrong that reveals some of the same kind of it's"},{"from":2613.32,"to":2618.24,"location":2,"content":"not really understanding everything but it's doing a sort of a matching problem still."},{"from":2618.24,"to":2625.62,"location":2,"content":"Okay. 
So, this SQuAD dataset has been useful and good."},{"from":2625.62,"to":2628.86,"location":2,"content":"It still has some major limitations and I just thought I'd"},{"from":2628.86,"to":2632.37,"location":2,"content":"mentioned what a few of those are so you're aware of some of the issues."},{"from":2632.37,"to":2634.95,"location":2,"content":"So one of them I've already mentioned, right,"},{"from":2634.95,"to":2640.74,"location":2,"content":"that you're in this space where all answers are a span from the passage."},{"from":2640.74,"to":2643.89,"location":2,"content":"And that just limits the kind of questions you can"},{"from":2643.89,"to":2647.03,"location":2,"content":"ask and the kind of difficult situations there can be."},{"from":2647.03,"to":2650.37,"location":2,"content":"So, there can't be yes-no questions counting"},{"from":2650.37,"to":2655.78,"location":2,"content":"questions or even any of the sort of more difficult implicit questions."},{"from":2655.78,"to":2661.18,"location":2,"content":"So, if you think back to when you were in middle school and did reading comprehension,"},{"from":2661.18,"to":2663.82,"location":2,"content":"I mean, it wasn't typically um,"},{"from":2663.82,"to":2667.44,"location":2,"content":"the case um, that you're being asked"},{"from":2667.44,"to":2671.4,"location":2,"content":"questions that were just stated explicitly in the text of,"},{"from":2671.4,"to":2674.88,"location":2,"content":"you know, Sue is visiting her mother in Miami."},{"from":2674.88,"to":2676.34,"location":2,"content":"And the question was,"},{"from":2676.34,"to":2678.32,"location":2,"content":"who was visiting in Miami?"},{"from":2678.32,"to":2683.73,"location":2,"content":"That wasn't the kind of questions you were asked you were normally asked questions um,"},{"from":2683.73,"to":2686.31,"location":2,"content":"like um, you know,"},{"from":2686.31,"to":2692.51,"location":2,"content":"um, Sue is going to a job interview this morning,"},{"from":2692.51,"to":2696.36,"location":2,"content":"um, it's a really important job interview for her future."},{"from":2696.36,"to":2699.43,"location":2,"content":"At breakfast she um,"},{"from":2699.43,"to":2703.39,"location":2,"content":"starts buttering both sides of her piece of toast um,"},{"from":2703.39,"to":2706.41,"location":2,"content":"and you are asked a question like, um,"},{"from":2706.41,"to":2711.32,"location":2,"content":"why um, is Sue buttering both sides of her piece of toast?"},{"from":2711.32,"to":2713.42,"location":2,"content":"And you're meant to be able to answer,"},{"from":2713.42,"to":2717.68,"location":2,"content":"\"She's distracted by her important job interview coming up later in the day.\""},{"from":2717.68,"to":2720.99,"location":2,"content":"Which isn't the- something that you can answer um,"},{"from":2720.99,"to":2723.51,"location":2,"content":"by just picking out a sub span."},{"from":2723.51,"to":2731.05,"location":2,"content":"Um, a second problem which is sort of actually a bigger problem is um,"},{"from":2731.05,"to":2735.64,"location":2,"content":"the way SQuAD was constructed for ease"},{"from":2735.64,"to":2741.97,"location":2,"content":"and not to be too expensive and various other reasons was um,"},{"from":2741.97,"to":2746.24,"location":2,"content":"paragraphs of Wikipedia were selected and then,"},{"from":2746.24,"to":2748.68,"location":2,"content":"Mechanical Turkers were hired to say,"},{"from":2748.68,"to":2751.22,"location":2,"content":"\"Come up with some questions 
um,"},{"from":2751.22,"to":2756.21,"location":2,"content":"that can be answered by this this passage in version 1.1.\""},{"from":2756.21,"to":2759.32,"location":2,"content":"And then in version two they were said- told,"},{"from":2759.32,"to":2763.17,"location":2,"content":"\"Also come up with some questions that"},{"from":2763.17,"to":2767.39,"location":2,"content":"look like they're related to this passage but aren't actually answered in the passage.\""},{"from":2767.39,"to":2770.07,"location":2,"content":"But, in all cases people were coming up with"},{"from":2770.07,"to":2774.87,"location":2,"content":"the questions staring at the passage and if you do that,"},{"from":2774.87,"to":2778.26,"location":2,"content":"it means that your questions are strongly"},{"from":2778.26,"to":2781.91,"location":2,"content":"overlapping with the passage both in terms of the,"},{"from":2781.91,"to":2786.63,"location":2,"content":"the words that are used and even the syntactic structures that are"},{"from":2786.63,"to":2791.52,"location":2,"content":"used for your questions tending to match the syntactic structures of the passage."},{"from":2791.52,"to":2797.09,"location":2,"content":"And so that makes question answering um, naturally easy."},{"from":2797.09,"to":2799.13,"location":2,"content":"What happens in the real world,"},{"from":2799.13,"to":2802.26,"location":2,"content":"is this human beings think up questions and"},{"from":2802.26,"to":2806.01,"location":2,"content":"type something into a search engine and the way"},{"from":2806.01,"to":2809.36,"location":2,"content":"that they type it in is completely distinct"},{"from":2809.36,"to":2813.07,"location":2,"content":"from the way something might be worded on a website."},{"from":2813.07,"to":2816.6,"location":2,"content":"So that they might be saying something like,"},{"from":2816.6,"to":2822.72,"location":2,"content":"you know, \"In what year did the price of hard disks drop below a dollar a megabyte?\""},{"from":2822.72,"to":2827.22,"location":2,"content":"Um, and the webpage will say something like"},{"from":2827.22,"to":2832.05,"location":2,"content":"the cost of hard disks has being dropping for many years um,"},{"from":2832.05,"to":2838.47,"location":2,"content":"in I know whenever it was 2004 prices eventually crossed um,"},{"from":2838.47,"to":2840.87,"location":2,"content":"the dollar megabyte barrier or something like that."},{"from":2840.87,"to":2844.78,"location":2,"content":"But there's a quite different discussion of the ideas."},{"from":2844.78,"to":2848.22,"location":2,"content":"And that kinda matching is much harder and that's one of"},{"from":2848.22,"to":2852.27,"location":2,"content":"the things that people have done other datasets have tried to do differently."},{"from":2852.27,"to":2855.96,"location":2,"content":"Um, another limitation is that these questions and"},{"from":2855.96,"to":2860.36,"location":2,"content":"answers are very much, find the sentence that's addressing the fact,"},{"from":2860.36,"to":2862.55,"location":2,"content":"match your question to the sentence,"},{"from":2862.55,"to":2865.08,"location":2,"content":"return the right thing,"},{"from":2865.08,"to":2869.4,"location":2,"content":"that there's nothing sort of more difficult than involves multi sentence,"},{"from":2869.4,"to":2873.21,"location":2,"content":"combine facts together styles of inferencing,"},{"from":2873.21,"to":2877.05,"location":2,"content":"that the limits of cross sentence stuff there is pretty much limited 
to"},{"from":2877.05,"to":2881.3,"location":2,"content":"resolving co-reference which is something we'll talk about later in the class,"},{"from":2881.3,"to":2884.31,"location":2,"content":"that means that you see a he or she or an it,"},{"from":2884.31,"to":2889.13,"location":2,"content":"and you can work out who that refers to earlier in the, this course."},{"from":2889.13,"to":2892.59,"location":2,"content":"Um, nevertheless, despite all those disadvantages,"},{"from":2892.59,"to":2895.23,"location":2,"content":"it sort of proved that SQuAD was, you know,"},{"from":2895.23,"to":2900.18,"location":2,"content":"well-targeted in terms of its level of difficulty, well-structured,"},{"from":2900.18,"to":2902.91,"location":2,"content":"clean dataset, and it's just been"},{"from":2902.91,"to":2907.14,"location":2,"content":"sort of everybody's favorite for a question answering dataset."},{"from":2907.14,"to":2910.08,"location":2,"content":"It also seems to have proved that actually for"},{"from":2910.08,"to":2913.53,"location":2,"content":"people who work in industry and want to build a question answering system,"},{"from":2913.53,"to":2916.01,"location":2,"content":"starting off by training a model in SQuAD,"},{"from":2916.01,"to":2919.23,"location":2,"content":"actually turns out to work pretty well it turns out."},{"from":2919.23,"to":2921.42,"location":2,"content":"I mean, it's not everything you want to do."},{"from":2921.42,"to":2926.25,"location":2,"content":"You definitely wanna have relevant in domain data and be using that as well,"},{"from":2926.25,"to":2930.45,"location":2,"content":"but you know, it turns out that it seems to actually be a quite useful starting point."},{"from":2930.45,"to":2935.86,"location":2,"content":"Okay. So, what I wanted to show you now was a- is a concrete,"},{"from":2935.86,"to":2940.71,"location":2,"content":"simple, neural question answering system."},{"from":2940.71,"to":2948.3,"location":2,"content":"Um, and this is the model that was built by here and I guess she was"},{"from":2948.3,"to":2955.86,"location":2,"content":"sort of an Abby predecessor since she was the preceding head TA for CS 224N."},{"from":2955.86,"to":2958.65,"location":2,"content":"Um, so this system,"},{"from":2958.65,"to":2961.83,"location":2,"content":"um, Stanford Attentive Reader it kind of gets called now."},{"from":2961.83,"to":2964.57,"location":2,"content":"I mean, this is sort of essentially"},{"from":2964.57,"to":2969.99,"location":2,"content":"the simplest neural question answering system that works pretty well."},{"from":2969.99,"to":2972.78,"location":2,"content":"So, it's not a bad thing to have in mind as"},{"from":2972.78,"to":2976.32,"location":2,"content":"a baseline and it's not the current state of the art by any means."},{"from":2976.32,"to":2980.79,"location":2,"content":"But you know, if you're sort of wondering what's the simplest thing that I can build"},{"from":2980.79,"to":2985.22,"location":2,"content":"that basically works as a question answering system decently,"},{"from":2985.22,"to":2987.32,"location":2,"content":"this is basically it."},{"from":2987.32,"to":2990.39,"location":2,"content":"Um, okay. 
So how does this work?"},{"from":2990.39,"to":2992.59,"location":2,"content":"So the way it works is like this."},{"from":2992.59,"to":2993.93,"location":2,"content":"So, first of all,"},{"from":2993.93,"to":2998.2,"location":2,"content":"we have a question which team won Super Bowl 50?"},{"from":2998.2,"to":3004.18,"location":2,"content":"And what we're gonna wanna do is build a representation of a question as a vector."},{"from":3004.18,"to":3006.92,"location":2,"content":"And the way we can do that is like this,"},{"from":3006.92,"to":3009.03,"location":2,"content":"for each word in the question,"},{"from":3009.03,"to":3010.84,"location":2,"content":"we look up a word embedding."},{"from":3010.84,"to":3015.44,"location":2,"content":"So, in particular it used GloVe- GloVe 300 dimensional word embeddings."},{"from":3015.44,"to":3019.24,"location":2,"content":"Um, we then run an LSTM"},{"from":3019.24,"to":3023.33,"location":2,"content":"forward through the question and then kind of like Abby talked about,"},{"from":3023.33,"to":3025.3,"location":2,"content":"we actually make it a bi-LSTM."},{"from":3025.3,"to":3029.03,"location":2,"content":"So, we run a second LSTM backwards through the question."},{"from":3029.03,"to":3034.88,"location":2,"content":"And so then, we grab the end state of both LSTMs"},{"from":3034.88,"to":3040.76,"location":2,"content":"and we simply concatenate them together into a vector of dimension 2D if,"},{"from":3040.76,"to":3043.73,"location":2,"content":"if our hidden states of the LSTM are dimension"},{"from":3043.73,"to":3048.43,"location":2,"content":"d and we say that is the representation of the question."},{"from":3048.43,"to":3051.24,"location":2,"content":"Okay. So, once we have that,"},{"from":3051.24,"to":3054.23,"location":2,"content":"we then start looking at the passage."},{"from":3054.23,"to":3057.64,"location":2,"content":"And so, for the start of dealing with the passage,"},{"from":3057.64,"to":3059.18,"location":2,"content":"we do the same thing."},{"from":3059.18,"to":3063.11,"location":2,"content":"We, um, look up a word vector for every word in"},{"from":3063.11,"to":3067.34,"location":2,"content":"the passage and we run a bidirectional LSTM,"},{"from":3067.34,"to":3072.2,"location":2,"content":"now being represented a bit more compactly um, across the passage."},{"from":3072.2,"to":3075.71,"location":2,"content":"But then we have to do a little bit more work because we actually"},{"from":3075.71,"to":3079.04,"location":2,"content":"have to find the answer in the passage."},{"from":3079.04,"to":3081.68,"location":2,"content":"And so what we're gonna do is use"},{"from":3081.68,"to":3088.18,"location":2,"content":"the question representation to sort of work out where the answer is using attention."},{"from":3088.18,"to":3091.8,"location":2,"content":"So this is a different use of attention to machine translation."},{"from":3091.8,"to":3095.11,"location":2,"content":"That kind of attention equations are still exactly the same."},{"from":3095.11,"to":3099.17,"location":2,"content":"But we've now got this sort of one question vector that we gonna be trying to"},{"from":3099.17,"to":3103.39,"location":2,"content":"match against to return the answer."},{"from":3103.39,"to":3107.15,"location":2,"content":"So, what we do is we, um,"},{"from":3107.15,"to":3111.13,"location":2,"content":"work out an attention score between"},{"from":3111.13,"to":3117.57,"location":2,"content":"each word's bi-LSTM representation and the 
question."},{"from":3117.57,"to":3122.93,"location":2,"content":"And so the way that's being done is we're using this bi-linear attention,"},{"from":3122.93,"to":3127.37,"location":2,"content":"um, that um, Abby briefly discussed and we'll see more of today."},{"from":3127.37,"to":3129.14,"location":2,"content":"We've got the question vector,"},{"from":3129.14,"to":3132.53,"location":2,"content":"the vector for a particular position in the passage"},{"from":3132.53,"to":3135.77,"location":2,"content":"to the two concatenated LSTM hidden states."},{"from":3135.77,"to":3137.93,"location":2,"content":"So they're the same dimensionality."},{"from":3137.93,"to":3141.02,"location":2,"content":"We have this intervening learn W matrix."},{"from":3141.02,"to":3143.36,"location":2,"content":"So, we work out that quantity,"},{"from":3143.36,"to":3145.11,"location":2,"content":"um, for each position,"},{"from":3145.11,"to":3147.89,"location":2,"content":"and then we put that through a softmax which will give us"},{"from":3147.89,"to":3152.18,"location":2,"content":"probabilities over the different words in the passage."},{"from":3152.18,"to":3154.22,"location":2,"content":"Um, and those give us,"},{"from":3154.22,"to":3156.66,"location":2,"content":"um, our attention weights."},{"from":3156.66,"to":3159.35,"location":2,"content":"And so at that point we have attention weights,"},{"from":3159.35,"to":3162.14,"location":2,"content":"um, for different positions, um,"},{"from":3162.14,"to":3165.41,"location":2,"content":"in the passage and we just declare that,"},{"from":3165.41,"to":3167.03,"location":2,"content":"um, that is where,"},{"from":3167.03,"to":3169.61,"location":2,"content":"um, the answer starts."},{"from":3169.61,"to":3173.27,"location":2,"content":"Um, and then to get the end of the answer,"},{"from":3173.27,"to":3181.31,"location":2,"content":"we simply do exactly the same thing again apart from we train a different W matrix here,"},{"from":3181.31,"to":3182.84,"location":2,"content":"and we have that,"},{"from":3182.84,"to":3184.94,"location":2,"content":"um, predict the end token."},{"from":3184.94,"to":3187.49,"location":2,"content":"And there's something a little bit subtle here."},{"from":3187.49,"to":3190.61,"location":2,"content":"Um, because, you know, really we're asking it to sort"},{"from":3190.61,"to":3193.68,"location":2,"content":"of predict the starts and the ends of the answer,"},{"from":3193.68,"to":3195.83,"location":2,"content":"and you might think, but wait a minute."},{"from":3195.83,"to":3199.59,"location":2,"content":"Surely, we need to look at the middle of the answer as well because maybe the,"},{"from":3199.59,"to":3203.41,"location":2,"content":"the most indicative words are actually going to be in the middle of the answer."},{"from":3203.41,"to":3207.71,"location":2,"content":"Um, but, you know, really really what we're,"},{"from":3207.71,"to":3212.96,"location":2,"content":"we're sort of implicitly telling the model of well,"},{"from":3212.96,"to":3217.05,"location":2,"content":"when you're training, if there's stuff in the middle that's useful,"},{"from":3217.05,"to":3222.44,"location":2,"content":"it's the bi-LSTM's job to push it to the extremes of the span,"},{"from":3222.44,"to":3227.07,"location":2,"content":"so that this simple bi-linear attention"},{"from":3227.07,"to":3231.95,"location":2,"content":"will be able to get a big score at the start of the span."},{"from":3231.95,"to":3235.04,"location":2,"content":"And you might also think there's 
something"},{"from":3235.04,"to":3238.37,"location":2,"content":"funny that this equation and that equation are exactly the same."},{"from":3238.37,"to":3242.27,"location":2,"content":"So, how come one of them is meant to know it's picking up beginning, um,"},{"from":3242.27,"to":3244.4,"location":2,"content":"and the other at the end?"},{"from":3244.4,"to":3247.47,"location":2,"content":"And again, you know, we're not doing anything to impose that."},{"from":3247.47,"to":3249.89,"location":2,"content":"We're just saying, neural network."},{"from":3249.89,"to":3251.91,"location":2,"content":"It is your job to learn."},{"from":3251.91,"to":3256.11,"location":2,"content":"Um, you have to learn a matrix here and a different one over there,"},{"from":3256.11,"to":3260.24,"location":2,"content":"so that one of them will pick out parts of the representation that"},{"from":3260.24,"to":3265.18,"location":2,"content":"indicate starts of answer spans and the other one ends of answer spans."},{"from":3265.18,"to":3268.16,"location":2,"content":"And so, that will then again pressure"},{"from":3268.16,"to":3271.55,"location":2,"content":"the neural network to sort of self organize itself in"},{"from":3271.55,"to":3274.1,"location":2,"content":"such a way that there'll be some parts of"},{"from":3274.1,"to":3278.27,"location":2,"content":"this hidden representation that will be good at learning starts of spans."},{"from":3278.27,"to":3280.01,"location":2,"content":"You know, maybe there'll be carried backwards by"},{"from":3280.01,"to":3283.52,"location":2,"content":"the backwards LSTM and and some parts of it will be good at"},{"from":3283.52,"to":3285.98,"location":2,"content":"learning where the spans end and then"},{"from":3285.98,"to":3290.61,"location":2,"content":"the W matrix will be able to pick out those parts of the representation."},{"from":3290.61,"to":3294.13,"location":2,"content":"Um, but yeah, uh,"},{"from":3294.13,"to":3298.36,"location":2,"content":"that's the system. 
Um, yeah."},{"from":3298.36,"to":3300.64,"location":2,"content":"So, um, so this is"},{"from":3300.64,"to":3305.98,"location":2,"content":"the basic Stanford Attentive Reader model and it's just no more complex than that."},{"from":3305.98,"to":3308.77,"location":2,"content":"Um, and the interesting thing is, you know,"},{"from":3308.77,"to":3314.24,"location":2,"content":"that very simple model actually works nicely well."},{"from":3314.24,"to":3316.36,"location":2,"content":"Um, so this is going back in time."},{"from":3316.36,"to":3323.23,"location":2,"content":"Again, this was the February 2017 SQuAD version 1 leaderboard."},{"from":3323.23,"to":3328.69,"location":2,"content":"Um, but at that time, that provide- like,"},{"from":3328.69,"to":3332.68,"location":2,"content":"it always in neural networks quite a bit of your success"},{"from":3332.68,"to":3339.28,"location":2,"content":"is training your hyperparameters and optimizing your model really well."},{"from":3339.28,"to":3341.26,"location":2,"content":"And some time, you know,"},{"from":3341.26,"to":3347.02,"location":2,"content":"it's been repeatedly proven in neural network land that often you can get"},{"from":3347.02,"to":3350.17,"location":2,"content":"much better scores than you would think from"},{"from":3350.17,"to":3353.84,"location":2,"content":"very simple models if you optimize them really well."},{"from":3353.84,"to":3357.28,"location":2,"content":"So there have been multiple cycles in sort of"},{"from":3357.28,"to":3359.83,"location":2,"content":"deep learning research where there"},{"from":3359.83,"to":3362.95,"location":2,"content":"was a paper that did something and then the next person says,"},{"from":3362.95,"to":3364.96,"location":2,"content":"\"Here's a more- more- more complex model that"},{"from":3364.96,"to":3367.54,"location":2,"content":"works better,\" and then someone else published a paper saying,"},{"from":3367.54,"to":3369.64,"location":2,"content":"\"Here's an even more complex than that model that works"},{"from":3369.64,"to":3372.49,"location":2,"content":"better,\" and then someone points out, \"No."},{"from":3372.49,"to":3377.14,"location":2,"content":"If you go back to the first model and just really train its hyperparameters well,"},{"from":3377.14,"to":3379.38,"location":2,"content":"you can beat both of those two models.\""},{"from":3379.38,"to":3381.88,"location":2,"content":"And that was effectively the case about what"},{"from":3381.88,"to":3384.61,"location":2,"content":"was happening with the Stanford Attentive Reader."},{"from":3384.61,"to":3389.24,"location":2,"content":"That, you know, back in- back in February 2017,"},{"from":3389.24,"to":3392.92,"location":2,"content":"if you just train this model really well,"},{"from":3392.92,"to":3397.99,"location":2,"content":"it could actually outperform most of the early SQuAD systems."},{"from":3397.99,"to":3399.24,"location":2,"content":"I mean, in particular,"},{"from":3399.24,"to":3401.88,"location":2,"content":"it could outperform, um, the BiDAF,"},{"from":3401.88,"to":3406.39,"location":2,"content":"the version of BiDAF that was around in early 2017 and,"},{"from":3406.39,"to":3409.32,"location":2,"content":"you know, various of these other systems from other people."},{"from":3409.32,"to":3411.34,"location":2,"content":"But it was actually, at that time,"},{"from":3411.34,"to":3415.41,"location":2,"content":"it was pretty close to the best system that anyone had built."},{"from":3415.41,"to":3417.97,"location":2,"content":"Um, as I've 
already pointed out to you,"},{"from":3417.97,"to":3420.28,"location":2,"content":"um, the numbers have gone up a lot since then."},{"from":3420.28,"to":3422.5,"location":2,"content":"So I'm not claiming that, um,"},{"from":3422.5,"to":3428.78,"location":2,"content":"this system is still as good as the best systems that you can build. But there you go."},{"from":3428.78,"to":3433,"location":2,"content":"Um, so that's the simple system that already works pretty well,"},{"from":3433,"to":3435.07,"location":2,"content":"but of course you want this system to work better."},{"from":3435.07,"to":3439.69,"location":2,"content":"Um, and so Danqi did quite a bit of work on that."},{"from":3439.69,"to":3443.3,"location":2,"content":"And so here I'll just mention a few things for, um,"},{"from":3443.3,"to":3446.13,"location":2,"content":"Stanford Attentive Reader++ as to"},{"from":3446.13,"to":3449.64,"location":2,"content":"what kind of things can you do to make the model better."},{"from":3449.64,"to":3454.7,"location":2,"content":"And so here's a sort of a picture of, um,"},{"from":3454.7,"to":3457.96,"location":2,"content":"the sort of the improved system and we'll go through"},{"from":3457.96,"to":3461.29,"location":2,"content":"some of the differences and what makes it better."},{"from":3461.29,"to":3465.19,"location":2,"content":"Um, there's something I didn't have before that I should just mention, right?"},{"from":3465.19,"to":3470.22,"location":2,"content":"Sort of this whole model, all the parameters of this model are just trained end to end,"},{"from":3470.22,"to":3473.98,"location":2,"content":"where your training objective is simply, um,"},{"from":3473.98,"to":3476.38,"location":2,"content":"working out how accurately you're predicting"},{"from":3476.38,"to":3479.05,"location":2,"content":"the start position and how accurately you're predicting"},{"from":3479.05,"to":3482.68,"location":2,"content":"the end position so that the attention gives"},{"from":3482.68,"to":3486.51,"location":2,"content":"you a probability distribution over start positions and end positions."},{"from":3486.51,"to":3489.82,"location":2,"content":"So you're just being asked what probability estimate"},{"from":3489.82,"to":3493.33,"location":2,"content":"are you giving to the true start position and the true end position."},{"from":3493.33,"to":3495.25,"location":2,"content":"And to the extent that though,"},{"from":3495.25,"to":3497.29,"location":2,"content":"you know, those aren't one,"},{"from":3497.29,"to":3502.38,"location":2,"content":"you've then got loss that is then being sort of summed in terms of log probability."},{"from":3502.38,"to":3505.57,"location":2,"content":"Okay. 
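A sketch of that end-to-end training objective, reusing the hypothetical model above (`model`, `optimizer`, and the `batch` fields are assumed names): the two attention softmaxes are read as probability distributions over start and end positions, and the loss sums the negative log-probabilities assigned to the true start and true end.

```python
import torch.nn.functional as F

# Hypothetical training step for the sketch above.
log_p_start, log_p_end = model(batch.question, batch.passage)   # each (batch, T)
loss = F.nll_loss(log_p_start, batch.answer_start) + \
       F.nll_loss(log_p_end, batch.answer_end)   # summed negative log-probabilities
optimizer.zero_grad()
loss.backward()        # every parameter is trained end to end from this one signal
optimizer.step()
```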
So how is this model, um,"},{"from":3505.57,"to":3508.86,"location":2,"content":"more complex now than what I showed before?"},{"from":3508.86,"to":3511.95,"location":2,"content":"Essentially in two main ways."},{"from":3511.95,"to":3516.37,"location":2,"content":"So the first one is looking at the question,"},{"from":3516.37,"to":3520.07,"location":2,"content":"we still run the BiLSTM as before."},{"from":3520.07,"to":3524.53,"location":2,"content":"Um, but now what we're going to do is it's a little bit crude"},{"from":3524.53,"to":3528.85,"location":2,"content":"just to take the end states of the LSTM and concatenate them together."},{"from":3528.85,"to":3534.28,"location":2,"content":"It turns out that you can do better by making use of all states in an LSTM."},{"from":3534.28,"to":3537.88,"location":2,"content":"And this is true for most tasks where you"},{"from":3537.88,"to":3541.97,"location":2,"content":"want some kind of sentence representation from a sequence model."},{"from":3541.97,"to":3544.59,"location":2,"content":"It turns out you can generally gain by using"},{"from":3544.59,"to":3547.51,"location":2,"content":"all of them rather than just the endpoints or that."},{"from":3547.51,"to":3552.68,"location":2,"content":"Um, so but this is just an interesting general thing to know again because, you know,"},{"from":3552.68,"to":3558.41,"location":2,"content":"this is actually another variant of how that- how you can use attention."},{"from":3558.41,"to":3565.53,"location":2,"content":"There are, you know, a lot of sort of the last two years of neural NLP can be summed"},{"from":3565.53,"to":3569.23,"location":2,"content":"up as people have found a lot of clever ways to use"},{"from":3569.23,"to":3573.22,"location":2,"content":"attention and that's been pairing just about all the advances."},{"from":3573.22,"to":3581.89,"location":2,"content":"Um, so what we wanna do is we want to have attention over the positions in this LSTM."},{"from":3581.89,"to":3586.26,"location":2,"content":"But, you know, this- we're processing the query first."},{"from":3586.26,"to":3591.36,"location":2,"content":"So it sort of seems like we've got nothing to calculate attention with respect to."},{"from":3591.36,"to":3595.15,"location":2,"content":"So what we do is we just invent something."},{"from":3595.15,"to":3596.86,"location":2,"content":"So we just sort of invent."},{"from":3596.86,"to":3601.66,"location":2,"content":"Here is a vector and it's sometimes called a sentinel or some word like that,"},{"from":3601.66,"to":3603.85,"location":2,"content":"but, you know, we just in our PyTorch say,"},{"from":3603.85,"to":3605.18,"location":2,"content":"\"Here is a vector."},{"from":3605.18,"to":3607.69,"location":2,"content":"Um, we're going to calculate, um,"},{"from":3607.69,"to":3609.46,"location":2,"content":"we initialize it randomly,"},{"from":3609.46,"to":3613.49,"location":2,"content":"and we're gonna calculate attention with respect to that vector,"},{"from":3613.49,"to":3620.95,"location":2,"content":"and we're going to use those attention scores, um, to, um,"},{"from":3620.95,"to":3624.25,"location":2,"content":"work out where to pay attention, um,"},{"from":3624.25,"to":3630.63,"location":2,"content":"in this BiLSTM, and then we just sort of train that vector so it gets values."},{"from":3630.63,"to":3634.27,"location":2,"content":"And so then we end up with a weighted sum of the time"},{"from":3634.27,"to":3639.43,"location":2,"content":"steps of that LSTM that uh, then form the question 
representation."},{"from":3639.43,"to":3642.37,"location":2,"content":"Um, second change, uh,"},{"from":3642.37,"to":3645.4,"location":2,"content":"the pictures only show a shallow BiLSTM but, you know,"},{"from":3645.4,"to":3648.94,"location":2,"content":"it turns out you can do better if you have a deep BiLSTM and say"},{"from":3648.94,"to":3653.01,"location":2,"content":"use a three-layer deep BiLSTM rather than a single layer."},{"from":3653.01,"to":3656.2,"location":2,"content":"Okay. Then the other changes in"},{"from":3656.2,"to":3662.35,"location":2,"content":"the passage representations and this part arguably gets a little bit more hacky,"},{"from":3662.35,"to":3666.52,"location":2,"content":"um, but there are things that you can do that make the numbers go up, I guess."},{"from":3666.52,"to":3667.81,"location":2,"content":"Um, okay."},{"from":3667.81,"to":3673.84,"location":2,"content":"So- so firstly for the representation of words rather than only using"},{"from":3673.84,"to":3678.07,"location":2,"content":"the GloVe representation that the input vectors are"},{"from":3678.07,"to":3684.05,"location":2,"content":"expanded so that- so a named entity recognizer and a part of speech tagger is run."},{"from":3684.05,"to":3688.61,"location":2,"content":"And since those are sort of small sets of values,"},{"from":3688.61,"to":3693.91,"location":2,"content":"that the output of those is just one-hot encoded and concatenated onto"},{"from":3693.91,"to":3696.49,"location":2,"content":"the word vector, so it represents if it's"},{"from":3696.49,"to":3700.2,"location":2,"content":"a location or a person name and whether it's a noun or a verb."},{"from":3700.2,"to":3704.08,"location":2,"content":"Um, word frequency proves to be a bit useful."},{"from":3704.08,"to":3712.16,"location":2,"content":"So there's your concatenating on sort of a representation of the word frequency as,"},{"from":3712.16,"to":3717.37,"location":2,"content":"um, just sort of a float of the unigram probability."},{"from":3717.37,"to":3725.34,"location":2,"content":"Um, and then this part is kind of key to getting some further advances which is, well,"},{"from":3725.34,"to":3731.14,"location":2,"content":"it turns out that we can do a better job by doing some sort"},{"from":3731.14,"to":3736.95,"location":2,"content":"of better understanding of the matching between the question and the passage."},{"from":3736.95,"to":3740.17,"location":2,"content":"And, um, this feature seems like it's"},{"from":3740.17,"to":3743.82,"location":2,"content":"very simple but turns out to actually give you quite a lot of value."},{"from":3743.82,"to":3748.42,"location":2,"content":"So you're simply saying for each word in the question,"},{"from":3748.42,"to":3752.22,"location":2,"content":"uh, so for each word- well, I said that wrong."},{"from":3752.22,"to":3755.92,"location":2,"content":"For each word in the passage,"},{"from":3755.92,"to":3759.04,"location":2,"content":"you were just saying, \"Does this word appear in the question?\""},{"from":3759.04,"to":3762.16,"location":2,"content":"And if so you're setting a one bit into"},{"from":3762.16,"to":3766.11,"location":2,"content":"the input and that's done in three different ways: exact match,"},{"from":3766.11,"to":3768.58,"location":2,"content":"uncased match, and lemma match."},{"from":3768.58,"to":3771.66,"location":2,"content":"So that means something like drive and driving, um,"},{"from":3771.66,"to":3773.59,"location":2,"content":"will match, and just that sort 
of"},{"from":3773.59,"to":3776.76,"location":2,"content":"indicator of here's where in the passage that's in the question."},{"from":3776.76,"to":3779.23,"location":2,"content":"In theory, the system should be able to work that out"},{"from":3779.23,"to":3783.11,"location":2,"content":"anyway that explicitly indicate and it gives quite a bit of value."},{"from":3783.11,"to":3789.31,"location":2,"content":"And then this last one does a sort of a softer version of that where it's using word"},{"from":3789.31,"to":3792.55,"location":2,"content":"embedding similarities to sort of calculate"},{"from":3792.55,"to":3796.21,"location":2,"content":"a kind of similarity between questions and answers,"},{"from":3796.21,"to":3799.34,"location":2,"content":"and that's a slightly complex equation that you can look up."},{"from":3799.34,"to":3806.03,"location":2,"content":"But effectively, um, that you're getting the embedding of words and the question answers."},{"from":3806.03,"to":3810.09,"location":2,"content":"Each of those, you're running through a single hidden layer,"},{"from":3810.09,"to":3811.59,"location":2,"content":"neural network, you know,"},{"from":3811.59,"to":3815.24,"location":2,"content":"dot producting it, and then putting all that through a Softmax,"},{"from":3815.24,"to":3821.04,"location":2,"content":"and that kind of gives you a sort of word similarity score and that helps as well."},{"from":3821.04,"to":3826.51,"location":2,"content":"Okay. So here's the kind of just overall picture this gives you."},{"from":3826.51,"to":3829.43,"location":2,"content":"So if you remember, um, um,"},{"from":3829.43,"to":3832.54,"location":2,"content":"there was the sort of the classical NLP"},{"from":3832.54,"to":3835.82,"location":2,"content":"with logistic regression baseline, there's around 51."},{"from":3835.82,"to":3838.81,"location":2,"content":"So for sort of a fairly simple model,"},{"from":3838.81,"to":3840.97,"location":2,"content":"like the Stanford Attentive Reader,"},{"from":3840.97,"to":3843.76,"location":2,"content":"it gives you an enormous boost in performance, right?"},{"from":3843.76,"to":3847.76,"location":2,"content":"That's giving you close to 30 percent performance gain."},{"from":3847.76,"to":3850.18,"location":2,"content":"And then, you know, from there,"},{"from":3850.18,"to":3853.42,"location":2,"content":"people have kept on pushing up neural systems."},{"from":3853.42,"to":3857.41,"location":2,"content":"But, you know, so this gives you kind of in some sense three quarters of"},{"from":3857.41,"to":3862.53,"location":2,"content":"the value over the traditional NLP system and in the much more,"},{"from":3862.53,"to":3866.08,"location":2,"content":"um, complex, um, neural systems that come after it."},{"from":3866.08,"to":3867.14,"location":2,"content":"Um, yeah."},{"from":3867.14,"to":3868.55,"location":2,"content":"In terms of error reduction,"},{"from":3868.55,"to":3871.78,"location":2,"content":"they're huge but it's sort of more like they're giving you the sort of,"},{"from":3871.78,"to":3875.31,"location":2,"content":"um, 12 percent after that."},{"from":3875.31,"to":3883.03,"location":2,"content":"Why did these systems work such a ton better um, than traditional systems?"},{"from":3883.03,"to":3886.75,"location":2,"content":"And so we actually did some error analysis of this and, you know,"},{"from":3886.75,"to":3892.18,"location":2,"content":"it turns out that most of their gains is because they can just 
do"},{"from":3892.18,"to":3896.89,"location":2,"content":"better semantic matching of word similarities"},{"from":3896.89,"to":3902.08,"location":2,"content":"or rephrasings that are semantically related but don't use the same words."},{"from":3902.08,"to":3910.68,"location":2,"content":"So, to- to the extent that the question is where was Christopher Manning born?"},{"from":3910.68,"to":3915.59,"location":2,"content":"And the sentence says Christopher Manning was born in Australia,"},{"from":3915.59,"to":3918.79,"location":2,"content":"a traditional NLP system would get that right too."},{"from":3918.79,"to":3921.57,"location":2,"content":"But that to the extent that you being able to get it right,"},{"from":3921.57,"to":3923.98,"location":2,"content":"depends on being able to match,"},{"from":3923.98,"to":3929.57,"location":2,"content":"sort of looser semantic matches so that we understand the sort of um,"},{"from":3929.57,"to":3933.61,"location":2,"content":"you know, the place of birth has to be matching was born or something."},{"from":3933.61,"to":3937.75,"location":2,"content":"That's where the neural systems actually do work much much better."},{"from":3937.75,"to":3944.95,"location":2,"content":"Okay. So, that's not the end of the story on question-answering systems."},{"from":3944.95,"to":3948.4,"location":2,"content":"And I wanted to say just a little bit about um,"},{"from":3948.4,"to":3951.67,"location":2,"content":"more complex systems to give you some idea um,"},{"from":3951.67,"to":3953.72,"location":2,"content":"of what goes on after that."},{"from":3953.72,"to":3956.26,"location":2,"content":"Um, but before I go further into that,"},{"from":3956.26,"to":3959.98,"location":2,"content":"are there any questions on uh,"},{"from":3959.98,"to":3963.13,"location":2,"content":"up until now, Stanford Attentive Reader?"},{"from":3963.13,"to":3969.76,"location":2,"content":"[NOISE] Yeah."},{"from":3969.76,"to":3972.93,"location":2,"content":"I have a question about attention in general."},{"from":3972.93,"to":3978.55,"location":2,"content":"Every example we've seen has been just linear mapping with a weight matrix."},{"from":3978.55,"to":3983.7,"location":2,"content":"Has anybody tried to convert that to a deep neural network and see what happens?"},{"from":3983.7,"to":3986.34,"location":2,"content":"Um, so yes they have."},{"from":3986.34,"to":3990.04,"location":2,"content":"Well, at least a shallow neural network."},{"from":3990.04,"to":3993.01,"location":2,"content":"Um, I'll actually show an example of that in just a minute."},{"from":3993.01,"to":3995.8,"location":2,"content":"So maybe I will um, save it till then."},{"from":3995.8,"to":3998.3,"location":2,"content":"But yeah absolutely, um,"},{"from":3998.3,"to":4005.03,"location":2,"content":"yeah people have done that and that can be a good thing to um, play with."},{"from":4005.03,"to":4012.06,"location":2,"content":"Anything else? Okay. 
Um, okay."},{"from":4012.06,"to":4017.97,"location":2,"content":"So, this is a picture of the BiDAF system,"},{"from":4017.97,"to":4020.73,"location":2,"content":"so this is the one from AI2 UDub."},{"from":4020.73,"to":4023.49,"location":2,"content":"And the BiDAF system is very well known."},{"from":4023.49,"to":4026.88,"location":2,"content":"Um, it's another sort of classic version of"},{"from":4026.88,"to":4031.14,"location":2,"content":"question-answering system that lots of people have used and built off."},{"from":4031.14,"to":4034.26,"location":2,"content":"Um, and, you know,"},{"from":4034.26,"to":4040.26,"location":2,"content":"some of it isn't completely different to what we saw before but it has various additions."},{"from":4040.26,"to":4043.98,"location":2,"content":"So, there are word embeddings just like we had before,"},{"from":4043.98,"to":4048.22,"location":2,"content":"there's a biLSTM running just like what we had before,"},{"from":4048.22,"to":4051.43,"location":2,"content":"and that's being done for both the um,"},{"from":4051.43,"to":4053.86,"location":2,"content":"passage and the question."},{"from":4053.86,"to":4057.21,"location":2,"content":"Um, but there are some different things that are happening as well."},{"from":4057.21,"to":4060.51,"location":2,"content":"So one of them is rather than just having word embeddings,"},{"from":4060.51,"to":4065.09,"location":2,"content":"it also processes the questions and passages at the character level."},{"from":4065.09,"to":4068.73,"location":2,"content":"And that's something that we're going to talk about coming up ahead in the class."},{"from":4068.73,"to":4074.2,"location":2,"content":"There's been a lot of work at doing character level processing in recent neural NLP,"},{"from":4074.2,"to":4076.36,"location":2,"content":"but I don't want to talk about that now."},{"from":4076.36,"to":4080.46,"location":2,"content":"Um, the main technical innovation of the BiDAF model"},{"from":4080.46,"to":4086.18,"location":2,"content":"is this attention flow layout because that's in its name bidirectional attention flow."},{"from":4086.18,"to":4090.3,"location":2,"content":"And so, there was a model of attention flow where you have attention"},{"from":4090.3,"to":4094.74,"location":2,"content":"flowing in both directions between the query and the passage."},{"from":4094.74,"to":4098.98,"location":2,"content":"And that was their main innovation and it was quite useful in their model."},{"from":4098.98,"to":4100.57,"location":2,"content":"Um, but beyond that,"},{"from":4100.57,"to":4103.5,"location":2,"content":"there's you know, sort of more stuff to this model."},{"from":4103.5,"to":4107.32,"location":2,"content":"So after the attention flow layer there's again"},{"from":4107.32,"to":4111.68,"location":2,"content":"multiple layers of bidirectional LSTMs running."},{"from":4111.68,"to":4115.77,"location":2,"content":"And then on top of that their output layer is more"},{"from":4115.77,"to":4121.23,"location":2,"content":"complex than the sort of simple attention version that I showed previously."},{"from":4121.23,"to":4125.15,"location":2,"content":"So let's just look at that in a bit more detail."},{"from":4125.15,"to":4127.94,"location":2,"content":"Um so, for the attention flow layer."},{"from":4127.94,"to":4133.9,"location":2,"content":"So, the motivation here was in the Stanford Attentive Reader,"},{"from":4133.9,"to":4137.46,"location":2,"content":"we used attention to map 
from"},{"from":4137.46,"to":4143.18,"location":2,"content":"the representation of the question onto the words of the passage."},{"from":4143.18,"to":4149.32,"location":2,"content":"But, you know so as questions are whole mapping onto the words of the passage."},{"from":4149.32,"to":4151.95,"location":2,"content":"Where their idea was well,"},{"from":4151.95,"to":4158.76,"location":2,"content":"presumably you could do better by mapping in both directions at the word level."},{"from":4158.76,"to":4163.89,"location":2,"content":"So you should be sort of finding passage words that you can map onto question words,"},{"from":4163.89,"to":4166.6,"location":2,"content":"and question words that you can map onto passage words."},{"from":4166.6,"to":4169.97,"location":2,"content":"And if you do that in both directions with attention flowing,"},{"from":4169.97,"to":4174.31,"location":2,"content":"and then run another round of sequence models on top of that,"},{"from":4174.31,"to":4178.53,"location":2,"content":"that you'll just be able to do much better matching between the two of them."},{"from":4178.53,"to":4182.94,"location":2,"content":"And so the way they do that is, um,"},{"from":4182.94,"to":4186.6,"location":2,"content":"that they- they've got the bottom- so at"},{"from":4186.6,"to":4190.8,"location":2,"content":"the bottom layers they've sort of run these two LSTMs."},{"from":4190.8,"to":4197.48,"location":2,"content":"So they have representations in the LSTM for each word and um,"},{"from":4197.48,"to":4200.48,"location":2,"content":"word and passage position."},{"from":4200.48,"to":4204.44,"location":2,"content":"And at this point I have to put it in a slight apology because I just"},{"from":4204.44,"to":4208.76,"location":2,"content":"stole the equations and so the letters that are used change."},{"from":4208.76,"to":4212.85,"location":2,"content":"Sorry. 
But, so these are the um,"},{"from":4212.85,"to":4218.51,"location":2,"content":"question individual words and these are the passage individual words."},{"from":4218.51,"to":4223.48,"location":2,"content":"And so, what they're then wanting to do is to say for each passage word,"},{"from":4223.48,"to":4228.1,"location":2,"content":"and each question word, I want to work out a similarity score."},{"from":4228.1,"to":4234.57,"location":2,"content":"And the way they work out that similarity score is they build a big concatenated vector."},{"from":4234.57,"to":4240.36,"location":2,"content":"So there's the LSTM representation of the passage word, the question word,"},{"from":4240.36,"to":4245.07,"location":2,"content":"and then they throw in a third thing where they do a Hadamard product,"},{"from":4245.07,"to":4249.85,"location":2,"content":"so an element-wise product of the question word and the context word."},{"from":4249.85,"to":4253.59,"location":2,"content":"Um, you know, for a neural net purist, throwing in"},{"from":4253.59,"to":4257.58,"location":2,"content":"these kind of Hadamard products is a little bit of a cheat because"},{"from":4257.58,"to":4261.18,"location":2,"content":"you kind of would hope that a neural net might just learn that"},{"from":4261.18,"to":4265.64,"location":2,"content":"this relation between the passage and the question was useful to look at."},{"from":4265.64,"to":4268.38,"location":2,"content":"But you can find a lot of models that put in"},{"from":4268.38,"to":4271.92,"location":2,"content":"these kind of Hadamard product because it's sort of"},{"from":4271.92,"to":4278.41,"location":2,"content":"a very easy way of sort of having a model that knows that matching is a good idea."},{"from":4278.41,"to":4284.79,"location":2,"content":"Because essentially this is sort of looking for each question and passage word pair."},{"from":4284.79,"to":4288.81,"location":2,"content":"You know, do the vectors look similar in various dimensions?"},{"from":4288.81,"to":4292.97,"location":2,"content":"You can sort of access very well from looking at that Hadamard product."},{"from":4292.97,"to":4295.81,"location":2,"content":"So that- so you take that big vector,"},{"from":4295.81,"to":4300.77,"location":2,"content":"and you then dot-product that with a learned weight matrix,"},{"from":4300.77,"to":4303.39,"location":2,"content":"and that gives you a similarity score"},{"from":4303.39,"to":4307.05,"location":2,"content":"between each position in the question and the context."},{"from":4307.05,"to":4310.4,"location":2,"content":"And so then what you're gonna do is use that to"},{"from":4310.4,"to":4315.32,"location":2,"content":"define attentions that go in both directions. 
Um-"},{"from":4315.32,"to":4318.99,"location":2,"content":"So for the, um, context,"},{"from":4318.99,"to":4322.41,"location":2,"content":"the question attention, this one's completely straightforward."},{"from":4322.41,"to":4328.55,"location":2,"content":"So, you put these similarity scores through a soft-max."},{"from":4328.55,"to":4333.52,"location":2,"content":"So for each of the i positions in the passage or sort of,"},{"from":4333.52,"to":4337.3,"location":2,"content":"having a softmax which is giving you a probability distribution,"},{"from":4337.3,"to":4340.38,"location":2,"content":"over question words and then you're coming up with"},{"from":4340.38,"to":4346.75,"location":2,"content":"a new representation of the i-th position which is then the attention weighted,"},{"from":4346.75,"to":4351.35,"location":2,"content":"um, version, the attention weighted average of those question words."},{"from":4351.35,"to":4352.76,"location":2,"content":"Um, so you're sort of,"},{"from":4352.76,"to":4358.77,"location":2,"content":"having attention weighted view of the question mapped onto each position in the passage."},{"from":4358.77,"to":4363.86,"location":2,"content":"Um, you then want to do something in the reverse direction."},{"from":4363.86,"to":4369.81,"location":2,"content":"Um, but the one in the reverse direction is done subtly differently."},{"from":4369.81,"to":4373.32,"location":2,"content":"So you're again starting off, um,"},{"from":4373.32,"to":4380.69,"location":2,"content":"with the- the same similarity scores but this time they're sort of wanting to, sort of,"},{"from":4380.69,"to":4384.88,"location":2,"content":"really assign which position,"},{"from":4384.88,"to":4392.12,"location":2,"content":"in which position in the question is the one that's, sort of,"},{"from":4392.12,"to":4396.98,"location":2,"content":"aligning the most so that they're finding a max and so that they're finding"},{"from":4396.98,"to":4402.55,"location":2,"content":"which is the most aligned one and so then for each of,"},{"from":4402.55,"to":4404.93,"location":2,"content":"for each of the i's,"},{"from":4404.93,"to":4407.89,"location":2,"content":"they're finding the most aligned question word."},{"from":4407.89,"to":4413.67,"location":2,"content":"And so then they're doing a softmax over these m scores and then those are being"},{"from":4413.67,"to":4419.9,"location":2,"content":"used to form a new representation of the passage by,"},{"from":4419.9,"to":4423.11,"location":2,"content":"sort of, summing over these attention weights."},{"from":4423.11,"to":4427.31,"location":2,"content":"Okay. 
So you build these things up and this then"},{"from":4427.31,"to":4431.33,"location":2,"content":"gives you a new representation where you have,"},{"from":4431.33,"to":4437.09,"location":2,"content":"um, your original representations of the passage words."},{"from":4437.09,"to":4440.12,"location":2,"content":"You have a new representation that you've built from"},{"from":4440.12,"to":4442.59,"location":2,"content":"this bidirectional attention flow and you"},{"from":4442.59,"to":4445.31,"location":2,"content":"look at these sort of Hadamard products of them and"},{"from":4445.31,"to":4450.11,"location":2,"content":"that then gives you kind of the output of the BiDAF layer and that output of"},{"from":4450.11,"to":4452.99,"location":2,"content":"the BiDAF layer is then what's sort of being fed as"},{"from":4452.99,"to":4458.35,"location":2,"content":"the input into the next sequence of LSTM layers."},{"from":4458.35,"to":4462.24,"location":2,"content":"Okay. Um, and so yeah,"},{"from":4462.24,"to":4464.34,"location":2,"content":"um, so then that's the modeling layer."},{"from":4464.34,"to":4469.09,"location":2,"content":"You have another two BiLSTM layers and so the way they do the,"},{"from":4469.09,"to":4472.4,"location":2,"content":"um, span selection is a bit more complex as well."},{"from":4472.4,"to":4475.62,"location":2,"content":"Um, so that they're then, um,"},{"from":4475.62,"to":4480.02,"location":2,"content":"sort of taking the output of the modeling layer and putting it through a sort of"},{"from":4480.02,"to":4485.91,"location":2,"content":"a dense feed-forward neural network layer and then softmaxing over that,"},{"from":4485.91,"to":4489.02,"location":2,"content":"um, and that's then getting a distribution over"},{"from":4489.02,"to":4493.43,"location":2,"content":"the start, and you're running yet another LSTM to get a distribution over the finish."},{"from":4493.43,"to":4498.02,"location":2,"content":"Um, yeah. 
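For completeness, a sketch of how those pieces could be composed into the per-position output of the BiDAF layer that feeds the modeling BiLSTMs. The published BiDAF model concatenates the passage states, the attended question vectors, and their Hadamard products as g_i = [c_i; a_i; c_i ∘ a_i; c_i ∘ c_pooled], but treat the exact recipe here as an illustration rather than the definitive one.

```python
import torch

def bidaf_layer_output(c, a, c_pooled):
    """c: passage states (batch, T, d); a: attended question vectors
    (batch, T, d); c_pooled: question-to-context vector (batch, d)."""
    cp = c_pooled.unsqueeze(1).expand_as(c)     # broadcast pooled vector to all T positions
    return torch.cat([c, a, c * a, c * cp], dim=-1)  # (batch, T, 4d)
```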
So, that gives you some idea of a more complex model."},{"from":4498.02,"to":4501.73,"location":2,"content":"Um, you know, in some sense,"},{"from":4501.73,"to":4505.9,"location":2,"content":"um, the summary if you go further forward than here is that, sort of,"},{"from":4505.9,"to":4508.84,"location":2,"content":"most of the work in the last couple of years,"},{"from":4508.84,"to":4514.22,"location":2,"content":"people have been producing progressively more complex architectures with"},{"from":4514.22,"to":4519.71,"location":2,"content":"lots of variants of attention and effectively that has been giving good gains."},{"from":4519.71,"to":4523.01,"location":2,"content":"Um, I think I'll skip since time is running,"},{"from":4523.01,"to":4525.23,"location":2,"content":"out, showing you that one."},{"from":4525.23,"to":4528.98,"location":2,"content":"But, um, let me just mention this FusionNet model"},{"from":4528.98,"to":4532.5,"location":2,"content":"which was done by people at Microsoft because this relates to the answer,"},{"from":4532.5,"to":4535.15,"location":2,"content":"the attention question, right?"},{"from":4535.15,"to":4540.74,"location":2,"content":"So p- so people have definitely used different versions of attention, right?"},{"from":4540.74,"to":4544.88,"location":2,"content":"So that in some of the stuff that we've shown we tend to emphasize"},{"from":4544.88,"to":4549.34,"location":2,"content":"this bi-linear attention where you've got two vectors mediated by a matrix."},{"from":4549.34,"to":4551.82,"location":2,"content":"And I guess traditionally at Stanford NLP,"},{"from":4551.82,"to":4553.46,"location":2,"content":"we've liked this, um,"},{"from":4553.46,"to":4556.46,"location":2,"content":"version of attention since it seems to very directly learn"},{"from":4556.46,"to":4560.69,"location":2,"content":"a similarity but other people have used a little neural net."},{"from":4560.69,"to":4563,"location":2,"content":"So this is, sort of, a shallow neural net to"},{"from":4563,"to":4565.34,"location":2,"content":"work out attention scores and there's, sort of,"},{"from":4565.34,"to":4567.74,"location":2,"content":"no reason why you couldn't say, maybe it would be even better if I"},{"from":4567.74,"to":4570.71,"location":2,"content":"make that a deep neural net and add another layer."},{"from":4570.71,"to":4572.47,"location":2,"content":"Um, and some of, you know,"},{"from":4572.47,"to":4574.92,"location":2,"content":"to be perfectly honest, um,"},{"from":4574.92,"to":4578.43,"location":2,"content":"some of the results that have been done by people including Google"},{"from":4578.43,"to":4582.52,"location":2,"content":"argue that actually that NLP version of attention is better."},{"from":4582.52,"to":4585.7,"location":2,"content":"Um, so there's something to explore in that direction."},{"from":4585.7,"to":4591.64,"location":2,"content":"But actually, um, the people in FusionNet didn't head that direction because they said,"},{"from":4591.64,"to":4594.71,"location":2,"content":"\"Look, we want to use tons and tons of attention."},{"from":4594.71,"to":4597.74,"location":2,"content":"So we want an attention computation that's pretty"},{"from":4597.74,"to":4601.16,"location":2,"content":"efficient and so it's bad news if you have to"},{"from":4601.16,"to":4604.11,"location":2,"content":"be evaluating a little dense neural net at"},{"from":4604.11,"to":4607.88,"location":2,"content":"every position every time that you do 
attention.\""},{"from":4607.88,"to":4611.63,"location":2,"content":"So this bi-linear form is fairly appealing"},{"from":4611.63,"to":4615.66,"location":2,"content":"but they then did some playing with it so rather than having a W matrix"},{"from":4615.66,"to":4619.7,"location":2,"content":"you can reduce the rank and complexity of"},{"from":4619.7,"to":4626.14,"location":2,"content":"your W matrix by dividing it into the product of two lower rank matrices."},{"from":4626.14,"to":4628.98,"location":2,"content":"So you can have a U and a V matrix."},{"from":4628.98,"to":4632.69,"location":2,"content":"And if you make these rectangular matrices that are kind of skinny,"},{"from":4632.69,"to":4636.45,"location":2,"content":"you can then have a sort of a lower rank factorization and,"},{"from":4636.45,"to":4638.42,"location":2,"content":"that seems a good idea."},{"from":4638.42,"to":4639.68,"location":2,"content":"And then they thought well,"},{"from":4639.68,"to":4643.27,"location":2,"content":"maybe really you want your attention distribution to be symmetric."},{"from":4643.27,"to":4646.46,"location":2,"content":"So we can actually put in the middle here,"},{"from":4646.46,"to":4649.1,"location":2,"content":"we can have the U and the V, so to speak,"},{"from":4649.1,"to":4652.16,"location":2,"content":"be the same and just have a diagonal matrix in"},{"from":4652.16,"to":4655.56,"location":2,"content":"the middle and that might be a useful way to think of it."},{"from":4655.56,"to":4659.56,"location":2,"content":"And that all makes sense from linear algebra terms but then they thought,"},{"from":4659.56,"to":4663.06,"location":2,"content":"\"Oh, non-linearity is really good in deep learning."},{"from":4663.06,"to":4664.64,"location":2,"content":"So why don't we, sort of,"},{"from":4664.64,"to":4668.79,"location":2,"content":"stick the left and right half through a ReLU and maybe that will help."},{"from":4668.79,"to":4672.38,"location":2,"content":"[LAUGHTER] Which doesn't so much make sense in your linear algebra terms, um,"},{"from":4672.38,"to":4676.85,"location":2,"content":"but that's actually what they ended up using as their, um, attention forms."},{"from":4676.85,"to":4680.15,"location":2,"content":"There are lots of things you can play with when doing your final project."},{"from":4680.15,"to":4682.09,"location":2,"content":"Um, yeah."},{"from":4682.09,"to":4684.74,"location":2,"content":"And, but, you know, their argument is still, you know,"},{"from":4684.74,"to":4687.92,"location":2,"content":"that doing attention this way is actually much much"},{"from":4687.92,"to":4691.07,"location":2,"content":"cheaper and so they can use a lot of attention."},{"from":4691.07,"to":4696.64,"location":2,"content":"And so they build this very complex tons of attention model, um,"},{"from":4696.64,"to":4699.15,"location":2,"content":"which I'm not going to try and explain, um,"},{"from":4699.15,"to":4701.56,"location":2,"content":"all of now, um,"},{"from":4701.56,"to":4704.75,"location":2,"content":"but I will show you this picture."},{"from":4704.75,"to":4708.3,"location":2,"content":"Um, so a point that they make is that a lot of"},{"from":4708.3,"to":4712.34,"location":2,"content":"the different models that people have explored in different years you,"},{"from":4712.34,"to":4713.91,"location":2,"content":"that, you know, they're sort of,"},{"from":4713.91,"to":4716.31,"location":2,"content":"doing different kinds of attention."},{"from":4716.31,"to":4719.18,"location":2,"content":"That you 
could be doing attention right,"},{"from":4719.18,"to":4722.24,"location":2,"content":"lining up with the original LSTM,"},{"from":4722.24,"to":4726.34,"location":2,"content":"you could run both sides through some stuff and do attention,"},{"from":4726.34,"to":4729.74,"location":2,"content":"you can do self attention inside your layer that there are a lot of"},{"from":4729.74,"to":4733.3,"location":2,"content":"different attentions that different models have explored."},{"from":4733.3,"to":4735.71,"location":2,"content":"And essentially what they are wanting to say is,"},{"from":4735.71,"to":4739.98,"location":2,"content":"let's do all of those and let's make it deep and do it all"},{"from":4739.98,"to":4744.21,"location":2,"content":"five times and the numbers will go up. And to some extent the answer is,"},{"from":4744.21,"to":4749.4,"location":2,"content":"yeah they do and the model ends up scoring very well."},{"from":4749.4,"to":4755.59,"location":2,"content":"Okay, um, so the one last thing I just wanted to mention but not explain is,"},{"from":4755.59,"to":4758.45,"location":2,"content":"I mean in the last year there's then been"},{"from":4758.45,"to":4762.95,"location":2,"content":"a further revolution in how well people can do these tasks."},{"from":4762.95,"to":4769.8,"location":2,"content":"And so people have developed algorithms which produce contextual word representation."},{"from":4769.8,"to":4772.79,"location":2,"content":"So that means that rather than a traditional word vector,"},{"from":4772.79,"to":4776.66,"location":2,"content":"you have a representation for each word in a particular context."},{"from":4776.66,"to":4781.7,"location":2,"content":"So here's the word frog in this particular context and the way people build"},{"from":4781.7,"to":4784.49,"location":2,"content":"those representations is using something"},{"from":4784.49,"to":4787.58,"location":2,"content":"like a language modeling tasks like Abby talked about,"},{"from":4787.58,"to":4790.73,"location":2,"content":"of saying putting probabilities of words in"},{"from":4790.73,"to":4794.8,"location":2,"content":"context to learn a context-specific word representation."},{"from":4794.8,"to":4797.87,"location":2,"content":"And ELMo was the first well-known such model."},{"from":4797.87,"to":4800.41,"location":2,"content":"And then people from Google came up with BERT,"},{"from":4800.41,"to":4801.83,"location":2,"content":"which worked even better."},{"from":4801.83,"to":4806.49,"location":2,"content":"Um, and so BERT is really in some sense is"},{"from":4806.49,"to":4811.23,"location":2,"content":"super complex attention Architecture doing a language modeling like objective."},{"from":4811.23,"to":4813.68,"location":2,"content":"We're going to talk about these later, um,"},{"from":4813.68,"to":4816.58,"location":2,"content":"I'm not going to talk about them now, um,"},{"from":4816.58,"to":4822.26,"location":2,"content":"but if you look at the current SQuAD 2.0 Leaderboard,"},{"from":4822.26,"to":4824.09,"location":2,"content":"um, you will quickly,"},{"from":4824.09,"to":4828.48,"location":2,"content":"um - sorry that's- oh I put the wrong slide and that was the bottom of the leaderboard."},{"from":4828.48,"to":4830.27,"location":2,"content":"Oops, slipped at the last minute."},{"from":4830.27,"to":4834.78,"location":2,"content":"If you go back to my slide which had the top of the leaderboard, um,"},{"from":4834.78,"to":4838.81,"location":2,"content":"you will have noticed that the top of the 
leaderboard,"},{"from":4838.81,"to":4842.82,"location":2,"content":"every single one of the top systems uses BERT."},{"from":4842.82,"to":4845.24,"location":2,"content":"So that's something that you may want to"},{"from":4845.24,"to":4847.82,"location":2,"content":"consider but you may want to consider how you could"},{"from":4847.82,"to":4852.8,"location":2,"content":"use it as a sub-module which you could add other stuff too as many of these systems do."},{"from":4852.8,"to":4856.14,"location":2,"content":"Okay. Done for today."}]} \ No newline at end of file diff --git a/bcc-en/11.bcc b/bcc-en/11.bcc new file mode 100644 index 0000000000000000000000000000000000000000..c5ed4c00cf7727624eb5befbc69f729802597f1f --- /dev/null +++ b/bcc-en/11.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.28,"to":7.62,"location":2,"content":"The plan for today is what I am gonna talk about"},{"from":7.62,"to":10.71,"location":2,"content":"is the topic of convolutional neural networks."},{"from":10.71,"to":13.92,"location":2,"content":"So essentially, um, there's actually quite a lot of"},{"from":13.92,"to":17.7,"location":2,"content":"content in this lecture of different things that's good to know about,"},{"from":17.7,"to":20.76,"location":2,"content":"since essentially this is going to be learn about"},{"from":20.76,"to":24.84,"location":2,"content":"convolutional neural networks in one large bite for NLP."},{"from":24.84,"to":27.46,"location":2,"content":"So, um, bit on announcements,"},{"from":27.46,"to":30.95,"location":2,"content":"explain the general idea of convolutional neural networks,"},{"from":30.95,"to":33.27,"location":2,"content":"and then for quite a bit of it,"},{"from":33.27,"to":38.49,"location":2,"content":"I want to go through in sort of some detail to particular papers that made"},{"from":38.49,"to":40.95,"location":2,"content":"use of convolutional neural networks for"},{"from":40.95,"to":44.23,"location":2,"content":"text classification, sentence classification tasks."},{"from":44.23,"to":47.04,"location":2,"content":"Um, the first is a sort of a pretty simple,"},{"from":47.04,"to":50.37,"location":2,"content":"um, CNN that was done in 2014,"},{"from":50.37,"to":52.37,"location":2,"content":"and then the second one is a"},{"from":52.37,"to":58.44,"location":2,"content":"way more complex CNN that was done much more recently in 2017."},{"from":58.44,"to":61.27,"location":2,"content":"Okay. But first, a couple of announcements."},{"from":61.27,"to":66.36,"location":2,"content":"Um, firstly, the last reminder on the mid-quarter feedback survey."},{"from":66.36,"to":68.7,"location":2,"content":"So tons of you have done the- this already."},{"from":68.7,"to":70.47,"location":2,"content":"Thank you, thank you very much."},{"from":70.47,"to":74.34,"location":2,"content":"Um, but if you'd still be putting it off till the very last minute, um,"},{"from":74.34,"to":77.33,"location":2,"content":"tonight at midnight is your last chance, um,"},{"from":77.33,"to":80.05,"location":2,"content":"to fill in the mid-quarter survey to get your,"},{"from":80.05,"to":83.28,"location":2,"content":"um, to give us feedback and to get your half-a-point."},{"from":83.28,"to":86.99,"location":2,"content":"Um, okay. 
And then the other thing that you should be thinking about,"},{"from":86.99,"to":89.51,"location":2,"content":"and I know lots of you are thinking about"},{"from":89.51,"to":92.5,"location":2,"content":"since I spent three hours talking to people yesterday,"},{"from":92.5,"to":95.19,"location":2,"content":"um, is about final projects."},{"from":95.19,"to":99.06,"location":2,"content":"Um, and so make sure you've got some plans for that, um,"},{"from":99.06,"to":100.72,"location":2,"content":"in place for, um,"},{"from":100.72,"to":104.64,"location":2,"content":"04:00 p.m., uh, 04:30 p.m. Thursday."},{"from":104.64,"to":107.53,"location":2,"content":"I mean, in particular as we've discussed, um,"},{"from":107.53,"to":112.75,"location":2,"content":"your- part of what you're meant to do this year is to have found some research paper,"},{"from":112.75,"to":117.73,"location":2,"content":"have read it, and have a summary and thoughts as to how it can inform your work."},{"from":117.73,"to":121.55,"location":2,"content":"Um, and then just make sure you have in your calendars, um,"},{"from":121.55,"to":125.73,"location":2,"content":"the final project poster session for CS224n,"},{"from":125.73,"to":129.32,"location":2,"content":"which is gonna be in the evening of Wednesday March 20th,"},{"from":129.32,"to":132.64,"location":2,"content":"and we're holding it at the Alumni Center."},{"from":132.64,"to":139.94,"location":2,"content":"Okay. Um, one more sort of announcement or just general stuff to cogitate on."},{"from":139.94,"to":143.06,"location":2,"content":"Um, so we're now officially in the second half of the class."},{"from":143.06,"to":144.54,"location":2,"content":"Congratulations."},{"from":144.54,"to":146.63,"location":2,"content":"Um, and, you know,"},{"from":146.63,"to":151.88,"location":2,"content":"there's sort of still a few things that we want to teach you that are sort of basic,"},{"from":151.88,"to":154.7,"location":2,"content":"and actually convolutional neural networks is one of them."},{"from":154.7,"to":159.95,"location":2,"content":"But, I mean, nevertheless in the second half of the class, I mean,"},{"from":159.95,"to":164.48,"location":2,"content":"things start to change and we're hoping to much more, um,"},{"from":164.48,"to":169.97,"location":2,"content":"prepare you for being real deep learning NLP researchers or practitioners."},{"from":169.97,"to":172.4,"location":2,"content":"And so what does that mean concretely?"},{"from":172.4,"to":175.75,"location":2,"content":"Well, the lectures start to be less"},{"from":175.75,"to":179.66,"location":2,"content":"giving every detail of how to build a very basic thing,"},{"from":179.66,"to":182.63,"location":2,"content":"and more giving you some ideas"},{"from":182.63,"to":185.88,"location":2,"content":"about sort of some of the work that's been done in different areas."},{"from":185.88,"to":188.51,"location":2,"content":"And so to the extent that there's something of interest or"},{"from":188.51,"to":191.38,"location":2,"content":"rele- relevant to a project or things like that,"},{"from":191.38,"to":194.36,"location":2,"content":"um, the hope is that you can take some initiative to"},{"from":194.36,"to":197.91,"location":2,"content":"find out more about some of the things that are being talked about."},{"from":197.91,"to":202.1,"location":2,"content":"Um, I'd also really welcome any questions about things that people,"},{"from":202.1,"to":204.44,"location":2,"content":"um, would want to know more 
about."},{"from":204.44,"to":206.42,"location":2,"content":"And the other thing that you should know about"},{"from":206.42,"to":210.44,"location":2,"content":"deep learning is that once we get past the fundamentals,"},{"from":210.44,"to":213.35,"location":2,"content":"a lot of the stuff we teach just isn't"},{"from":213.35,"to":218.12,"location":2,"content":"really known science or things that people are sure of;"},{"from":218.12,"to":221.87,"location":2,"content":"you know, most of what I'm teaching in the second half of the course is pretty"},{"from":221.87,"to":226.18,"location":2,"content":"much what people think is good practice in 2019."},{"from":226.18,"to":229.37,"location":2,"content":"But, you know, the fact of the matter is what people think is"},{"from":229.37,"to":233.39,"location":2,"content":"good practice in deep learning has been changing really rapidly."},{"from":233.39,"to":238.33,"location":2,"content":"So if you go back even two years or definitely if you go back four years, right?"},{"from":238.33,"to":241.64,"location":2,"content":"There's just a lot of different things that people used to believe,"},{"from":241.64,"to":244.85,"location":2,"content":"and now people have some different ideas as to what works best."},{"from":244.85,"to":249.53,"location":2,"content":"And it's perfectly clear that come 2021 or 2023,"},{"from":249.53,"to":252.35,"location":2,"content":"there will be some different ideas again as to what,"},{"from":252.35,"to":254.09,"location":2,"content":"um, people think is best."},{"from":254.09,"to":257.75,"location":2,"content":"So you sort of just have to accept that this is, um,"},{"from":257.75,"to":260.63,"location":2,"content":"a nascent, rapidly emerging field"},{"from":260.63,"to":264.13,"location":2,"content":"and it's good to understand the fundamentals and how things fit together."},{"from":264.13,"to":267.74,"location":2,"content":"But after that, quite a bit of the knowledge is: this is what people"},{"from":267.74,"to":271.28,"location":2,"content":"think is good at the moment and it keeps evolving over time."},{"from":271.28,"to":274.75,"location":2,"content":"And if you want to stay in the field, or doing things with deep learning,"},{"from":274.75,"to":277.5,"location":2,"content":"you kind of still have to keep up with how it changes."},{"from":277.5,"to":279.71,"location":2,"content":"It's called lifelong learning these days."},{"from":279.71,"to":281.81,"location":2,"content":"It's a very trendy concept."},{"from":281.81,"to":285.2,"location":2,"content":"Um, and so as well as the lectures,"},{"from":285.2,"to":289.74,"location":2,"content":"this is also true for the assignments."},{"from":289.74,"to":291.72,"location":2,"content":"Um, and, you know,"},{"from":291.72,"to":297.05,"location":2,"content":"we've been trying to make the assignments so that they started off very introductory,"},{"from":297.05,"to":301.34,"location":2,"content":"and gradually started to use less scaffolding,"},{"from":301.34,"to":303.39,"location":2,"content":"and we're going to hope to, um,"},{"from":303.39,"to":310.53,"location":2,"content":"continue that, um, with sort of less hand-holding in assignment five."},{"from":310.53,"to":313.91,"location":2,"content":"And, you know, I guess what we're hoping to do is prepare you"},{"from":313.91,"to":317.5,"location":2,"content":"both for the final project and for real life."},{"from":317.5,"to":321,"location":2,"content":"I guess I was making an analogy this 
morning,"},{"from":321,"to":325.37,"location":2,"content":"um, comparing this to the sort of intro CS sequence,"},{"from":325.37,"to":329.13,"location":2,"content":"so when there's CS106A and B that have tons of scaffolding,"},{"from":329.13,"to":331.02,"location":2,"content":"and then in CS107,"},{"from":331.02,"to":334.85,"location":2,"content":"you're meant to learn how to diagnose and solve problems"},{"from":334.85,"to":338.91,"location":2,"content":"for yourself in a debugger. That is kind of the same,"},{"from":338.91,"to":341.01,"location":2,"content":"um, for neural networks that, you know,"},{"from":341.01,"to":343.77,"location":2,"content":"for the early assignments, um, you know,"},{"from":343.77,"to":346.61,"location":2,"content":"we've given you every bit of hand-holding here, all of"},{"from":346.61,"to":349.49,"location":2,"content":"these tests to make sure every little bit of it is okay,"},{"from":349.49,"to":351.81,"location":2,"content":"and here's exactly how to structure things."},{"from":351.81,"to":354.31,"location":2,"content":"But, you know, in the real world,"},{"from":354.31,"to":357.69,"location":2,"content":"um, you're only going to be able to build and use neural networks"},{"from":357.69,"to":360.26,"location":2,"content":"if you can figure out why they're not working"},{"from":360.26,"to":362.99,"location":2,"content":"and what you have to change to make them work."},{"from":362.99,"to":366.79,"location":2,"content":"And, you know, the truth is, as I talked a bit about last week, you know,"},{"from":366.79,"to":371.21,"location":2,"content":"that's often well more than half of the job. It seems easy enough to stick down:"},{"from":371.21,"to":374.27,"location":2,"content":"here's my neural net and the pieces that make sense to me,"},{"from":374.27,"to":377.66,"location":2,"content":"and then you can spend the remaining 80 percent of the time"},{"from":377.66,"to":381.23,"location":2,"content":"scratching your head wondering why it doesn't actually work well,"},{"from":381.23,"to":384.33,"location":2,"content":"and how you could change it to make it work well."},{"from":384.33,"to":389.81,"location":2,"content":"Um, so, um, I confess that debugging neural nets can often be hard, but, you know,"},{"from":389.81,"to":394.19,"location":2,"content":"the goal is that you should actually learn something about doing it,"},{"from":394.19,"to":398.6,"location":2,"content":"and that's kind of one of the learning goals of the course when it comes down to it."},{"from":398.6,"to":401.05,"location":2,"content":"Um, final little advertisement."},{"from":401.05,"to":403.37,"location":2,"content":"If you feel like you'd like to read a book,"},{"from":403.37,"to":405.15,"location":2,"content":"um, just out this week,"},{"from":405.15,"to":408.31,"location":2,"content":"there's a new book on natural language processing with PyTorch"},{"from":408.31,"to":411.57,"location":2,"content":"by Delip Rao and Brian McMahan."},{"from":411.57,"to":413.99,"location":2,"content":"Delip actually lives in San Francisco."},{"from":413.99,"to":416.66,"location":2,"content":"Um, so, um, if you want to,"},{"from":416.66,"to":418.31,"location":2,"content":"you can buy a copy of this, of course."},{"from":418.31,"to":420.23,"location":2,"content":"But if you don't want to, um,"},{"from":420.23,"to":423.23,"location":2,"content":"buy it and you feel like having a bit of a look through it, um,"},{"from":423.23,"to":429.11,"location":2,"content":"the Stanford library actually has a license 
to the O'Reilly Safari Books collection."},{"from":429.11,"to":434.94,"location":2,"content":"So you can start off at library.stanford.edu and read it for free."},{"from":434.94,"to":438.23,"location":2,"content":"There's one catch to this which is the library only has"},{"from":438.23,"to":441.71,"location":2,"content":"16 simultaneous licenses to Safari Books."},{"from":441.71,"to":445.45,"location":2,"content":"So if you'd also like your classmates to be able to read it for free,"},{"from":445.45,"to":449.94,"location":2,"content":"it really helps if you remember to log out of Safari Books Online,"},{"from":449.94,"to":452.27,"location":2,"content":"um, when you're done looking at it."},{"from":452.27,"to":454.79,"location":2,"content":"Um, yes, so this is sort of a,"},{"from":454.79,"to":456.42,"location":2,"content":"I mean, in some sense,"},{"from":456.42,"to":459.02,"location":2,"content":"I hope you will feel, if you look at this book,"},{"from":459.02,"to":461.61,"location":2,"content":"\"Boy, I already know most of that stuff already.\""},{"from":461.61,"to":463.74,"location":2,"content":"It's not a super advanced book."},{"from":463.74,"to":469.78,"location":2,"content":"But it's a good, well-written tutorial of how to do things with PyTorch and NLP."},{"from":469.78,"to":472.62,"location":2,"content":"If you don't feel like you know most of the stuff in this book,"},{"from":472.62,"to":476.25,"location":2,"content":"you can let me know but I will be a little sad."},{"from":476.25,"to":481.03,"location":2,"content":"Um, okay, um, yeah."},{"from":481.03,"to":483.76,"location":2,"content":"So, let- so, starting into today."},{"from":483.76,"to":486.43,"location":2,"content":"Um, so, we spent a lot of time on"},{"from":486.43,"to":490.63,"location":2,"content":"recurrent neural networks and they are great for many things."},{"from":490.63,"to":495.67,"location":2,"content":"Um, but there's sort of some things that they're not so good at."},{"from":495.67,"to":501.28,"location":2,"content":"So, you know, we kind of might like to know about a phrase like 'my birth',"},{"from":501.28,"to":503.8,"location":2,"content":"or a bigger phrase like 'of my birth',"},{"from":503.8,"to":507.55,"location":2,"content":"and there's sort of no independent, um,"},{"from":507.55,"to":511.48,"location":2,"content":"representation of those spans in a recurrent neural network."},{"from":511.48,"to":515.37,"location":2,"content":"We kind of get sort of prefixes of a whole sentence."},{"from":515.37,"to":518.82,"location":2,"content":"And while we did, um, bidirectional, um,"},{"from":518.82,"to":522.1,"location":2,"content":"recurrent neural networks, and you could say, 'Well,"},{"from":522.1,"to":525.67,"location":2,"content":"wait a minute, you could use it in both directions,' and to some extent that's true."},{"from":525.67,"to":529.12,"location":2,"content":"We can get stuff from this direction and stuff from this direction,"},{"from":529.12,"to":531.26,"location":2,"content":"but we still kind of have sort of"},{"from":531.26,"to":534.73,"location":2,"content":"whole sequences that go to one end of the sentence or another."},{"from":534.73,"to":537.79,"location":2,"content":"We don't just have pieces of sentences."},{"from":537.79,"to":543.6,"location":2,"content":"And often, we'd like to sort of work out meanings of pieces of sentences,"},{"from":543.6,"to":546.28,"location":2,"content":"and so, we sort of have two problems here."},{"from":546.28,"to":549.84,"location":2,"content":"We only have sort of 
initial and final sub-sequences."},{"from":549.84,"to":554.23,"location":2,"content":"And also, if you look at these representations, like if you say,"},{"from":554.23,"to":558.82,"location":2,"content":"take this last state as the representation of the meaning of this text."},{"from":558.82,"to":560.08,"location":2,"content":"What you find out,"},{"from":560.08,"to":562.36,"location":2,"content":"is it's very dominated by the meaning of"},{"from":562.36,"to":567.64,"location":2,"content":"the most recent words and what they are trying to predict as to what comes after them,"},{"from":567.64,"to":570.09,"location":2,"content":"and that's part of the reason why I mentioned"},{"from":570.09,"to":573.28,"location":2,"content":"last time in the question answering, um, lecture,"},{"from":573.28,"to":577.06,"location":2,"content":"the idea that, well, you can do better by having a sentinel and training"},{"from":577.06,"to":581.75,"location":2,"content":"something that has attention over the whole, um, LSTM structure."},{"from":581.75,"to":584.56,"location":2,"content":"Okay. But today we're going to look at"},{"from":584.56,"to":588.57,"location":2,"content":"a different alternative which is convolutional neural nets,"},{"from":588.57,"to":593.49,"location":2,"content":"which are often abbreviated as either CNNs or ConvNets."},{"from":593.49,"to":597.38,"location":2,"content":"Um, and the idea of these is, well,"},{"from":597.38,"to":599.92,"location":2,"content":"look, maybe we could just take"},{"from":599.92,"to":606.91,"location":2,"content":"every sub-sequence of a certain length and calculate a representation for it, um,"},{"from":606.91,"to":610.09,"location":2,"content":"so that, you know, if we have some piece of text like,"},{"from":610.09,"to":612.68,"location":2,"content":"'tentative deal reached to keep government open',"},{"from":612.68,"to":614.32,"location":2,"content":"and we could sort of just say, well,"},{"from":614.32,"to":617.11,"location":2,"content":"let's just take all three-word sequences,"},{"from":617.11,"to":619.76,"location":2,"content":"'tentative deal reached', 'deal reached to',"},{"from":619.76,"to":621.38,"location":2,"content":"'reached to keep', et cetera,"},{"from":621.38,"to":626.47,"location":2,"content":"and we're going to calculate some kind of representation for each of those sequences."},{"from":626.47,"to":630.25,"location":2,"content":"So, this is an- this isn't a strongly linguistic idea."},{"from":630.25,"to":633.43,"location":2,"content":"Right? We're not worrying about whether it's a coherent phrase,"},{"from":633.43,"to":636.31,"location":2,"content":"that's grammatical, linguistically valid,"},{"from":636.31,"to":641.13,"location":2,"content":"cognitively plausible, we're just taking every sub-sequence of a certain length."},{"from":641.13,"to":645.37,"location":2,"content":"And then, once we've calculated representations of those,"},{"from":645.37,"to":648.02,"location":2,"content":"we're going to look at how to group them."},{"from":648.02,"to":655.9,"location":2,"content":"Okay. 
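As a quick sketch of that windowing idea (my own toy code, not from the lecture), extracting every length-3 sub-sequence of the example sentence:

```python
# Sketch: every length-3 sub-sequence (window) of the example sentence.
sentence = "tentative deal reached to keep government open".split()
windows = [sentence[i:i + 3] for i in range(len(sentence) - 3 + 1)]
for w in windows:
    print(w)
# ['tentative', 'deal', 'reached'], ['deal', 'reached', 'to'], ... 5 windows in all
```

A seven-word sentence yields five windows, which is exactly the shrinkage the lecture returns to when it brings in padding.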
So, let's get into more detail as to what CNNs are and how they work."},{"from":655.9,"to":661.9,"location":2,"content":"Um, yeah, so, there's this general idea of a convolution which you may or may"},{"from":661.9,"to":667.86,"location":2,"content":"not have seen in some math or electrical engineering class."},{"from":667.86,"to":672.01,"location":2,"content":"And then, there's the particular version of convolutions,"},{"from":672.01,"to":675.31,"location":2,"content":"the discrete convolutions, which means that"},{"from":675.31,"to":678.91,"location":2,"content":"you can use the friendly summation symbol rather than an integral."},{"from":678.91,"to":680.9,"location":2,"content":"Um, and that's a,"},{"from":680.9,"to":682.48,"location":2,"content":"that's a discrete convolution."},{"from":682.48,"to":685.5,"location":2,"content":"I find that notation completely unhelpful."},{"from":685.5,"to":687.04,"location":2,"content":"So, I won't even try and explain it."},{"from":687.04,"to":688.69,"location":2,"content":"But I've got lots of examples,"},{"from":688.69,"to":694.08,"location":2,"content":"and what convolutions do in neural nets is really easy to see from examples."},{"from":694.08,"to":698.59,"location":2,"content":"All right, so the classic case of where convolutional neural networks are used,"},{"from":698.59,"to":700.27,"location":2,"content":"is in vision applications."},{"from":700.27,"to":704.61,"location":2,"content":"So, if you do CS231N next quarter,"},{"from":704.61,"to":707.77,"location":2,"content":"essentially, you know, the first four weeks is just all doing"},{"from":707.77,"to":711.72,"location":2,"content":"convolutional neural networks in all their variants and glory."},{"from":711.72,"to":715.54,"location":2,"content":"Um, and the sort of essential idea of, um,"},{"from":715.54,"to":717.89,"location":2,"content":"convolutions for vision,"},{"from":717.89,"to":722.41,"location":2,"content":"is that you want to recognize things no matter where they appear in an image."},{"from":722.41,"to":725.62,"location":2,"content":"So, you have a sort of property of translation invariance,"},{"from":725.62,"to":728.23,"location":2,"content":"and the idea of a convolution is a way"},{"from":728.23,"to":730.81,"location":2,"content":"of finding something in different places in the image,"},{"from":730.81,"to":732.67,"location":2,"content":"regardless of where it appears."},{"from":732.67,"to":739.36,"location":2,"content":"Um, so this is the vision example which I stole from Andrew Ng's UFLDL website."},{"from":739.36,"to":741.92,"location":2,"content":"And so, what a convolution is,"},{"from":741.92,"to":744.13,"location":2,"content":"is it's, here, a patch,"},{"from":744.13,"to":746.82,"location":2,"content":"but you can think of it just as a vector,"},{"from":746.82,"to":751.45,"location":2,"content":"and the patch has weights which are these little numbers in red,"},{"from":751.45,"to":753,"location":2,"content":"and what you're gonna do,"},{"from":753,"to":760.35,"location":2,"content":"is slide that patch over the image as this animation does."},{"from":760.35,"to":763.08,"location":2,"content":"Um, and so at each position,"},{"from":763.08,"to":767.9,"location":2,"content":"you're going to multiply each of the red numbers by the black number in that position,"},{"from":767.9,"to":770.08,"location":2,"content":"and then you're just going to sum them up."},{"from":770.08,"to":773.25,"location":2,"content":"So, that's what a discrete 
convolution does,"},{"from":773.25,"to":775.18,"location":2,"content":"which is what that notation at the top is saying,"},{"from":775.18,"to":778.5,"location":2,"content":"right? You're multiplying things together and then you're summing them up,"},{"from":778.5,"to":780.24,"location":2,"content":"and so you're doing this,"},{"from":780.24,"to":784.24,"location":2,"content":"and then you're filling in the pink with the products,"},{"from":784.24,"to":785.71,"location":2,"content":"um, the sum-products."},{"from":785.71,"to":787.86,"location":2,"content":"So, it's sort of like, you're taking these sort of"},{"from":787.86,"to":792.4,"location":2,"content":"patch dot products and putting them into the pink matrix,"},{"from":792.4,"to":794.82,"location":2,"content":"and that's then your convolved feature."},{"from":794.82,"to":797.35,"location":2,"content":"So, that's a 2D convolution,"},{"from":797.35,"to":798.76,"location":2,"content":"which for the rest of today,"},{"from":798.76,"to":800.47,"location":2,"content":"we're not going to look at anymore."},{"from":800.47,"to":803.22,"location":2,"content":"So, this is all you're learning about vision."},{"from":803.22,"to":808.39,"location":2,"content":"Um, and so we're now going to go back and look at 1D convolutions,"},{"from":808.39,"to":813,"location":2,"content":"which is what people use when they're using convolutional neural networks for text."},{"from":813,"to":816.61,"location":2,"content":"So, the starting point of a convolutional neural network for text,"},{"from":816.61,"to":818.41,"location":2,"content":"is we have an input."},{"from":818.41,"to":822.19,"location":2,"content":"So, here's my sentence and for each word"},{"from":822.19,"to":825.97,"location":2,"content":"in the sentence I have here got a dense word vector,"},{"from":825.97,"to":831.33,"location":2,"content":"I made it 4D, I want to keep it small in my example, but usually, as you know, it's more."},{"from":831.33,"to":834.58,"location":2,"content":"So, our starting point is we have some input, you know,"},{"from":834.58,"to":838.06,"location":2,"content":"input could just be a one-hot encoding, that's not forbidden here,"},{"from":838.06,"to":841.79,"location":2,"content":"but normally we'll have these kind of dense word vectors."},{"from":841.79,"to":846.31,"location":2,"content":"And so, then it's sort of the same as the 2D one,"},{"from":846.31,"to":848.18,"location":2,"content":"apart from we've only got one dimension."},{"from":848.18,"to":850.51,"location":2,"content":"So, we have a filter."},{"from":850.51,"to":854.41,"location":2,"content":"Um, so here is our filter,"},{"from":854.41,"to":861.75,"location":2,"content":"and so our filter is gonna do three steps in time, three words."},{"from":861.75,"to":865.93,"location":2,"content":"And that's going to work across the dimensions."},{"from":865.93,"to":868.24,"location":2,"content":"So, these different dimensions in"},{"from":868.24,"to":872.5,"location":2,"content":"the convolutional neural network often get referred to as channels."},{"from":872.5,"to":875.66,"location":2,"content":"So, we're kind of working across the input channels,"},{"from":875.66,"to":877.99,"location":2,"content":"and so we have a patch like this."},{"from":877.99,"to":885.43,"location":2,"content":"And we're going to take this patch and put it on top of the first three words."},{"from":885.43,"to":887.98,"location":2,"content":"I don't have as good an animation as the previous 
slide."},{"from":887.98,"to":891.61,"location":2,"content":"Sorry. And we're going to work out the dot product,"},{"from":891.61,"to":896.41,"location":2,"content":"um, between those, and I did that at home by putting this into Excel."},{"from":896.41,"to":898.01,"location":2,"content":"And the answer [LAUGHTER] to that,"},{"from":898.01,"to":901.25,"location":2,"content":"is that the dot product is minus 1.0."},{"from":901.25,"to":905.5,"location":2,"content":"And then at that point, we slide our,"},{"from":905.5,"to":908.35,"location":2,"content":"we slide this, um,"},{"from":908.35,"to":911.41,"location":2,"content":"matrix which gets referred to as a kernel or"},{"from":911.41,"to":916.3,"location":2,"content":"a filter which is the patch that we're using for our convolutional neural network."},{"from":916.3,"to":921.52,"location":2,"content":"We slide it down one and do the dot product of those terms again."},{"from":921.52,"to":928.96,"location":2,"content":"And that comes out as minus a half and we keep on sliding that down and we get,"},{"from":928.96,"to":933.1,"location":2,"content":"um, what's shown on the right as our output."},{"from":933.1,"to":934.26,"location":2,"content":"So at this point,"},{"from":934.26,"to":936.69,"location":2,"content":"we've just reduced the sentence,"},{"from":936.69,"to":939.11,"location":2,"content":"um, to a single vector."},{"from":939.11,"to":944.74,"location":2,"content":"Um, and it seems like we might want to do more than that."},{"from":944.74,"to":948.46,"location":2,"content":"Um, but the other thing that you will have noticed is that"},{"from":948.46,"to":952.5,"location":2,"content":"our sentence has sort of shrunk because before, you know,"},{"from":952.5,"to":957.71,"location":2,"content":"we had a seven-word sentence but because I've just sort of slid this three-word,"},{"from":957.71,"to":959.62,"location":2,"content":"um, kernel down here,"},{"from":959.62,"to":963.01,"location":2,"content":"I ended up with only five positions to put it in."},{"from":963.01,"to":965.83,"location":2,"content":"So it's become a five-word thing."},{"from":965.83,"to":968.96,"location":2,"content":"Um, so to first of all address that problem,"},{"from":968.96,"to":974.03,"location":2,"content":"commonly when people do convolutional neural networks, they add padding."},{"from":974.03,"to":978.79,"location":2,"content":"Um, so what I can do is I can add zero padding at"},{"from":978.79,"to":985.8,"location":2,"content":"both ends and then sort of do the same trick and say, run a convolution on that."},{"from":985.8,"to":991.36,"location":2,"content":"And now, I'll be able to put my size three filter in seven different places as I"},{"from":991.36,"to":997.84,"location":2,"content":"slide it down and so I'm getting out a vector that's the same length as my input."},{"from":997.84,"to":1000.65,"location":2,"content":"Um, that, you know, there are different ways,"},{"from":1000.65,"to":1003.2,"location":2,"content":"so this is the most common way of doing things."},{"from":1003.2,"to":1006.76,"location":2,"content":"And it kind of seems logical because it maintains size."},{"from":1006.76,"to":1010.46,"location":2,"content":"I mean, you know, there's always more than one way to do it."},{"from":1010.46,"to":1012.31,"location":2,"content":"Um, if you really wanted to,"},{"from":1012.31,"to":1014.39,"location":2,"content":"you, oops, I don't want you, yeah,"},{"from":1014.39,"to":1019.56,"location":2,"content":"there, oops, I made, 
uh,"},{"from":1019.56,"to":1025.86,"location":2,"content":"I made a slight mistake on my slide because this"},{"from":1025.86,"to":1028.4,"location":2,"content":"turns out which I was about to get to in a minute"},{"from":1028.4,"to":1032.79,"location":2,"content":"but I'll just explain this bit here anyway [LAUGHTER]."},{"from":1032.79,"to":1035.45,"location":2,"content":"Um, you know, if you wanted to,"},{"from":1035.45,"to":1039.74,"location":2,"content":"you could have two steps of padding on both ends here."},{"from":1039.74,"to":1044.29,"location":2,"content":"So that your first convolution will be looking at zero, zero,"},{"from":1044.29,"to":1050.59,"location":2,"content":"tentative, and then the convolution would actually grow the size of your input."},{"from":1050.59,"to":1055.91,"location":2,"content":"Yeah. But, yes. So I mean,"},{"from":1055.91,"to":1058.57,"location":2,"content":"so what we've done so far,"},{"from":1058.57,"to":1061.38,"location":2,"content":"we've started with these word vectors which in"},{"from":1061.38,"to":1066.34,"location":2,"content":"convolutional neural network terms were of length four."},{"from":1066.34,"to":1069.47,"location":2,"content":"So our kind of input had four channels."},{"from":1069.47,"to":1073.03,"location":2,"content":"But when we were back here, um,"},{"from":1073.03,"to":1076.52,"location":2,"content":"we were just producing from this, um,"},{"from":1076.52,"to":1079.69,"location":2,"content":"kernel, one column of output."},{"from":1079.69,"to":1082.56,"location":2,"content":"So our output has only a single channel."},{"from":1082.56,"to":1088.69,"location":2,"content":"So we've sort of shrunk things in the columns direction from four to one."},{"from":1088.69,"to":1091.49,"location":2,"content":"And that might seem bad."},{"from":1091.49,"to":1094.11,"location":2,"content":"And for many purposes, it is bad."},{"from":1094.11,"to":1096.71,"location":2,"content":"Um, and so, a lot of the time,"},{"from":1096.71,"to":1101.16,"location":2,"content":"what you want to do is to say,"},{"from":1101.16,"to":1105.33,"location":2,"content":"well, rather than have only one filter,"},{"from":1105.33,"to":1109.26,"location":2,"content":"instead of that, why don't I have several filters?"},{"from":1109.26,"to":1112.68,"location":2,"content":"So here I've got three different filters and each of"},{"from":1112.68,"to":1116.62,"location":2,"content":"these filters is just sort of the same size:"},{"from":1116.62,"to":1121.83,"location":2,"content":"three, the kernel size, times the input"},{"from":1121.83,"to":1126.14,"location":2,"content":"number of channels, for the matrix."},{"from":1126.14,"to":1129.55,"location":2,"content":"So I have three different filters and I'm going to run"},{"from":1129.55,"to":1133.38,"location":2,"content":"each one down the text and get a column here."},{"from":1133.38,"to":1136.51,"location":2,"content":"So now, I'm ending up with three columns of output."},{"from":1136.51,"to":1139.67,"location":2,"content":"And so I have this sort of three-channel output."},{"from":1139.67,"to":1144.94,"location":2,"content":"And the way to intuitively think of this is for these filters,"},{"from":1144.94,"to":1147.51,"location":2,"content":"well, you know, for what we do in neural networks,"},{"from":1147.51,"to":1151.04,"location":2,"content":"we're going to learn them by backpropagation like everything else."},{"from":1151.04,"to":1156.76,"location":2,"content":"But our hope is that these filters could 
somehow specialize in different things."},{"from":1156.76,"to":1160.48,"location":2,"content":"So maybe this filter could specialize on,"},{"from":1160.48,"to":1162.36,"location":2,"content":"is this language polite?"},{"from":1162.36,"to":1166.72,"location":2,"content":"And it will produce a high value whenever it sees polite words."},{"from":1166.72,"to":1169.85,"location":2,"content":"And maybe, um, this, um,"},{"from":1169.85,"to":1175.61,"location":2,"content":"filter could specialize on, I don't know,"},{"from":1175.61,"to":1178.8,"location":2,"content":"eating and it will have a high value whenever it sees words"},{"from":1178.8,"to":1182.43,"location":2,"content":"about food, and, you know, this filter will do a third thing."},{"from":1182.43,"to":1189.23,"location":2,"content":"And so that's the sense in which people sometimes talk about, um, the, um,"},{"from":1189.23,"to":1193.08,"location":2,"content":"what you're getting as output is different features because your hope is that"},{"from":1193.08,"to":1197.52,"location":2,"content":"you'll kind of gain different latent features coming out of the text."},{"from":1197.52,"to":1202.56,"location":2,"content":"Okay. So that gives us a representation and that's sort of"},{"from":1202.56,"to":1207.54,"location":2,"content":"a useful thing, sort of having found learned features in our text."},{"from":1207.54,"to":1211.29,"location":2,"content":"But quite often though, what we'll want to do is just"},{"from":1211.29,"to":1215.61,"location":2,"content":"summarize the text with re- with respect to those features."},{"from":1215.61,"to":1218.03,"location":2,"content":"So you might just have the question of, well,"},{"from":1218.03,"to":1220.05,"location":2,"content":"in this piece of text, um,"},{"from":1220.05,"to":1223.43,"location":2,"content":"is it polite and does it talk about food?"},{"from":1223.43,"to":1226.56,"location":2,"content":"So another operation that we'll quite often"},{"from":1226.56,"to":1230.41,"location":2,"content":"do is to summarize the output of a convolutional network."},{"from":1230.41,"to":1232.75,"location":2,"content":"And the simplest way to do that,"},{"from":1232.75,"to":1235.11,"location":2,"content":"for 1D convolutions,"},{"from":1235.11,"to":1237.63,"location":2,"content":"is called max pooling over time."},{"from":1237.63,"to":1240.08,"location":2,"content":"So if we max pool over time,"},{"from":1240.08,"to":1243.93,"location":2,"content":"then for each of the channels, otherwise known as features,"},{"from":1243.93,"to":1253.87,"location":2,"content":"we're just simply going to look down and see what is its maximum value, 0.3, 1.6, 1.4."},{"from":1253.87,"to":1255.78,"location":2,"content":"Um, and so, you know,"},{"from":1255.78,"to":1258.73,"location":2,"content":"if I use my story about the first two, um,"},{"from":1258.73,"to":1260.7,"location":2,"content":"filters, it's sort of saying, well,"},{"from":1260.7,"to":1264.6,"location":2,"content":"it's not very polite text but it's really about food, right?"},{"from":1264.6,"to":1266.3,"location":2,"content":"That we're sort of summarizing,"},{"from":1266.3,"to":1268.46,"location":2,"content":"um, what we've detected there."},{"from":1268.46,"to":1274.4,"location":2,"content":"Um, so the concept of max pooling in some sense captures,"},{"from":1274.4,"to":1278.64,"location":2,"content":"does, is this thing being activated anywhere, right?"},{"from":1278.64,"to":1282.18,"location":2,"content":"So if we have things like politeness and about 
food,"},{"from":1282.18,"to":1285.51,"location":2,"content":"the output of max pooling will have a high value"},{"from":1285.51,"to":1288.6,"location":2,"content":"if somewhere in the sentence there was a clear marker of"},{"from":1288.6,"to":1292.04,"location":2,"content":"politeness or something clearly about food."},{"from":1292.04,"to":1297.21,"location":2,"content":"And that's often a useful notion because often what you want to know is,"},{"from":1297.21,"to":1302.26,"location":2,"content":"you know, is there some discussion of food in this sentence or is there not?"},{"from":1302.26,"to":1306.15,"location":2,"content":"There's another thing- there are other things that you could do."},{"from":1306.15,"to":1308.63,"location":2,"content":"Instead of, ah, max pooling,"},{"from":1308.63,"to":1311.21,"location":2,"content":"you can instead do average pooling."},{"from":1311.21,"to":1315.4,"location":2,"content":"So here you just take these numbers and find the average of them."},{"from":1315.4,"to":1318.91,"location":2,"content":"That then has different semantics, which is sort of"},{"from":1318.91,"to":1322.6,"location":2,"content":"what's the average amount of politeness of this, um,"},{"from":1322.6,"to":1325.86,"location":2,"content":"text or on average how much, you know, how,"},{"from":1325.86,"to":1330.27,"location":2,"content":"what percent of the sentence is about food or something like that."},{"from":1330.27,"to":1332.19,"location":2,"content":"Um, for some purposes,"},{"from":1332.19,"to":1333.68,"location":2,"content":"this is better because, you know,"},{"from":1333.68,"to":1336.96,"location":2,"content":"it takes in all of the input; everything builds into the average."},{"from":1336.96,"to":1338.9,"location":2,"content":"I mean, a lot of the time,"},{"from":1338.9,"to":1342.89,"location":2,"content":"people have found that actually max pooling is better because,"},{"from":1342.89,"to":1347.49,"location":2,"content":"you know, a lot of signals in natural language are sparse."},{"from":1347.49,"to":1350.63,"location":2,"content":"You know, no matter how polite you are trying to be,"},{"from":1350.63,"to":1352.94,"location":2,"content":"you're not going to be being polite in every word."},{"from":1352.94,"to":1357.43,"location":2,"content":"You're going to say nouns and articles like 'that' and 'a',"},{"from":1357.43,"to":1360.39,"location":2,"content":"and prepositions and conjunctions,"},{"from":1360.39,"to":1362.63,"location":2,"content":"none of which are inherently polite, right?"},{"from":1362.63,"to":1366.33,"location":2,"content":"Um, so that if there's some politeness showing up prominently,"},{"from":1366.33,"to":1371.47,"location":2,"content":"then the sentence becomes polite and max pooling is actually better for capturing that."},{"from":1371.47,"to":1374.43,"location":2,"content":"Um, of course the one other kind of thing that you can do is"},{"from":1374.43,"to":1378.12,"location":2,"content":"min pooling and find the least [LAUGHTER] active thing."},{"from":1378.12,"to":1381.13,"location":2,"content":"Um, it doesn't get used much but you could do that as well."},{"from":1381.13,"to":1384.38,"location":2,"content":"Okay. 
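A tiny numeric sketch of those pooling choices, with made-up channel outputs (the "politeness"/"food" values here are invented for illustration):

```python
# Sketch: max, average, and min pooling over time for made-up feature values.
import torch

# rows = positions in the sentence, columns = channels/features
conv_out = torch.tensor([[ 0.3,  1.6, -0.4],
                         [-1.0,  0.2,  1.4],
                         [ 0.1,  1.1, -0.7]])

max_pooled = conv_out.max(dim=0).values  # "was this feature activated anywhere?"
avg_pooled = conv_out.mean(dim=0)        # "how much on average?"
min_pooled = conv_out.min(dim=0).values  # rarely used
print(max_pooled, avg_pooled, min_pooled)
```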
So, um, so if you're in PyTorch,"},{"from":1384.38,"to":1387.37,"location":2,"content":"this is all pretty easy stuff to do."},{"from":1387.37,"to":1390.01,"location":2,"content":"So there's a handy dandy Conv1d."},{"from":1390.01,"to":1393.03,"location":2,"content":"There's also a Conv2d as you might guess for vision."},{"from":1393.03,"to":1395.01,"location":2,"content":"But there's a Conv1d, um,"},{"from":1395.01,"to":1398.79,"location":2,"content":"where you're specifying how many input channels there are."},{"from":1398.79,"to":1400.72,"location":2,"content":"That was our word embedding size."},{"from":1400.72,"to":1402.73,"location":2,"content":"How many output channels there are?"},{"from":1402.73,"to":1404.37,"location":2,"content":"We have three."},{"from":1404.37,"to":1407.82,"location":2,"content":"What the size of the convolutional kernel is?"},{"from":1407.82,"to":1409.53,"location":2,"content":"So the ones that we were showing were also"},{"from":1409.53,"to":1412.38,"location":2,"content":"three and then there are various other parameters you can have."},{"from":1412.38,"to":1415.99,"location":2,"content":"Like you can say that you want a padding of one and things like that."},{"from":1415.99,"to":1418.08,"location":2,"content":"And then once you've got one of those,"},{"from":1418.08,"to":1419.69,"location":2,"content":"you can just sort of run"},{"from":1419.69,"to":1424.36,"location":2,"content":"your convolutional filter on the input to get a new hidden state."},{"from":1424.36,"to":1426.22,"location":2,"content":"And then if you wanna max pool,"},{"from":1426.22,"to":1427.57,"location":2,"content":"you can just max,"},{"from":1427.57,"to":1431.75,"location":2,"content":"um, through the output of that and then you've got a max pooled output."},{"from":1431.75,"to":1438.87,"location":2,"content":"Okay. So that gives us the basics of building a kind of a convolutional neural network,"},{"from":1438.87,"to":1441.15,"location":2,"content":"um, for, um, NLP."},{"from":1441.15,"to":1446,"location":2,"content":"Does that sort of make sense up until there?"},{"from":1446,"to":1450.57,"location":2,"content":"Yeah. Okay. 
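Putting that together, a minimal runnable version of the recipe just described: Conv1d with 4 input channels (the word embedding size), 3 output channels, kernel size 3, padding 1, followed by max pooling over time. The random input stands in for the seven word vectors:

```python
# Sketch of the Conv1d + max-pool-over-time recipe.
import torch
import torch.nn as nn

batch, emb_dim, seq_len = 1, 4, 7          # 7 words, 4-dim word vectors
x = torch.randn(batch, emb_dim, seq_len)   # Conv1d wants (batch, channels, length)

conv = nn.Conv1d(in_channels=4, out_channels=3, kernel_size=3, padding=1)
h = conv(x)                                # (1, 3, 7): padding keeps the length
pooled = h.max(dim=2).values               # max pooling over time -> (1, 3)
print(h.shape, pooled.shape)
```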
So the next bit is to sort of show"},{"from":1450.57,"to":1455.27,"location":2,"content":"you three or four other things that you can do."},{"from":1455.27,"to":1458.33,"location":2,"content":"Um, I started off titling these slides"},{"from":1458.33,"to":1460.92,"location":2,"content":"'other less useful notions' because I"},{"from":1460.92,"to":1463.59,"location":2,"content":"kinda thought, oh, at least they don't really come up much in NLP."},{"from":1463.59,"to":1468.09,"location":2,"content":"But, you know, actually it turned out when I got on to that second paper,"},{"from":1468.09,"to":1472.74,"location":2,"content":"when I say the complex convolutional neural network, actually,"},{"from":1472.74,"to":1477.75,"location":2,"content":"in that paper they try out just about all of these things that I say no one uses."},{"from":1477.75,"to":1482.14,"location":2,"content":"So it's sort of good to know what they are for looking at various papers."},{"from":1482.14,"to":1489.8,"location":2,"content":"So here, when we did things so far, we were calculating these convolutions,"},{"from":1489.8,"to":1492.66,"location":2,"content":"we were sort of trying them out at every position."},{"from":1492.66,"to":1495.29,"location":2,"content":"So we had one for zero, tentative, deal."},{"from":1495.29,"to":1498.42,"location":2,"content":"Then for tentative, deal, reached, then deal, reached, to."},{"from":1498.42,"to":1500.97,"location":2,"content":"And so we were just walking down one step at"},{"from":1500.97,"to":1504.77,"location":2,"content":"a time, which is referred to as a stride of one."},{"from":1504.77,"to":1508.1,"location":2,"content":"And that's by far the most common thing to do."},{"from":1508.1,"to":1509.6,"location":2,"content":"But you could observe,"},{"from":1509.6,"to":1510.83,"location":2,"content":"look, wait a minute,"},{"from":1510.83,"to":1515.65,"location":2,"content":"since the first convolution concerns zero, tentative, deal,"},{"from":1515.65,"to":1518.09,"location":2,"content":"I've got all those three words in there."},{"from":1518.09,"to":1525.22,"location":2,"content":"Even if I skip down two and next did deal, reached, to, and then I did to, keep, government,"},{"from":1525.22,"to":1530.46,"location":2,"content":"I'd still have in one or other of the convolutions every word of the sentence"},{"from":1530.46,"to":1532.95,"location":2,"content":"so I can do half as much computation and I've"},{"from":1532.95,"to":1535.63,"location":2,"content":"still got everything in there in some sense."},{"from":1535.63,"to":1538.42,"location":2,"content":"And so that's referred to as using a stride of two."},{"from":1538.42,"to":1542.13,"location":2,"content":"And so then I get something with half as many rows out."},{"from":1542.13,"to":1546.84,"location":2,"content":"So it's one way to sort of compactify your representation and produce"},{"from":1546.84,"to":1552.86,"location":2,"content":"something shorter from a longer sentence and we'll see a use of it coming up later."},{"from":1552.86,"to":1559.89,"location":2,"content":"There are other ways to compactify the representation that comes out of your sentence."},{"from":1559.89,"to":1565.71,"location":2,"content":"And so there's a different notion of pooling which is local pooling."},{"from":1565.71,"to":1569.64,"location":2,"content":"Now, if you've seen any of"},{"from":1569.64,"to":1573.51,"location":2,"content":"the vision world when people talk about max pooling and 
vision,"},{"from":1573.51,"to":1576.96,"location":2,"content":"they normally mean local pooling as opposed to"},{"from":1576.96,"to":1581.4,"location":2,"content":"the max pooling through time that I showed you first."},{"from":1581.4,"to":1587.07,"location":2,"content":"So here we're sort of back to where we started and we've done"},{"from":1587.07,"to":1593.54,"location":2,"content":"our size-three, stride-one convolution, which is producing output as before."},{"from":1593.54,"to":1599.31,"location":2,"content":"But now, what I'm gonna do is local pool with a stride of two."},{"from":1599.31,"to":1604.65,"location":2,"content":"Which means I'm gonna take each two rows and I'm gonna pool them together into"},{"from":1604.65,"to":1607.11,"location":2,"content":"one row and I could do that again by"},{"from":1607.11,"to":1610.68,"location":2,"content":"either maxing or averaging or whatever appeals to me."},{"from":1610.68,"to":1613.2,"location":2,"content":"So I take the first two rows,"},{"from":1613.2,"to":1614.97,"location":2,"content":"I max pool them, I get this."},{"from":1614.97,"to":1616.8,"location":2,"content":"I take the next two rows,"},{"from":1616.8,"to":1618.56,"location":2,"content":"I max pool them, I get this."},{"from":1618.56,"to":1621.42,"location":2,"content":"Next two, next two and I sort of pad it"},{"from":1621.42,"to":1624.29,"location":2,"content":"on the bottom so I have two rows at the bottom."},{"from":1624.29,"to":1629.41,"location":2,"content":"And so that then gives me a local max pooling with a stride of two."},{"from":1629.41,"to":1633.3,"location":2,"content":"And that sort of has exactly the same effect, in a sense, but"},{"from":1633.3,"to":1636.99,"location":2,"content":"with a different result, as using a stride of two in"},{"from":1636.99,"to":1640.53,"location":2,"content":"my convolution because I have again reduced it to"},{"from":1640.53,"to":1646.97,"location":2,"content":"something of four rows that used to be eight rows."},{"from":1646.97,"to":1649.93,"location":2,"content":"Yeah, picture that."},{"from":1649.93,"to":1653.64,"location":2,"content":"Okay so that's that one."},{"from":1653.64,"to":1655.41,"location":2,"content":"What else can you do?"},{"from":1655.41,"to":1658.08,"location":2,"content":"There are more things you can do to make it complex."},{"from":1658.08,"to":1663.77,"location":2,"content":"Another thing that people have sometimes done is k-max pooling."},{"from":1663.77,"to":1669.51,"location":2,"content":"And so this is a more complex thing and it's sort of saying, well,"},{"from":1669.51,"to":1673.53,"location":2,"content":"rather than just keeping the max over time,"},{"from":1673.53,"to":1680.33,"location":2,"content":"if a feature is being kind of activated two or three times in the sentence,"},{"from":1680.33,"to":1683.64,"location":2,"content":"maybe it'd be good to record all the times that it's"},{"from":1683.64,"to":1687.38,"location":2,"content":"activated in the sentence while throwing away the rest."},{"from":1687.38,"to":1689.07,"location":2,"content":"So in k-max pooling,"},{"from":1689.07,"to":1690.87,"location":2,"content":"and I'm doing 2-max here,"},{"from":1690.87,"to":1697.34,"location":2,"content":"you look down this column and you find the two highest values for that column."},{"from":1697.34,"to":1703.66,"location":2,"content":"But then you put the two highest values not in the order of highest to lowest,"},{"from":1703.66,"to":1706.62,"location":2,"content":"but in the order in which they are in these 
columns."},{"from":1706.62,"to":1708.84,"location":2,"content":"So it's minus 0.2,"},{"from":1708.84,"to":1712.23,"location":2,"content":"0.3 for this one and it's 1.6,"},{"from":1712.23,"to":1718.07,"location":2,"content":"0.6 for this one because it reflects the order of the columns up above."},{"from":1718.07,"to":1723.21,"location":2,"content":"Okay. Almost done, one more concept."},{"from":1723.21,"to":1732.29,"location":2,"content":"This is another way of compressing data which is a dilated convolution."},{"from":1732.29,"to":1735.32,"location":2,"content":"So if you have a dilated convolution,"},{"from":1735.32,"to":1741.87,"location":2,"content":"so a dilated convolution, doing it over here, doesn't really make sense, but where you can use"},{"from":1741.87,"to":1748.44,"location":2,"content":"a dilated convolution is if I take this and put it through another convolutional layer,"},{"from":1748.44,"to":1753.54,"location":2,"content":"we can kind of have deep convolutional networks that have multiple convolutional layers."},{"from":1753.54,"to":1760.56,"location":2,"content":"So the idea of a dilated convolution is you're gonna skip some of the rows."},{"from":1760.56,"to":1764.3,"location":2,"content":"So if you use a dilation of two starting at the top,"},{"from":1764.3,"to":1767.46,"location":2,"content":"you're going to take the first, third,"},{"from":1767.46,"to":1771.87,"location":2,"content":"and the fifth row and multiply them by my fil- sorry,"},{"from":1771.87,"to":1772.98,"location":2,"content":"I have different filters."},{"from":1772.98,"to":1778.31,"location":2,"content":"Multiply them by my filters and then get the values that appear here."},{"from":1778.31,"to":1780.48,"location":2,"content":"And then if stride is one,"},{"from":1780.48,"to":1786.9,"location":2,"content":"you would go on and sort of do the next spread-out rows."},{"from":1786.9,"to":1791.03,"location":2,"content":"And so this allows you to have convolutions that see"},{"from":1791.03,"to":1796.68,"location":2,"content":"a bigger spread of the sentence without having many parameters."},{"from":1796.68,"to":1799.07,"location":2,"content":"So you don't have to do things this way."},{"from":1799.07,"to":1800.67,"location":2,"content":"You could have said, look,"},{"from":1800.67,"to":1807.02,"location":2,"content":"I could just instead have convolutions with a kernel size of five."},{"from":1807.02,"to":1808.47,"location":2,"content":"And then they'd"},{"from":1808.47,"to":1811.5,"location":2,"content":"see five words in a row but then I'd be having"},{"from":1811.5,"to":1817.23,"location":2,"content":"sort of bigger matrices to specify my feature."},{"from":1817.23,"to":1820.77,"location":2,"content":"Whereas, this way I can keep the matrices small but still"},{"from":1820.77,"to":1825.11,"location":2,"content":"see a bigger range of the sentence in one operation."},{"from":1825.11,"to":1830.67,"location":2,"content":"Yeah, and that concept of how much of a sentence you"},{"from":1830.67,"to":1836.49,"location":2,"content":"see is kind of an important notion in convolutional neural networks."},{"from":1836.49,"to":1839.94,"location":2,"content":"Because, you know, if you start at the beginning of a sentence"},{"from":1839.94,"to":1843.78,"location":2,"content":"and you're just running three-by-three convolutions, um,"},{"from":1843.78,"to":1847.99,"location":2,"content":"you're sort of seeing these three-word patches of the 
sentence."},{"from":1847.99,"to":1850.35,"location":2,"content":"And it turns out in natural language that's"},{"from":1850.35,"to":1853.31,"location":2,"content":"already actually quite a useful representation."},{"from":1853.31,"to":1856.92,"location":2,"content":"Because sort of having those kinds of n-grams as features is"},{"from":1856.92,"to":1861.16,"location":2,"content":"just good for many purposes including text classification."},{"from":1861.16,"to":1865.68,"location":2,"content":"But if you want to sort of understand more of the semantics of a sentence,"},{"from":1865.68,"to":1868.58,"location":2,"content":"somehow you wanna see more of that at once."},{"from":1868.58,"to":1873.78,"location":2,"content":"And you've sort of got several tools you can use to see more of it at once,"},{"from":1873.78,"to":1875.73,"location":2,"content":"you can use bigger filters,"},{"from":1875.73,"to":1876.87,"location":2,"content":"you could use, uh,"},{"from":1876.87,"to":1878.46,"location":2,"content":"kernel size five, seven,"},{"from":1878.46,"to":1880.65,"location":2,"content":"nine or something convolutions."},{"from":1880.65,"to":1885.59,"location":2,"content":"You could do something like dilated convolutions so you can see spread-out pieces."},{"from":1885.59,"to":1888.12,"location":2,"content":"And the third thing that you can do is you"},{"from":1888.12,"to":1890.84,"location":2,"content":"can have depth of a convolutional neural network."},{"from":1890.84,"to":1895.61,"location":2,"content":"Because as you have greater depth of a convolutional neural network, you see more."},{"from":1895.61,"to":1897.69,"location":2,"content":"So at this first layer,"},{"from":1897.69,"to":1903.15,"location":2,"content":"the rows now have sort of info about three words in them."},{"from":1903.15,"to":1906.63,"location":2,"content":"And if you sort of just stuck a second layer of"},{"from":1906.63,"to":1908.28,"location":2,"content":"convolutional neural network with"},{"from":1908.28,"to":1911.67,"location":2,"content":"the same general nature on top of it and you sort of take"},{"from":1911.67,"to":1915.45,"location":2,"content":"the first three rows and convolve them again, and"},{"from":1915.45,"to":1920.94,"location":2,"content":"then the next ones, then those know about five words of your original input sentence."},{"from":1920.94,"to":1923.7,"location":2,"content":"So as you kind of have a deeper ConvNet stack you"},{"from":1923.7,"to":1927.49,"location":2,"content":"start to know about bigger and bigger patches of the sentence."},{"from":1927.49,"to":1929.97,"location":2,"content":"Okay. All good?"},{"from":1929.97,"to":1934.76,"location":2,"content":"Any questions?"},{"from":1934.76,"to":1942.9,"location":2,"content":"No, that's good, okay. 
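Here is one sketch of the four variants just discussed, stride 2, local max pooling, 2-max pooling, and dilation 2, on a toy (batch 1, 4-channel, length-7) input; the shapes in the comments are the point, the numbers are random:

```python
# Sketch: stride, local pooling, k-max pooling, and dilation in PyTorch.
import torch
import torch.nn as nn

x = torch.randn(1, 4, 7)

# Stride 2: roughly half as many output positions.
strided = nn.Conv1d(4, 3, kernel_size=3, stride=2, padding=1)(x)   # (1, 3, 4)

# Local max pooling with stride 2 after an ordinary stride-1 convolution;
# ceil_mode pads the end, like padding the bottom rows in the lecture.
h = nn.Conv1d(4, 3, kernel_size=3, padding=1)(x)                   # (1, 3, 7)
local = nn.MaxPool1d(kernel_size=2, stride=2, ceil_mode=True)(h)   # (1, 3, 4)

# k-max pooling (k=2): top-2 values per channel, kept in sentence order.
vals, idx = h.topk(2, dim=2)
kmax = h.gather(2, idx.sort(dim=2).values)                         # (1, 3, 2)

# Dilated convolution: kernel size 3 with dilation 2 spans 5 positions.
dilated = nn.Conv1d(4, 3, kernel_size=3, dilation=2)(x)            # (1, 3, 3)
print(strided.shape, local.shape, kmax.shape, dilated.shape)
```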
So, um, the next piece essentially shows you this stuff again,"},{"from":1942.9,"to":1946.56,"location":2,"content":"um, in the context of a particular paper."},{"from":1946.56,"to":1947.85,"location":2,"content":"So this was, um,"},{"from":1947.85,"to":1952.13,"location":2,"content":"a paper by Yoon Kim who was a Harvard student,"},{"from":1952.13,"to":1956.46,"location":2,"content":"maybe still is a Harvard student, um, in 2014."},{"from":1956.46,"to":1959.79,"location":2,"content":"So this was sort of a fairly early paper."},{"from":1959.79,"to":1965.52,"location":2,"content":"Um, and he wanted to show that you could use convolutional neural networks to do"},{"from":1965.52,"to":1967.5,"location":2,"content":"a good job of doing"},{"from":1967.5,"to":1972.24,"location":2,"content":"text classification when what you want to classify is a single sentence."},{"from":1972.24,"to":1975.75,"location":2,"content":"So, the kind of thing you might want to do is look at the kind of"},{"from":1975.75,"to":1980.4,"location":2,"content":"snippets of movie reviews that you see on the Rotten Tomatoes site and say,"},{"from":1980.4,"to":1984.9,"location":2,"content":"\"Is this a positive or is this a negative sentence description?\""},{"from":1984.9,"to":1988.15,"location":2,"content":"And the model he built is actually kind of similar"},{"from":1988.15,"to":1991.69,"location":2,"content":"to the convolutional neural networks that Collobert and Weston,"},{"from":1991.69,"to":1994.98,"location":2,"content":"um, introduced in their 2011 paper that we"},{"from":1994.98,"to":1998.1,"location":2,"content":"mentioned before when we were talking about window-based classifiers."},{"from":1998.1,"to":2000.5,"location":2,"content":"So, in their paper they actually used"},{"from":2000.5,"to":2005.6,"location":2,"content":"both window-based classifiers and the convolutional classifier."},{"from":2005.6,"to":2008.57,"location":2,"content":"Okay. Um, so yeah,"},{"from":2008.57,"to":2009.8,"location":2,"content":"I sort of already said this."},{"from":2009.8,"to":2014.21,"location":2,"content":"So the tasks are sentence classification; it could be sentiment."},{"from":2014.21,"to":2015.88,"location":2,"content":"It could be other things like,"},{"from":2015.88,"to":2019.1,"location":2,"content":"is this sentence subjective or objective?"},{"from":2019.1,"to":2022.04,"location":2,"content":"So objective is what the main news articles are meant"},{"from":2022.04,"to":2025.3,"location":2,"content":"to be and subjective is what the opinion pieces are meant to be."},{"from":2025.3,"to":2028.97,"location":2,"content":"Um, and then other things like question classification."},{"from":2028.97,"to":2031.22,"location":2,"content":"Is this a question asking about a person,"},{"from":2031.22,"to":2033.2,"location":2,"content":"location, number, or whatever?"},{"from":2033.2,"to":2037.4,"location":2,"content":"Okay, so here is what he did."},{"from":2037.4,"to":2041.49,"location":2,"content":"And it's sort of the- these slides sort of, um,"},{"from":2041.49,"to":2046.88,"location":2,"content":"use the notation of his paper which is sort of a little bit different in the"},{"from":2046.88,"to":2049.31,"location":2,"content":"way the math gets written down to what I just showed"},{"from":2049.31,"to":2052.16,"location":2,"content":"you, but it's really doing exactly the same thing."},{"from":2052.16,"to":2056.93,"location":2,"content":"So we start with word vectors of length k. 
Um,"},{"from":2056.93,"to":2064.61,"location":2,"content":"the sentence is made by just concatenating all of those word vectors together and then,"},{"from":2064.61,"to":2067.28,"location":2,"content":"when we- so we have a range of words,"},{"from":2067.28,"to":2070.19,"location":2,"content":"it's a subpart of that sentence vector."},{"from":2070.19,"to":2076.31,"location":2,"content":"And so, the convolutional filter is just being represented as a vector because"},{"from":2076.31,"to":2082.1,"location":2,"content":"here he's flattened everything out into one long vector for the entire sentence,"},{"from":2082.1,"to":2084.51,"location":2,"content":"whereas I'd sort of stepped into a matrix."},{"from":2084.51,"to":2091.07,"location":2,"content":"Um, so a size three convolution is just a real vector of length hk,"},{"from":2091.07,"to":2096.35,"location":2,"content":"the size of the convolutional filter times the dimensionality of the words."},{"from":2096.35,"to":2101.21,"location":2,"content":"Um, and so, what he's gonna do to build"},{"from":2101.21,"to":2107.45,"location":2,"content":"his text classifier is use convolutions made out of different sizes."},{"from":2107.45,"to":2110.76,"location":2,"content":"So you can have size two convolutions,"},{"from":2110.76,"to":2116,"location":2,"content":"size three convolutions as shown here, and bigger convolutions."},{"from":2116,"to":2123.14,"location":2,"content":"And so, um, so to compute a feature one channel for our CNN, we're"},{"from":2123.14,"to":2126.62,"location":2,"content":"then doing a dot product between the weight vector of"},{"from":2126.62,"to":2130.41,"location":2,"content":"the feature times this sub-sequence of the same terms,"},{"from":2130.41,"to":2135.03,"location":2,"content":"and he sort of also put in a bias which I sort of omitted."},{"from":2135.03,"to":2141.11,"location":2,"content":"Um, and then putting it through a non-linearity,"},{"from":2141.11,"to":2143.39,"location":2,"content":"um, which I wasn't doing either."},{"from":2143.39,"to":2146.05,"location":2,"content":"Um, but as sort of we've seen a ton of."},{"from":2146.05,"to":2149.81,"location":2,"content":"Um, and so, what we're wanting to do is that's our,"},{"from":2149.81,"to":2153.41,"location":2,"content":"um, feature and we want to, um,"},{"from":2153.41,"to":2158.15,"location":2,"content":"do it through all this- for a feature of kernel size three,"},{"from":2158.15,"to":2160.88,"location":2,"content":"we're gonna go all the way through the sentence."},{"from":2160.88,"to":2164.74,"location":2,"content":"The other thing he did though was slightly funnel funny is,"},{"from":2164.74,"to":2168.92,"location":2,"content":"his windows were sort of lopsided in the notation, right."},{"from":2168.92,"to":2171.7,"location":2,"content":"There's a word and th- the,"},{"from":2171.7,"to":2175.36,"location":2,"content":"um, h minus 1 words to the right of it."},{"from":2175.36,"to":2180.09,"location":2,"content":"So he has padding here just on the right end whereas"},{"from":2180.09,"to":2185.81,"location":2,"content":"most people do their convolutions symmetrically in both directions around things."},{"from":2185.81,"to":2191.63,"location":2,"content":"Okay. 
And so, we're going to do that for a bunch of features or"},{"from":2191.63,"to":2194.48,"location":2,"content":"channels Ci and therefore compute"},{"from":2194.48,"to":2198.68,"location":2,"content":"our convolved representations just as we've talked about."},{"from":2198.68,"to":2203.43,"location":2,"content":"Okay. Um, then he does just what we talked about."},{"from":2203.43,"to":2208.37,"location":2,"content":"Um, there's max over time pooling in the pooling layer to capture"},{"from":2208.37,"to":2213.65,"location":2,"content":"the most relevant things and is giving us a single number for each channel."},{"from":2213.65,"to":2221.47,"location":2,"content":"Um, and we have features that look at different that have different kernel sizes."},{"from":2221.47,"to":2228.23,"location":2,"content":"Um, here's one other idea he used which is possibly a neat idea."},{"from":2228.23,"to":2233.66,"location":2,"content":"Um, he knows one of the things that you could even think about in various ways,"},{"from":2233.66,"to":2237.35,"location":2,"content":"um, for say a question answering system among other things."},{"from":2237.35,"to":2241.61,"location":2,"content":"Um, and so he used pre-trained word vectors."},{"from":2241.61,"to":2248.98,"location":2,"content":"Um, but what he did was he actually kind of doubled the word vectors."},{"from":2248.98,"to":2252.47,"location":2,"content":"So, for each word he had two copies of the word vector,"},{"from":2252.47,"to":2257.29,"location":2,"content":"and so you have sort of two channel sets and one set he"},{"from":2257.29,"to":2262.38,"location":2,"content":"froze and the other one he fine tuned as he trained."},{"from":2262.38,"to":2266.59,"location":2,"content":"So it's sort of he tried to get the best of both worlds of sort of fine tuning"},{"from":2266.59,"to":2271.77,"location":2,"content":"and not fine tuning and all that went into the max pooling operation."},{"from":2271.77,"to":2281.61,"location":2,"content":"Okay. 
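A minimal sketch of those two ideas in PyTorch (an illustration with invented sizes; merging the two embedding copies by summing their pooled features is only an approximation of the paper's exact multichannel setup): one embedding copy is frozen, one is fine-tuned, and max-over-time pooling collapses each feature map to a single number.

import torch
import torch.nn as nn

pretrained = torch.randn(1000, 300)                 # stand-in for pre-trained vectors
emb_frozen = nn.Embedding.from_pretrained(pretrained, freeze=True)
emb_tuned = nn.Embedding.from_pretrained(pretrained.clone(), freeze=False)

ids = torch.randint(0, 1000, (1, 9))                # one 9-word sentence
conv = nn.Conv1d(in_channels=300, out_channels=100, kernel_size=3)

pooled = []
for emb in (emb_frozen, emb_tuned):                 # the two channel sets
    h = torch.relu(conv(emb(ids).transpose(1, 2)))  # feature maps: (1, 100, 7)
    pooled.append(h.max(dim=2).values)              # max over time: (1, 100)
features = pooled[0] + pooled[1]                    # one way to merge the copies
print(features.shape)  # torch.Size([1, 100])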
Um, so, after the max pooling we get out one number for each channel and so,"},{"from":2281.61,"to":2286.76,"location":2,"content":"um, he has three sizes of convolutions, three,"},{"from":2286.76,"to":2290.39,"location":2,"content":"four, five, 100 features for each size."},{"from":2290.39,"to":2293.43,"location":2,"content":"So we're getting out a vector of size,"},{"from":2293.43,"to":2295.67,"location":2,"content":"um, 300 at that point,"},{"from":2295.67,"to":2299.81,"location":2,"content":"and at that point you're taking that final vector and just sticking it"},{"from":2299.81,"to":2304.59,"location":2,"content":"through a softmax and that's then giving your classification of the classes."},{"from":2304.59,"to":2311.49,"location":2,"content":"Um, so all of that can be summarized in this picture if it's big enough to sort of read."},{"from":2311.49,"to":2312.8,"location":2,"content":"So, here's our sentence."},{"from":2312.8,"to":2314.86,"location":2,"content":"I like this movie very much,"},{"from":2314.86,"to":2319.31,"location":2,"content":"which has, you know, our word embedding dimension is five,"},{"from":2319.31,"to":2322.26,"location":2,"content":"and so then doing it in this example,"},{"from":2322.26,"to":2326.93,"location":2,"content":"we are having two channels for each kernel size and"},{"from":2326.93,"to":2332.03,"location":2,"content":"we consider kernels of size two, three, and four."},{"from":2332.03,"to":2337.2,"location":2,"content":"Um, and so then we are getting two different ones."},{"from":2337.2,"to":2341.61,"location":2,"content":"Um, so we're getting, um, six."},{"from":2341.61,"to":2344.41,"location":2,"content":"This is showing six of our filters."},{"from":2344.41,"to":2347.18,"location":2,"content":"Um, so we apply those."},{"from":2347.18,"to":2350.97,"location":2,"content":"When we- when we apply those filters without any padding,"},{"from":2350.97,"to":2355.88,"location":2,"content":"we are then getting out these outputs of the filters which are of sizes four,"},{"from":2355.88,"to":2358.99,"location":2,"content":"five, and six respectively."},{"from":2358.99,"to":2363.07,"location":2,"content":"Um, and so then once we've got these"},{"from":2363.07,"to":2367.26,"location":2,"content":"for each of these sets of numbers we're doing one max pooling."},{"from":2367.26,"to":2370.88,"location":2,"content":"So, we're just taking the max of each of these,"},{"from":2370.88,"to":2376.72,"location":2,"content":"um, output features which gives us these six numbers."},{"from":2376.72,"to":2383.06,"location":2,"content":"Um, we can concatenate them all together into one vector which we feed into,"},{"from":2383.06,"to":2392.58,"location":2,"content":"um, a softmax over two classes as to whether sentiment is positive or negative."},{"from":2392.58,"to":2395.59,"location":2,"content":"Um, so that's basically the model."},{"from":2395.59,"to":2401.2,"location":2,"content":"So something- so this is sort of really actually a very simple,"},{"from":2401.2,"to":2403.63,"location":2,"content":"very computationally efficient, uh,"},{"from":2403.63,"to":2406.78,"location":2,"content":"model as to how to build a text classifier."},{"from":2406.78,"to":2413.16,"location":2,"content":"[NOISE] Um, yeah, just a couple more things to get through,"},{"from":2413.16,"to":2415.21,"location":2,"content":"um, so in one of the assignments,"},{"from":2415.21,"to":2417.7,"location":2,"content":"we talked about Dropout [NOISE] and you used 
it."},{"from":2417.7,"to":2419.07,"location":2,"content":"So, um, you know,"},{"from":2419.07,"to":2421.7,"location":2,"content":"hopefully you're all masters of Dropout at this point."},{"from":2421.7,"to":2424.72,"location":2,"content":"Um, so he was using Dropout, um,"},{"from":2424.72,"to":2428.18,"location":2,"content":"and this being 2014 and the,"},{"from":2428.18,"to":2431.82,"location":2,"content":"um, Dropout paper only coming out in 2014."},{"from":2431.82,"to":2434.89,"location":2,"content":"I guess, there'd been an earlier version that came out a couple of years earlier."},{"from":2434.89,"to":2437.16,"location":2,"content":"This was sort of still fairly early,"},{"from":2437.16,"to":2439.43,"location":2,"content":"um, to be taking advantage of Dropout."},{"from":2439.43,"to":2441.14,"location":2,"content":"So that while training,"},{"from":2441.14,"to":2444.11,"location":2,"content":"you've got this sort of Dropout vector, um,"},{"from":2444.11,"to":2449.01,"location":2,"content":"where you sample your Bernoulli random variables and you're, sort of,"},{"from":2449.01,"to":2454.82,"location":2,"content":"um, sort of, designed to drop out some of the features each time you are doing things."},{"from":2454.82,"to":2458.2,"location":2,"content":"At testing time, you don't do the dropout,"},{"from":2458.2,"to":2462.13,"location":2,"content":"but because before you were sort of dropping out a lot of stuff,"},{"from":2462.13,"to":2467.45,"location":2,"content":"you're scaling your weight matrix by the same probability that you use for dropping out,"},{"from":2467.45,"to":2469,"location":2,"content":"so that you get, sort of,"},{"from":2469,"to":2472,"location":2,"content":"vectors of the same scale as before."},{"from":2472,"to":2475.07,"location":2,"content":"Um, so as we sort of discussed in the assignment,"},{"from":2475.07,"to":2478.42,"location":2,"content":"Dropout is a really effective form of regularization,"},{"from":2478.42,"to":2480.58,"location":2,"content":"widely used in neural networks."},{"from":2480.58,"to":2483.7,"location":2,"content":"Um, he didn't only do that, he actually did,"},{"from":2483.7,"to":2487.6,"location":2,"content":"a kind of another sort of funky form of regularization."},{"from":2487.6,"to":2491.43,"location":2,"content":"So that's for the softmax weight vector,"},{"from":2491.43,"to":2495.28,"location":2,"content":"he constrained the L2 norms,"},{"from":2495.28,"to":2501.1,"location":2,"content":"so the squared norms of the weight vectors and the softmax, [NOISE] um,"},{"from":2501.1,"to":2505.41,"location":2,"content":"matrix, um, to a fixed number S,"},{"from":2505.41,"to":2507.46,"location":2,"content":"which was sort of set of the hyper-parameters,"},{"from":2507.46,"to":2509.51,"location":2,"content":"actually set to the value three."},{"from":2509.51,"to":2513.05,"location":2,"content":"Um, and if your weights were getting too large,"},{"from":2513.05,"to":2515.59,"location":2,"content":"they were being rescaled,"},{"from":2515.59,"to":2517.34,"location":2,"content":"um, so they didn't blow up."},{"from":2517.34,"to":2520.21,"location":2,"content":"Um, this isn't a very common thing to do."},{"from":2520.21,"to":2523.69,"location":2,"content":"I'm not sure it's very necessary, um, but, um,"},{"from":2523.69,"to":2525.85,"location":2,"content":"I guess it gives you some- I mean,"},{"from":2525.85,"to":2529.05,"location":2,"content":"I guess by showing you a few of the details of this 
one,"},{"from":2529.05,"to":2530.59,"location":2,"content":"my hope is, sort of,"},{"from":2530.59,"to":2533.68,"location":2,"content":"gives you some ideas about how there are lots of things you can play"},{"from":2533.68,"to":2537.03,"location":2,"content":"around with and muck with if you wanna try different things,"},{"from":2537.03,"to":2539.02,"location":2,"content":"um, for your final projects."},{"from":2539.02,"to":2541,"location":2,"content":"Um, okay."},{"from":2541,"to":2544.12,"location":2,"content":"So here are some of his final hyperparameters."},{"from":2544.12,"to":2547.36,"location":2,"content":"So he's using ReLU nonlinearities,"},{"from":2547.36,"to":2550.76,"location":2,"content":"um, window sizes of three, four, and five,"},{"from":2550.76,"to":2555.79,"location":2,"content":"the convolutions, hundred features or channels for each size,"},{"from":2555.79,"to":2558.53,"location":2,"content":"um, Dropout of a half as usual."},{"from":2558.53,"to":2561.86,"location":2,"content":"Um, you get several percentage improvements from dropout,"},{"from":2561.86,"to":2563.86,"location":2,"content":"which is quite common actually."},{"from":2563.86,"to":2567.84,"location":2,"content":"Um, the sort of L2 constraint, s equals three,"},{"from":2567.84,"to":2570.18,"location":2,"content":"mini batch of 50,"},{"from":2570.18,"to":2572.64,"location":2,"content":"300 dimensional word vectors,"},{"from":2572.64,"to":2575.76,"location":2,"content":"train to maximize dev set performance."},{"from":2575.76,"to":2578.83,"location":2,"content":"Okay. And here is the big table,"},{"from":2578.83,"to":2580.69,"location":2,"content":"you know, I was too lazy, um,"},{"from":2580.69,"to":2586.57,"location":2,"content":"to redo of performance on these different text classification data sets."},{"from":2586.57,"to":2588.46,"location":2,"content":"Um, there are lots of different ones."},{"from":2588.46,"to":2591.82,"location":2,"content":"So these two are both Stanford Sentiment Treebank."},{"from":2591.82,"to":2594.57,"location":2,"content":"This is the Subjective Objective Language."},{"from":2594.57,"to":2599.65,"location":2,"content":"This is the Question Classification, of is it asking for a person name and location,"},{"from":2599.65,"to":2600.79,"location":2,"content":"a company or whatever."},{"from":2600.79,"to":2604.15,"location":2,"content":"Um, this is, um,"},{"from":2604.15,"to":2606.28,"location":2,"content":"talking about, sort of, a perspective,"},{"from":2606.28,"to":2608.34,"location":2,"content":"which is another classification thing."},{"from":2608.34,"to":2610.89,"location":2,"content":"Consumer Reports is another sentiment one."},{"from":2610.89,"to":2616.21,"location":2,"content":"Um, so lots of data sets and then here are lots of models."},{"from":2616.21,"to":2621.58,"location":2,"content":"So the model- some of the models down here or here,"},{"from":2621.58,"to":2626.02,"location":2,"content":"are traditional feature-based, um, classifiers."},{"from":2626.02,"to":2628,"location":2,"content":"Um, so in particular,"},{"from":2628,"to":2632.23,"location":2,"content":"um, sort of Wang and me back in 2012,"},{"from":2632.23,"to":2636.03,"location":2,"content":"had sort of pointed out that by taking certain steps"},{"from":2636.03,"to":2640.72,"location":2,"content":"with n-gram features and other forms of normalization,"},{"from":2640.72,"to":2643.42,"location":2,"content":"that you could actually get quite good results 
with"},{"from":2643.42,"to":2646.96,"location":2,"content":"just the traditional feature, um, based classifiers."},{"from":2646.96,"to":2652.05,"location":2,"content":"So many people use that as a baseline for showing that you can do better things."},{"from":2652.05,"to":2654.36,"location":2,"content":"Um, the ones up here,"},{"from":2654.36,"to":2658.2,"location":2,"content":"were tree structured neural networks that my group was very fond"},{"from":2658.2,"to":2662.8,"location":2,"content":"of in the early 2010s and then up at the very top,"},{"from":2662.8,"to":2664.7,"location":2,"content":"uh, his CNN models."},{"from":2664.7,"to":2666.51,"location":2,"content":"And as you can see,"},{"from":2666.51,"to":2667.88,"location":2,"content":"it's sort of a mix."},{"from":2667.88,"to":2670.87,"location":2,"content":"Sometimes the CNN model wins,"},{"from":2670.87,"to":2673.01,"location":2,"content":"like in this column and this column,"},{"from":2673.01,"to":2676.01,"location":2,"content":"sometimes it doesn't win like in these columns."},{"from":2676.01,"to":2678.01,"location":2,"content":"Um, but in general, um,"},{"from":2678.01,"to":2680.26,"location":2,"content":"what you didn't see from this is that, you know,"},{"from":2680.26,"to":2683.14,"location":2,"content":"this is an extremely simple, um,"},{"from":2683.14,"to":2686.34,"location":2,"content":"convolutional neural network model and it actually does,"},{"from":2686.34,"to":2688.72,"location":2,"content":"um, kind of well on this system."},{"from":2688.72,"to":2694.72,"location":2,"content":"Um, you can quibble with this results table,"},{"from":2694.72,"to":2701.28,"location":2,"content":"and again in terms of like writing your propos- project proposal, um,"},{"from":2701.28,"to":2707.25,"location":2,"content":"one thing that you should do is kind of think about what you're reading, um,"},{"from":2707.25,"to":2710.1,"location":2,"content":"because, you know, a lot of papers aren't perfect"},{"from":2710.1,"to":2713.13,"location":2,"content":"and there are reasons to quibble with what they claim."},{"from":2713.13,"to":2717.78,"location":2,"content":"And sometimes if you think about what they're claiming and whether it's reasonable, um,"},{"from":2717.78,"to":2720.89,"location":2,"content":"there are reasons why it's not or there are ideas"},{"from":2720.89,"to":2724.41,"location":2,"content":"of how you could do things differently or show something different."},{"from":2724.41,"to":2727.32,"location":2,"content":"I mean, the main reason why you could quibble with,"},{"from":2727.32,"to":2731.36,"location":2,"content":"um, Yoon Kim's results table is, well,"},{"from":2731.36,"to":2735.39,"location":2,"content":"he already said, as I had a couple of slides back, um,"},{"from":2735.39,"to":2737.98,"location":2,"content":"that the statement that Dropout gives you"},{"from":2737.98,"to":2741.22,"location":2,"content":"two to four percent accuracy improvement in this neural nets."},{"from":2741.22,"to":2745.21,"location":2,"content":"[NOISE] Um, but most of these systems because they"},{"from":2745.21,"to":2749.36,"location":2,"content":"are older and were done before Dropout was invented,"},{"from":2749.36,"to":2751.39,"location":2,"content":"um, didn't make use of Dropout."},{"from":2751.39,"to":2755.17,"location":2,"content":"But, you know, any of these sort of neural net systems up here"},{"from":2755.17,"to":2759.45,"location":2,"content":"could have used Dropout and presumably it would have given them a couple 
of,"},{"from":2759.45,"to":2761.14,"location":2,"content":"um, percent gain as well."},{"from":2761.14,"to":2765.39,"location":2,"content":"So arguably, this is sort of a biased, unfair comparison."},{"from":2765.39,"to":2770.64,"location":2,"content":"And the right thing would have been to be comparing all the systems, um, using Dropout."},{"from":2770.64,"to":2772.12,"location":2,"content":"Um, but, you know,"},{"from":2772.12,"to":2773.89,"location":2,"content":"despite that, you know,"},{"from":2773.89,"to":2776.98,"location":2,"content":"this was still a prett- a lot of people noticed"},{"from":2776.98,"to":2780.82,"location":2,"content":"this paper because it showed that using this sort of very simple,"},{"from":2780.82,"to":2783.19,"location":2,"content":"very fast convolutional architecture,"},{"from":2783.19,"to":2788.25,"location":2,"content":"could give you strong results for text classification."},{"from":2788.25,"to":2791.01,"location":2,"content":"Um, that's that."},{"from":2791.01,"to":2793.76,"location":2,"content":"Yes. So in summary,"},{"from":2793.76,"to":2798.47,"location":2,"content":"you know, something that you should be thinking about for projects and otherwise,"},{"from":2798.47,"to":2804.37,"location":2,"content":"we're effectively building up a bigger toolkit of different tools you could be using,"},{"from":2804.37,"to":2808.14,"location":2,"content":"um, for projects or future work or whatever it is."},{"from":2808.14,"to":2809.64,"location":2,"content":"So starting off with,"},{"from":2809.64,"to":2813.25,"location":2,"content":"we had word vectors and then we could build bag of"},{"from":2813.25,"to":2817.11,"location":2,"content":"vector models by just taking the word vectors and averaging them."},{"from":2817.11,"to":2821.08,"location":2,"content":"And, you know, that's actually a surprisingly good baseline to start with."},{"from":2821.08,"to":2823.96,"location":2,"content":"We suggest to you in many cases for things like projects,"},{"from":2823.96,"to":2825.09,"location":2,"content":"you should use that."},{"from":2825.09,"to":2826.27,"location":2,"content":"See how well it does,"},{"from":2826.27,"to":2827.97,"location":2,"content":"make sure you're working better."},{"from":2827.97,"to":2830.61,"location":2,"content":"I mean particularly, you can do even better with that,"},{"from":2830.61,"to":2834.49,"location":2,"content":"if you sort of add some extra ReLU layers on top,"},{"from":2834.49,"to":2838.01,"location":2,"content":"which is an idea that's been explored in deep averaging networks."},{"from":2838.01,"to":2842.29,"location":2,"content":"Um, then we looked at window models which were very simple."},{"from":2842.29,"to":2843.85,"location":2,"content":"You're just taking these sort of"},{"from":2843.85,"to":2847.59,"location":2,"content":"five word windows and computing a feed-forward network on them,"},{"from":2847.59,"to":2852.84,"location":2,"content":"and they work very well for word classification problems that only need local context."},{"from":2852.84,"to":2856.05,"location":2,"content":"Things like, part of speech tagging or NER."},{"from":2856.05,"to":2859.39,"location":2,"content":"But then we've gone ahead and looked at some other models."},{"from":2859.39,"to":2865.41,"location":2,"content":"And so, um, CNN's are very good for text classification, um,"},{"from":2865.41,"to":2869.59,"location":2,"content":"and they're very good because they parallelize really well on GPUs,"},{"from":2869.59,"to":2871.84,"location":2,"content":"which 
is something I'll come back to again later."},{"from":2871.84,"to":2877.51,"location":2,"content":"So they, they just sort- for the general sort of representing sentence meaning,"},{"from":2877.51,"to":2879.1,"location":2,"content":"they're actually an efficient,"},{"from":2879.1,"to":2882.3,"location":2,"content":"versatile, good method, which has been used quite a bit."},{"from":2882.3,"to":2885.46,"location":2,"content":"And then they sort of contrast with recurrent neural networks."},{"from":2885.46,"to":2887.8,"location":2,"content":"Recurrent neural networks have some advantages."},{"from":2887.8,"to":2890.08,"location":2,"content":"They're sort of more cognitively plausible,"},{"from":2890.08,"to":2892.12,"location":2,"content":"because you're sort of reading through the text and,"},{"from":2892.12,"to":2894.14,"location":2,"content":"um, getting its meaning."},{"from":2894.14,"to":2896.83,"location":2,"content":"Um, recurrent neural networks are good for"},{"from":2896.83,"to":2899.8,"location":2,"content":"things like sequence tagging and classification,"},{"from":2899.8,"to":2903.39,"location":2,"content":"building language models to predict what's coming next."},{"from":2903.39,"to":2906.91,"location":2,"content":"Um, they can do really well when combined with attention."},{"from":2906.91,"to":2909.57,"location":2,"content":"Um, but they also have some disadvantages."},{"from":2909.57,"to":2913.87,"location":2,"content":"They're way slower than convolutional neural networks and if what you wanna"},{"from":2913.87,"to":2918.3,"location":2,"content":"do is get out some kind of overall meaning representation of a sentence,"},{"from":2918.3,"to":2919.84,"location":2,"content":"you know, \"What does this mean?"},{"from":2919.84,"to":2921.38,"location":2,"content":"Are these two, um,"},{"from":2921.38,"to":2923.85,"location":2,"content":"phrases paraphrases with each other?\""},{"from":2923.85,"to":2926.73,"location":2,"content":"There are now many results that show that people"},{"from":2926.73,"to":2929.8,"location":2,"content":"don't get better results with recurrent neural networks."},{"from":2929.8,"to":2935.55,"location":2,"content":"They can get better results using techniques like convolutional neural networks."},{"from":2935.55,"to":2945.01,"location":2,"content":"Okay. 
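Pulling the whole pipeline together, here is a minimal sketch of a Kim-style sentence classifier under the hyperparameters quoted earlier (kernel sizes three, four, five with 100 channels each, dropout 0.5, then a softmax via cross-entropy on the logits); the class name and sizes are my own invention, and details like the dual channels and the L2 constraint are omitted.

import torch
import torch.nn as nn

class KimStyleCNN(nn.Module):
    def __init__(self, vocab=10000, dim=300, classes=2):
        super().__init__()
        self.emb = nn.Embedding(vocab, dim)
        self.convs = nn.ModuleList(
            nn.Conv1d(dim, 100, kernel_size=k) for k in (3, 4, 5))
        self.drop = nn.Dropout(0.5)
        self.out = nn.Linear(300, classes)   # 3 kernel sizes x 100 channels

    def forward(self, ids):                  # ids: (batch, words)
        x = self.emb(ids).transpose(1, 2)    # (batch, dim, words)
        # one max-over-time pooled vector per kernel size, then concatenate
        pooled = [torch.relu(c(x)).max(dim=2).values for c in self.convs]
        return self.out(self.drop(torch.cat(pooled, dim=1)))

logits = KimStyleCNN()(torch.randint(0, 10000, (4, 20)))
print(logits.shape)  # torch.Size([4, 2]); feed to softmax / cross-entropy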
[NOISE] So in the next step then [NOISE] is to,"},{"from":2945.01,"to":2949.68,"location":2,"content":"sort of, head towards our com- our complex,"},{"from":2949.68,"to":2952.38,"location":2,"content":"um, convolutional architecture example."},{"from":2952.38,"to":2954.01,"location":2,"content":"So before getting to that,"},{"from":2954.01,"to":2958.53,"location":2,"content":"I just wanna sort of introduce a few concepts that we haven't seen,"},{"from":2958.53,"to":2962.63,"location":2,"content":"all of which, um, start to turn up when we do this."},{"from":2962.63,"to":2966.36,"location":2,"content":"So we spent a lot of time in the sequence models part,"},{"from":2966.36,"to":2972.34,"location":2,"content":"talking about gated models or the gated recurrent units and the LSTM units."},{"from":2972.34,"to":2976.08,"location":2,"content":"But the idea of a gate is general that we can"},{"from":2976.08,"to":2980.13,"location":2,"content":"sort of have this idea that we can calculate something,"},{"from":2980.13,"to":2982.18,"location":2,"content":"put it through, um,"},{"from":2982.18,"to":2987.37,"location":2,"content":"a sigmoid nonlinearity and get a value between zero and one,"},{"from":2987.37,"to":2990.39,"location":2,"content":"um, or a vector of values between zero and one."},{"from":2990.39,"to":2992.98,"location":2,"content":"And then do a Hadamard product with a vector"},{"from":2992.98,"to":2995.86,"location":2,"content":"and sort of gate it between its value and zero."},{"from":2995.86,"to":2999.49,"location":2,"content":"So that suggests the idea that you could also apply"},{"from":2999.49,"to":3004.11,"location":2,"content":"gates vertically when you're building multilayer networks."},{"from":3004.11,"to":3007.84,"location":2,"content":"And after the success of LSTMs had been proven,"},{"from":3007.84,"to":3011.78,"location":2,"content":"that was, um, an idea that really took off,"},{"from":3011.78,"to":3013.73,"location":2,"content":"was people started exploring,"},{"from":3013.73,"to":3019.45,"location":2,"content":"how can we have, use these ideas of skip connections and gating in a,"},{"from":3019.45,"to":3021.43,"location":2,"content":"in a vertical direction?"},{"from":3021.43,"to":3023.48,"location":2,"content":"And here are two versions of it."},{"from":3023.48,"to":3026.45,"location":2,"content":"This one is a very simple one,"},{"from":3026.45,"to":3030.97,"location":2,"content":"but a very successful one that's basically just about a skip connection."},{"from":3030.97,"to":3036.89,"location":2,"content":"So and this is referred to as a residual block and- which is used in residual networks,"},{"from":3036.89,"to":3038.69,"location":2,"content":"otherwise known as ResNets."},{"from":3038.69,"to":3042.47,"location":2,"content":"Um, so in a residual block, for each block,"},{"from":3042.47,"to":3048.44,"location":2,"content":"you allow a value just to skip ahead to the next, um, layer."},{"from":3048.44,"to":3052.53,"location":2,"content":"Or you can stick it through a conv block,"},{"from":3052.53,"to":3056.82,"location":2,"content":"and the typical conv block is you go through a convolutional layer,"},{"from":3056.82,"to":3059.6,"location":2,"content":"you then go through a ReLU nonlinearity,"},{"from":3059.6,"to":3063.25,"location":2,"content":"another convolutional layer, and then when you come out,"},{"from":3063.25,"to":3065.43,"location":2,"content":"you just sum these two values."},{"from":3065.43,"to":3067.71,"location":2,"content":"So this is the same idea 
that sort of"},{"from":3067.71,"to":3071.82,"location":2,"content":"summing values is magical in the same way as an LSTM."},{"from":3071.82,"to":3075.16,"location":2,"content":"And then you put the output of that through another ReLU,"},{"from":3075.16,"to":3078.7,"location":2,"content":"and this thing here is called a residual block"},{"from":3078.7,"to":3082.95,"location":2,"content":"and then commonly you'll stack residual blocks on top of each other."},{"from":3082.95,"to":3085.23,"location":2,"content":"Um, there's one little trick here,"},{"from":3085.23,"to":3088.32,"location":2,"content":"um, which is you need to use padding, right?"},{"from":3088.32,"to":3093,"location":2,"content":"Um, because at the end of the day since you want to sum these two pathways,"},{"from":3093,"to":3095.36,"location":2,"content":"you want them to be the same size."},{"from":3095.36,"to":3096.59,"location":2,"content":"And if you, sort of,"},{"from":3096.59,"to":3100.2,"location":2,"content":"have them shrinking in the conv blocks you wouldn't be able to sum them."},{"from":3100.2,"to":3105.12,"location":2,"content":"So you want to, sort of, have a padding at each stage so they stay the same size here,"},{"from":3105.12,"to":3107.44,"location":2,"content":"and so that you can add them together."},{"from":3107.44,"to":3114.5,"location":2,"content":"Um, here's, um, a different version of a block which is"},{"from":3114.5,"to":3117.47,"location":2,"content":"sort of more LSTM-ish and indeed"},{"from":3117.47,"to":3121.71,"location":2,"content":"this block was developed by Jürgen Schmidhuber and students,"},{"from":3121.71,"to":3126,"location":2,"content":"who's the same guy who's behind LSTMs and you can see the same thinking."},{"from":3126,"to":3128.15,"location":2,"content":"It's called a highway block."},{"from":3128.15,"to":3130.8,"location":2,"content":"So in a way it's sort of similar."},{"from":3130.8,"to":3136.08,"location":2,"content":"You've got, you know, kind of thinking of moving an identity x that skips"},{"from":3136.08,"to":3143.09,"location":2,"content":"a nonlinear block or you can have it go through exactly the same stuff conv, relu, conv."},{"from":3143.09,"to":3146.48,"location":2,"content":"The difference is that unlike this one,"},{"from":3146.48,"to":3149.16,"location":2,"content":"this time there's explicit gates so there's,"},{"from":3149.16,"to":3153.29,"location":2,"content":"um, and this T-gate and the C-gate."},{"from":3153.29,"to":3159.23,"location":2,"content":"And so you're multiplying both of the path through here and the path through here"},{"from":3159.23,"to":3162.28,"location":2,"content":"by a gate just kinda like the sort of"},{"from":3162.28,"to":3167.13,"location":2,"content":"the get input gates that we saw before and then summing them together."},{"from":3167.13,"to":3170.67,"location":2,"content":"So that sort of feels more"},{"from":3170.67,"to":3176.28,"location":2,"content":"powerful but it's not actually clear that it is more powerful."},{"from":3176.28,"to":3179.46,"location":2,"content":"I mean, this one actually has a very simple"},{"from":3179.46,"to":3183.07,"location":2,"content":"semantic because if you think of the semantics of this one"},{"from":3183.07,"to":3185.93,"location":2,"content":"is the default is just you walk"},{"from":3185.93,"to":3191.01,"location":2,"content":"this way and you just sort of carry forward your value and do nothing."},{"from":3191.01,"to":3194.9,"location":2,"content":"Um, so, what this block's job to- is to 
do,"},{"from":3194.9,"to":3198.16,"location":2,"content":"is to learn a delta that is meant to learn"},{"from":3198.16,"to":3201.75,"location":2,"content":"what kind of deviation you have from doing nothing."},{"from":3201.75,"to":3205.21,"location":2,"content":"Um, so that's a nice simple semantic which, um,"},{"from":3205.21,"to":3208.68,"location":2,"content":"seems to work well in neural networks to learn things."},{"from":3208.68,"to":3211.39,"location":2,"content":"Um, this sort of has"},{"from":3211.39,"to":3216.5,"location":2,"content":"more complicated apparent semantics because you're taking, you know,"},{"from":3216.5,"to":3223.01,"location":2,"content":"some parts of the identity multiplying by this sort of gate in a Hadamard product"},{"from":3223.01,"to":3229.88,"location":2,"content":"and some parts of this conv block multiplied by this other gate T in a Hadamard product."},{"from":3229.88,"to":3233.98,"location":2,"content":"So that sort of feels more powerful as that"},{"from":3233.98,"to":3238.32,"location":2,"content":"gives me a lot more control because I can take pieces of the different ones and so on."},{"from":3238.32,"to":3241.62,"location":2,"content":"If you think about it for a bit longer, I mean,"},{"from":3241.62,"to":3245.38,"location":2,"content":"mathematically it's actually not any more powerful that you"},{"from":3245.38,"to":3249.5,"location":2,"content":"can represent anything you can do with this one with that one."},{"from":3249.5,"to":3253.53,"location":2,"content":"And the way to think about that is well, um,"},{"from":3253.53,"to":3259.41,"location":2,"content":"you know, here you're kind of keeping only part of the identity,"},{"from":3259.41,"to":3266.84,"location":2,"content":"um, but what you could do is keep the whole of the identity and see it as your job"},{"from":3266.84,"to":3270.09,"location":2,"content":"to subtract off the bits that this one isn't keeping"},{"from":3270.09,"to":3274.44,"location":2,"content":"over here in the conv block which you can do theoretically."},{"from":3274.44,"to":3279.48,"location":2,"content":"Um, and so, you can sort of anything you can compute with this as a function,"},{"from":3279.48,"to":3282.83,"location":2,"content":"you can actually compute with a, um, ResNet block."},{"from":3282.83,"to":3287.18,"location":2,"content":"Um, and so then as quite often in neural network land,"},{"from":3287.18,"to":3289.33,"location":2,"content":"the question isn't sort of, um,"},{"from":3289.33,"to":3293.19,"location":2,"content":"some kind of proof of compute- can be computed or not."},{"from":3293.19,"to":3298.45,"location":2,"content":"It sort of comes down to learning and regularization questions as to"},{"from":3298.45,"to":3301.34,"location":2,"content":"whether one or the other of these actually proves"},{"from":3301.34,"to":3306.43,"location":2,"content":"better as something to use in a learning architecture."},{"from":3306.43,"to":3309.68,"location":2,"content":"Okay. 
Second concept."},{"from":3309.68,"to":3311.86,"location":2,"content":"Um, batch normalization."},{"from":3311.86,"to":3317.41,"location":2,"content":"So when people are building deep convolutional neural networks,"},{"from":3317.41,"to":3321.68,"location":2,"content":"um, in the 2015 pluses,"},{"from":3321.68,"to":3327.07,"location":2,"content":"um, they almost always use batch normalization layers because"},{"from":3327.07,"to":3332.68,"location":2,"content":"this makes your life a lot better and if they're not using batch normalization layers,"},{"from":3332.68,"to":3337.07,"location":2,"content":"they're normally using one of the other variant ideas that people have suggested"},{"from":3337.07,"to":3342.16,"location":2,"content":"such as layer normalization which is sort of meant to do about the same thing."},{"from":3342.16,"to":3346.09,"location":2,"content":"Um, so what batch normalization does?"},{"from":3346.09,"to":3350.65,"location":2,"content":"I mean, I think many of you will have seen somewhere in steps or"},{"from":3350.65,"to":3356.3,"location":2,"content":"otherwise the idea of doing a Z-transform which means you take your data,"},{"from":3356.3,"to":3359.1,"location":2,"content":"you work out its mean and you work out its"},{"from":3359.1,"to":3363.97,"location":2,"content":"standard deviation and then you rescale by subtraction and"},{"from":3363.97,"to":3367.71,"location":2,"content":"multiplication so that you have a set of data which"},{"from":3367.71,"to":3372.36,"location":2,"content":"has a mean of zero and a standard deviation of one."},{"from":3372.36,"to":3374.68,"location":2,"content":"Most people see that, right?"},{"from":3374.68,"to":3383.5,"location":2,"content":"Yeah? Um, so batch normalization is effectively doing exactly that but in a weird way."},{"from":3383.5,"to":3387.77,"location":2,"content":"So what you're doing is that you're taking each mini batch."},{"from":3387.77,"to":3391.88,"location":2,"content":"So whatever just random 32 examples you've stuck in a mini batch,"},{"from":3391.88,"to":3394.04,"location":2,"content":"you're running them through a layer of"},{"from":3394.04,"to":3397.36,"location":2,"content":"your neural network like a ConvBlock that we saw before"},{"from":3397.36,"to":3403.19,"location":2,"content":"and you take the output of that mini batch and then you do a Z-transform on it."},{"from":3403.19,"to":3407.3,"location":2,"content":"Um, and then it goes forward into the next ConvBlock or whatever,"},{"from":3407.3,"to":3409.41,"location":2,"content":"and the next time you have a different mini batch,"},{"from":3409.41,"to":3410.99,"location":2,"content":"you just Z-transform it."},{"from":3410.99,"to":3412.29,"location":2,"content":"So it seems a little bit weird."},{"from":3412.29,"to":3416.6,"location":2,"content":"You're just doing it on the output of these mini batches."},{"from":3416.6,"to":3421.68,"location":2,"content":"Um, but that's proven to be a very effective thing to do."},{"from":3421.68,"to":3425.98,"location":2,"content":"So that it sort of means that what comes out of"},{"from":3425.98,"to":3429.89,"location":2,"content":"a ConvBlock sort of always has the same kind of scale."},{"from":3429.89,"to":3433.72,"location":2,"content":"So it doesn't sort of fluctuate a lot and mess things up and it tends to"},{"from":3433.72,"to":3438.22,"location":2,"content":"make the models just much more reliably trainable because,"},{"from":3438.22,"to":3442.86,"location":2,"content":"you know, you just have to be much less 
fussy about a lot of things."},{"from":3442.86,"to":3445.51,"location":2,"content":"Because, you know, a lot of the things we've talked about,"},{"from":3445.51,"to":3448.18,"location":2,"content":"about initializing your parameters and"},{"from":3448.18,"to":3451.13,"location":2,"content":"setting your learning rates is sort of about, well,"},{"from":3451.13,"to":3454.31,"location":2,"content":"you have to keep the scale of things about right so they don't get"},{"from":3454.31,"to":3457.81,"location":2,"content":"too big or too small and things like that."},{"from":3457.81,"to":3460.28,"location":2,"content":"Whereas, if you're doing this batch normalization,"},{"from":3460.28,"to":3462.49,"location":2,"content":"you're sort of forcing scale,"},{"from":3462.49,"to":3465.7,"location":2,"content":"um, to be the same size each time."},{"from":3465.7,"to":3468.37,"location":2,"content":"And so therefore, you kind of don't have to do"},{"from":3468.37,"to":3471.2,"location":2,"content":"the other stuff as well and it still tends to,"},{"from":3471.2,"to":3472.71,"location":2,"content":"um, work pretty well."},{"from":3472.71,"to":3475.69,"location":2,"content":"So that's a good technique to know about."},{"from":3475.69,"to":3479.8,"location":2,"content":"Okay. Um, one last thing to learn about."},{"from":3479.8,"to":3482.07,"location":2,"content":"Um, there's a concept of,"},{"from":3482.07,"to":3487.01,"location":2,"content":"um, size one convolutions."},{"from":3487.01,"to":3491.24,"location":2,"content":"Um, and actually, I guess I really sort of, um,"},{"from":3491.24,"to":3494.68,"location":2,"content":"renamed it- I named this wrong because I wrote down"},{"from":3494.68,"to":3498.24,"location":2,"content":"one by one convolutions because that's the term you normally see."},{"from":3498.24,"to":3502.53,"location":2,"content":"But that's, um, from the vision world where you have 2D convolutions."},{"from":3502.53,"to":3506.14,"location":2,"content":"So I guess I should have just called this one convolutions."},{"from":3506.14,"to":3508.89,"location":2,"content":"So you can have convolutions, um,"},{"from":3508.89,"to":3513.07,"location":2,"content":"with a kernel size of one and when you first see that,"},{"from":3513.07,"to":3517.84,"location":2,"content":"it seems like that makes no sense whatsoever because the whole idea"},{"from":3517.84,"to":3523.3,"location":2,"content":"of a convolution was I was taking this patch and calculating something from it."},{"from":3523.3,"to":3528.33,"location":2,"content":"If I'm not looking at any other words,"},{"from":3528.33,"to":3530.51,"location":2,"content":"surely I'm calculating nothing."},{"from":3530.51,"to":3534.97,"location":2,"content":"But what actually happens in the size one convolution,"},{"from":3534.97,"to":3539.16,"location":2,"content":"is if you have a number of channels that"},{"from":3539.16,"to":3543.85,"location":2,"content":"sort of in a previous layer if you'd calculated whatever it was,"},{"from":3543.85,"to":3546.61,"location":2,"content":"32 channels or something like that."},{"from":3546.61,"to":3551.07,"location":2,"content":"What the one by one convolution is doing is acting as"},{"from":3551.07,"to":3556.63,"location":2,"content":"a tiny little embedded fully-connected network over those channels."},{"from":3556.63,"to":3558.91,"location":2,"content":"And so you're sort of doing a"},{"from":3558.91,"to":3562.28,"location":2,"content":"position specific fully-connected 
network,"},{"from":3562.28,"to":3566.39,"location":2,"content":"um, in- for each row of your data."},{"from":3566.39,"to":3568.05,"location":2,"content":"And so you can do that,"},{"from":3568.05,"to":3569.59,"location":2,"content":"um, for various reasons."},{"from":3569.59,"to":3571.92,"location":2,"content":"You can do it because you want to map down from having"},{"from":3571.92,"to":3574.87,"location":2,"content":"a lot of channels to having fewer channels or"},{"from":3574.87,"to":3577.46,"location":2,"content":"you can do it just because you think another non-linearity"},{"from":3577.46,"to":3580.34,"location":2,"content":"will help and this is a really cheap way to do it."},{"from":3580.34,"to":3584.15,"location":2,"content":"Because the crucial thing to notice is that if you sort"},{"from":3584.15,"to":3587.99,"location":2,"content":"of put fully-connected layers over everything,"},{"from":3587.99,"to":3592.93,"location":2,"content":"they involve a lot of parameters whereas putting in these size"},{"from":3592.93,"to":3596.65,"location":2,"content":"one convolutions involve very few parameters"},{"from":3596.65,"to":3600.67,"location":2,"content":"because you're just doing it at the level of a single word."},{"from":3600.67,"to":3603.76,"location":2,"content":"Um, okay."},{"from":3603.76,"to":3608.59,"location":2,"content":"Um, two random things and then I'll go onto my complex model."},{"from":3608.59,"to":3610.54,"location":2,"content":"Um, this is just a sort of"},{"from":3610.54,"to":3613.66,"location":2,"content":"almost a bias- aside but it just shows"},{"from":3613.66,"to":3617.11,"location":2,"content":"something different that you could do and it's something that you could play with."},{"from":3617.11,"to":3620.07,"location":2,"content":"I mean, when we talked about machine translation,"},{"from":3620.07,"to":3624.5,"location":2,"content":"we talk about the SIC to SIC architecture that was introduced in"},{"from":3624.5,"to":3629.93,"location":2,"content":"2014 and has been very successful for machine translation."},{"from":3629.93,"to":3632.68,"location":2,"content":"But actually, the year before that came out,"},{"from":3632.68,"to":3634.86,"location":2,"content":"um, there was a paper, um,"},{"from":3634.86,"to":3641.26,"location":2,"content":"doing neural machine translation by Nal Kalchbrenner and Phil Blunsom in the UK."},{"from":3641.26,"to":3644.01,"location":2,"content":"And this sort of was actually essentially"},{"from":3644.01,"to":3648.84,"location":2,"content":"the first neural machine translation paper of the modern era."},{"from":3648.84,"to":3650.4,"location":2,"content":"If you dig back far enough,"},{"from":3650.4,"to":3652.13,"location":2,"content":"there are actually a couple of people that tried to use"},{"from":3652.13,"to":3654.14,"location":2,"content":"neural networks for machine translation"},{"from":3654.14,"to":3658.45,"location":2,"content":"in the '80s and '90s but this was sort of the first one that restarted it,"},{"from":3658.45,"to":3662.2,"location":2,"content":"and they didn't actually use a SIC to SIC architecture."},{"from":3662.2,"to":3665.69,"location":2,"content":"So what they used was for the encoder,"},{"from":3665.69,"to":3668.49,"location":2,"content":"they used the convolutional neural networks."},{"from":3668.49,"to":3673.43,"location":2,"content":"And so that they had a stack of convolutional neural networks that progressively shrunk"},{"from":3673.43,"to":3678.76,"location":2,"content":"down the input and then finally 
pooled it to get a sentence representation,"},{"from":3678.76,"to":3682.96,"location":2,"content":"and then they used a sequence model as the decoder."},{"from":3682.96,"to":3686.88,"location":2,"content":"Um, so, um, that's sort of something that you could"},{"from":3686.88,"to":3690.52,"location":2,"content":"try in some other applications that for encoders,"},{"from":3690.52,"to":3693.8,"location":2,"content":"it's really easy to use convolutional neural networks."},{"from":3693.8,"to":3699.18,"location":2,"content":"There has been work on using convolutional neural networks as decoders as well,"},{"from":3699.18,"to":3704.41,"location":2,"content":"though that's a little bit harder to get your brain around and isn't used nearly as much."},{"from":3704.41,"to":3710.96,"location":2,"content":"Then the second thing I want to mention because we'll turn to it in just a minute is so,"},{"from":3710.96,"to":3717.3,"location":2,"content":"so far we've done convolutional models over words so that"},{"from":3717.3,"to":3720.89,"location":2,"content":"our kernels are effectively picking up"},{"from":3720.89,"to":3726.05,"location":2,"content":"these word n-gram units of two-word or three-word sub-sequences."},{"from":3726.05,"to":3730.19,"location":2,"content":"And the idea that then developed fairly soon was well maybe"},{"from":3730.19,"to":3734.7,"location":2,"content":"it would also be useful to use convolutions over characters."},{"from":3734.7,"to":3737.11,"location":2,"content":"So, you could run a convolutional neural network"},{"from":3737.11,"to":3739.97,"location":2,"content":"over the characters of the word to try and,"},{"from":3739.97,"to":3742.64,"location":2,"content":"um, generate a word embedding, um,"},{"from":3742.64,"to":3745.76,"location":2,"content":"and this idea has been explored quite a lot, um,"},{"from":3745.76,"to":3748.51,"location":2,"content":"it's part of what you guys are gonna do for assignment"},{"from":3748.51,"to":3751.72,"location":2,"content":"five is build a character level ConvNet,"},{"from":3751.72,"to":3755.18,"location":2,"content":"um, for your improved machine translation system."},{"from":3755.18,"to":3760.25,"location":2,"content":"I'm not going to say sort of a huge amount about the foundations of this today, um,"},{"from":3760.25,"to":3764.27,"location":2,"content":"because Thursday's lecture is then talking about subword models"},{"from":3764.27,"to":3769.05,"location":2,"content":"and we'll go through all the details of different subword models."},{"from":3769.05,"to":3773.3,"location":2,"content":"But, I wanted to show you a con- a complex"},{"from":3773.3,"to":3778.01,"location":2,"content":"convolutional neural network which is also used for text classification."},{"from":3778.01,"to":3781.68,"location":2,"content":"So, essentially, the same task as Yoon Kim's model"},{"from":3781.68,"to":3786.23,"location":2,"content":"and this model actually is built on characters,"},{"from":3786.23,"to":3787.7,"location":2,"content":"it's not built on words."},{"from":3787.7,"to":3790.64,"location":2,"content":"So, we aren't, at the foundation of it,"},{"from":3790.64,"to":3793.14,"location":2,"content":"um, having a word-like model."},{"from":3793.14,"to":3796.78,"location":2,"content":"Um, so, this is a paper from 2017,"},{"from":3796.78,"to":3801.35,"location":2,"content":"um, by, um, the four authors shown here, um,"},{"from":3801.35,"to":3804.17,"location":2,"content":"people working at Facebook AI 
Research,"},{"from":3804.17,"to":3807.64,"location":2,"content":"um, in France, um, and so,"},{"from":3807.64,"to":3810.32,"location":2,"content":"they kind of had an interesting hypothesis for"},{"from":3810.32,"to":3814.2,"location":2,"content":"this paper which was essentially to say, that, you know,"},{"from":3814.2,"to":3822.53,"location":2,"content":"by 2017 people who are using deep learning for vision were building really,"},{"from":3822.53,"to":3827.6,"location":2,"content":"really deep networks and fi- finding that they work much,"},{"from":3827.6,"to":3829.79,"location":2,"content":"much better for vision tasks."},{"from":3829.79,"to":3832.2,"location":2,"content":"So, essentially to some extend,"},{"from":3832.2,"to":3838.49,"location":2,"content":"the breakthrough was these guys that once these ideas that emerged,"},{"from":3838.49,"to":3844.45,"location":2,"content":"it then proved that it wasn't just that you could build a six layer or an eight layer,"},{"from":3844.45,"to":3847.58,"location":2,"content":"um, Convolutional Neural Network for vision tasks."},{"from":3847.58,"to":3849.2,"location":2,"content":"You could start building really,"},{"from":3849.2,"to":3854.27,"location":2,"content":"really deep networks for vision tasks which had tens or even hundreds of"},{"from":3854.27,"to":3861.21,"location":2,"content":"layers and that those models when trained on a lot of data proved to work even better."},{"from":3861.21,"to":3867.11,"location":2,"content":"So, um, if that's what's in your head and you then looked,"},{"from":3867.11,"to":3873.97,"location":2,"content":"look at what was and indeed is happening in natural language processing,"},{"from":3873.97,"to":3876.41,"location":2,"content":"the observation is, you know,"},{"from":3876.41,"to":3878.39,"location":2,"content":"these NLP people are kind of pathetic,"},{"from":3878.39,"to":3883.55,"location":2,"content":"they claim they're doing deep learning but they're still working with three layer LSTMs."},{"from":3883.55,"to":3886.47,"location":2,"content":"Surely, we can make some progress, um,"},{"from":3886.47,"to":3893.74,"location":2,"content":"by building really deep networks that kinda look like vision networks and using them,"},{"from":3893.74,"to":3897.03,"location":2,"content":"um, for natural language processing goals."},{"from":3897.03,"to":3901.41,"location":2,"content":"And so, that is precisely what they said about doing."},{"from":3901.41,"to":3908.93,"location":2,"content":"So, that they designed and built really deep network which sort of looks like a vision stack,"},{"from":3908.93,"to":3914.9,"location":2,"content":"um, as a convolutional neural network that is built over characters."},{"from":3914.9,"to":3920.66,"location":2,"content":"Um, so, I've got the picture of it here but sufficiently deep that it's fitting it on"},{"from":3920.66,"to":3923.39,"location":2,"content":"the slide and making it readable [LAUGHTER] is a little bit"},{"from":3923.39,"to":3926.15,"location":2,"content":"of a challenge but we can try and look at this."},{"from":3926.15,"to":3927.26,"location":2,"content":"So, at the bottom,"},{"from":3927.26,"to":3929.24,"location":2,"content":"we have the text, um,"},{"from":3929.24,"to":3933.97,"location":2,"content":"which is a sequence of characters and so, um,"},{"from":3933.97,"to":3936.98,"location":2,"content":"for the text, um, so,"},{"from":3936.98,"to":3940.64,"location":2,"content":"when people do vision object recognition 
on"},{"from":3940.64,"to":3944.93,"location":2,"content":"pictures normally all the pictures are made the same size."},{"from":3944.93,"to":3950.22,"location":2,"content":"Right. You make every picture 300 pixels by 300 pixels or something like that."},{"from":3950.22,"to":3953.38,"location":2,"content":"So, they do exactly the same for NLP, um,"},{"from":3953.38,"to":3955.49,"location":2,"content":"they have a size, um,"},{"from":3955.49,"to":3959.69,"location":2,"content":"for their document which is 1024 characters."},{"from":3959.69,"to":3963.71,"location":2,"content":"If it's longer than that they truncate it and keep the first part."},{"from":3963.71,"to":3966.47,"location":2,"content":"If it's shorter than that they pad it until it's of"},{"from":3966.47,"to":3971.32,"location":2,"content":"size 1024 and then they're gonna stick it into their stack."},{"from":3971.32,"to":3975.44,"location":2,"content":"So, the first part is that for each character,"},{"from":3975.44,"to":3978.2,"location":2,"content":"they're going to learn a character embedding now and"},{"from":3978.2,"to":3982.14,"location":2,"content":"their character embeddings are of dimensionality 16."},{"from":3982.14,"to":3989.54,"location":2,"content":"So, that the piece of text is now 16 by 1024, um, so,"},{"from":3989.54,"to":3993.77,"location":2,"content":"they're going to stick that through a convolutional layer where"},{"from":3993.77,"to":3998.21,"location":2,"content":"you've got kernel size of three and 64 output channels."},{"from":3998.21,"to":4004.15,"location":2,"content":"So you now have something that's 64 times of 1024 in size."},{"from":4004.15,"to":4007.9,"location":2,"content":"You now stick this through a convolutional block."},{"from":4007.9,"to":4012.09,"location":2,"content":"I'll explain the details of that convolutional block on the next slide but,"},{"from":4012.09,"to":4016.36,"location":2,"content":"you should be thinking of that ResNet picture I showed earlier where you"},{"from":4016.36,"to":4021.31,"location":2,"content":"can either be going through some convolutions or taking this optional shortcut."},{"from":4021.31,"to":4025.18,"location":2,"content":"Another ResNet, another residual block"},{"from":4025.18,"to":4028.76,"location":2,"content":"where you can be going through convolutions are an optional shortcut,"},{"from":4028.76,"to":4035.02,"location":2,"content":"um, they're then doing local pooling in the same way people typically do envision."},{"from":4035.02,"to":4037.99,"location":2,"content":"So, commonly what people do in vision systems"},{"from":4037.99,"to":4041.53,"location":2,"content":"is you are sort of shrinking the size of the images, um,"},{"from":4041.53,"to":4045.82,"location":2,"content":"by doing pooling that halves the dimensions in each direction."},{"from":4045.82,"to":4047.02,"location":2,"content":"But, at the same time,"},{"from":4047.02,"to":4049.01,"location":2,"content":"you do that in your neural network,"},{"from":4049.01,"to":4051.72,"location":2,"content":"you expand the number of channels,"},{"from":4051.72,"to":4054.13,"location":2,"content":"and so you make it deeper in terms of the number of"},{"from":4054.13,"to":4058.11,"location":2,"content":"channels at the same time as you make it smaller in the x,"},{"from":4058.11,"to":4059.71,"location":2,"content":"y size of the image."},{"from":4059.71,"to":4064.12,"location":2,"content":"So, they do exactly the same apart from these one-dimensional 
convolutions."},{"from":4064.12,"to":4069.76,"location":2,"content":"So, before we had 64 channels in our 1024 character,"},{"from":4069.76,"to":4074.43,"location":2,"content":"um, embedding, um, document."},{"from":4074.43,"to":4077.11,"location":2,"content":"So, now we pool it, um, so,"},{"from":4077.11,"to":4083.61,"location":2,"content":"we're going to have 512 positions which are sort of like pairs of characters,"},{"from":4083.61,"to":4086.44,"location":2,"content":"um, but we now have 128 channels"},{"from":4086.44,"to":4089.38,"location":2,"content":"and then they kind of repeat that over and over again, right?"},{"from":4089.38,"to":4091.69,"location":2,"content":"So, there are two more convolutional blocks which I'll"},{"from":4091.69,"to":4094.28,"location":2,"content":"explain more but they're sort of residual blocks."},{"from":4094.28,"to":4097.96,"location":2,"content":"They pool it again and they do exactly the same thing."},{"from":4097.96,"to":4101.31,"location":2,"content":"So, now there are 256, um,"},{"from":4101.31,"to":4106.9,"location":2,"content":"positions which are like four character blocks and they have 256 channels,"},{"from":4106.9,"to":4111.46,"location":2,"content":"um, I can't point high enough but they repeat that again and they pool again."},{"from":4111.46,"to":4113.59,"location":2,"content":"So, now they've got, um,"},{"from":4113.59,"to":4116.71,"location":2,"content":"128 positions which are about eight characters"},{"from":4116.71,"to":4120.77,"location":2,"content":"each and they have 512 channels representing that."},{"from":4120.77,"to":4125.08,"location":2,"content":"They pool again, they have convolutional blocks again, um,"},{"from":4125.08,"to":4127.57,"location":2,"content":"then lo and behold because I said that even the"},{"from":4127.57,"to":4130.06,"location":2,"content":"weird ideas are going to turn up, right up there,"},{"from":4130.06,"to":4135.32,"location":2,"content":"they're doing k max pooling and they're keeping the eight strongest values,"},{"from":4135.32,"to":4137.29,"location":2,"content":"um, in each channel."},{"from":4137.29,"to":4139.3,"location":2,"content":"Um, and so at that point,"},{"from":4139.3,"to":4145.19,"location":2,"content":"they've got something of size 512 by eight, um, so,"},{"from":4145.19,"to":4148.51,"location":2,"content":"sort of like eight of the eight character sequences"},{"from":4148.51,"to":4151.7,"location":2,"content":"have been deemed important to the classification and they're"},{"from":4151.7,"to":4155.45,"location":2,"content":"kept but they sort per channel and there are 512 of them"},{"from":4155.45,"to":4159.48,"location":2,"content":"you're then putting that through three fully connected layers."},{"from":4159.48,"to":4162.19,"location":2,"content":"So, typically vision systems at the top"},{"from":4162.19,"to":4165.35,"location":2,"content":"have a couple of fully connected layers at the end,"},{"from":4165.35,"to":4168.06,"location":2,"content":"um, and the very last one of those,"},{"from":4168.06,"to":4171.84,"location":2,"content":"is effectively sort of feeding into your Softmax."},{"from":4171.84,"to":4176.08,"location":2,"content":"So, it's size 2,048 times the number of"},{"from":4176.08,"to":4181.33,"location":2,"content":"classes which might just be positive negative two class unlike the topical classes."},{"from":4181.33,"to":4184,"location":2,"content":"Um, so, yeah, so it's essentially like"},{"from":4184,"to":4187.18,"location":2,"content":"a vision stack but they're 
going to use it for language."},{"from":4187.18,"to":4188.89,"location":2,"content":"Um, okay."},{"from":4188.89,"to":4192.34,"location":2,"content":"So, the bit that I hand quite explained was"},{"from":4192.34,"to":4197.52,"location":2,"content":"these convolutional blocks but it sort of looks like the picture that we had before or,"},{"from":4197.52,"to":4199.98,"location":2,"content":"um, departments slightly more complicated."},{"from":4199.98,"to":4202.42,"location":2,"content":"So you're doing, um,"},{"from":4202.42,"to":4205.84,"location":2,"content":"a convolutional block of size three"},{"from":4205.84,"to":4210.43,"location":2,"content":"convolutions some number of channels depending on where you are in the sequence."},{"from":4210.43,"to":4213.49,"location":2,"content":"You're then putting it through a batch norm as we just"},{"from":4213.49,"to":4217.07,"location":2,"content":"talked about putting it through a ReLu non-linearity,"},{"from":4217.07,"to":4221.32,"location":2,"content":"repeating all those three things again or remember there"},{"from":4221.32,"to":4225.55,"location":2,"content":"was this sort of skipped connection that went right around the outside of this block."},{"from":4225.55,"to":4231.19,"location":2,"content":"And so this is sort of a residual style block, um, so,"},{"from":4231.19,"to":4234.55,"location":2,"content":"that's the kind of complex architecture you can put together and"},{"from":4234.55,"to":4238.68,"location":2,"content":"try in your final projects if you dare in PyTorch."},{"from":4238.68,"to":4242.77,"location":2,"content":"Um, yeah, um, so,"},{"from":4242.77,"to":4246.09,"location":2,"content":"for experiments so- so one of"},{"from":4246.09,"to":4252.57,"location":2,"content":"the things that they were interested in and wanted to make a point of is well some"},{"from":4252.57,"to":4255.67,"location":2,"content":"of these traditional sentence and"},{"from":4255.67,"to":4258.97,"location":2,"content":"text classification datasets have been used in other papers"},{"from":4258.97,"to":4262.47,"location":2,"content":"like Yoon Kim's paper are effectively quite small."},{"from":4262.47,"to":4270.55,"location":2,"content":"So, something like that Rotten Tomatoes dataset is actually only 10,000 examples, 5,000,"},{"from":4270.55,"to":4273.55,"location":2,"content":"positive 5,000 negative and they sort of have"},{"from":4273.55,"to":4277.18,"location":2,"content":"the idea that just like ImageNet was needed for"},{"from":4277.18,"to":4280.44,"location":2,"content":"deep learning models to really show their worth and vision"},{"from":4280.44,"to":4284.15,"location":2,"content":"that probably does show the value of a huge model like that."},{"from":4284.15,"to":4288.07,"location":2,"content":"Um, you need to have really big datasets."},{"from":4288.07,"to":4289.85,"location":2,"content":"So, they get some much bigger,"},{"from":4289.85,"to":4292,"location":2,"content":"um, text classification datasets."},{"from":4292,"to":4296.06,"location":2,"content":"So, here's an Amazon review positive-negative dataset, um,"},{"from":4296.06,"to":4299.5,"location":2,"content":"with which they have sort of 3.6 million documents,"},{"from":4299.5,"to":4303.03,"location":2,"content":"um, Yelp reviews 650,000 documents."},{"from":4303.03,"to":4305.1,"location":2,"content":"So much bigger datasets,"},{"from":4305.1,"to":4308.23,"location":2,"content":"um, and here are their experiments."},{"from":4308.23,"to":4310.93,"location":2,"content":"Okay. 
So, the numbers at the top, uh,"},{"from":4310.93,"to":4315.94,"location":2,"content":"for the different datasets of the best previous result printed in the literature,"},{"from":4315.94,"to":4318.64,"location":2,"content":"and then if you read the, um,"},{"from":4318.64,"to":4323.2,"location":2,"content":"footnotes, um, there are a few things that they want to sort of star."},{"from":4323.2,"to":4327.04,"location":2,"content":"So, the ones that have a star next to them use"},{"from":4327.04,"to":4333.23,"location":2,"content":"an external thesaurus which they don't use. [NOISE]"},{"from":4333.23,"to":4335.64,"location":2,"content":"And the Yang method, um,"},{"from":4335.64,"to":4338.61,"location":2,"content":"use some special techniques as well that I cut off."},{"from":4338.61,"to":4341.58,"location":2,"content":"Um, and the other thing to mention is these numbers,"},{"from":4341.58,"to":4344.18,"location":2,"content":"they're error rates, so low is good."},{"from":4344.18,"to":4346.41,"location":2,"content":"Um, so the lower you get them, the better."},{"from":4346.41,"to":4350.94,"location":2,"content":"And so then these are all of their results."},{"from":4350.94,"to":4354.77,"location":2,"content":"Um, and so what can you get out of these results?"},{"from":4354.77,"to":4359.55,"location":2,"content":"Um, well, the first thing that you can notice is basically with these results,"},{"from":4359.55,"to":4362.1,"location":2,"content":"the deeper networks are working better, right?"},{"from":4362.1,"to":4364.85,"location":2,"content":"So, the one I showed you,"},{"from":4364.85,"to":4368.49,"location":2,"content":"uh, well, no, I think the one that I have the picture of this isn't the full thing."},{"from":4368.49,"to":4372.72,"location":2,"content":"Um, but they have ones with depth 9, 17,"},{"from":4372.72,"to":4376.68,"location":2,"content":"and 29 in terms of the number of convolutional layers,"},{"from":4376.68,"to":4381.15,"location":2,"content":"and the deepest one is always the one that's working best."},{"from":4381.15,"to":4384.26,"location":2,"content":"So, that's a proof of deep networks."},{"from":4384.26,"to":4387.57,"location":2,"content":"Um, that didn't keep on working, um,"},{"from":4387.57,"to":4390.69,"location":2,"content":"so an interesting footnote here is,"},{"from":4390.69,"to":4391.94,"location":2,"content":"um, I guess they thought,"},{"from":4391.94,"to":4393.23,"location":2,"content":"oh, this is cool."},{"from":4393.23,"to":4399.31,"location":2,"content":"Why don't we try an even deeper one that has 47 layers and see how well that works?"},{"from":4399.31,"to":4403.64,"location":2,"content":"And, I mean, the results were sort of interesting for that."},{"from":4403.64,"to":4406.13,"location":2,"content":"So, for the 47 layer one,"},{"from":4406.13,"to":4408.85,"location":2,"content":"it worked a fraction worse than this one."},{"from":4408.85,"to":4412.05,"location":2,"content":"Um, so in one sense you,"},{"from":4412.05,"to":4417.9,"location":2,"content":"they showed the result of sort of residual layers work really well."},{"from":4417.9,"to":4420.7,"location":2,"content":"So, they did an experiment of let's try to train"},{"from":4420.7,"to":4425.32,"location":2,"content":"a 47-layer network without using residual connections."},{"from":4425.32,"to":4427.45,"location":2,"content":"And, well, it was a lot worse."},{"from":4427.45,"to":4429.87,"location":2,"content":"The numbers went down about two 
percent."},{"from":4429.87,"to":4432.82,"location":2,"content":"And they trained one with residual connections,"},{"from":4432.82,"to":4438.87,"location":2,"content":"and the fact of the matter is the numbers were just a teeny weeny bit worse."},{"from":4438.87,"to":4442.48,"location":2,"content":"They were sort of 0.1 of a percent worse."},{"from":4442.48,"to":4445.52,"location":2,"content":"So, you know, they sort of work just about as well."},{"from":4445.52,"to":4450.3,"location":2,"content":"But, nevertheless, that's kind of different to the situation in vision,"},{"from":4450.3,"to":4455.15,"location":2,"content":"because for the sort of residual networks that people are using in vision,"},{"from":4455.15,"to":4459.99,"location":2,"content":"this is sort of like the very minimum depth that people use."},{"from":4459.99,"to":4463.48,"location":2,"content":"So, if you're using residual networks in vision typically,"},{"from":4463.48,"to":4465.91,"location":2,"content":"you might use ResNet-34."},{"from":4465.91,"to":4469.22,"location":2,"content":"If you're really short on memory and want to have a small model,"},{"from":4469.22,"to":4472.98,"location":2,"content":"but you just know you'd get better results if you used ResNet-50,"},{"from":4472.98,"to":4476.73,"location":2,"content":"and in fact, if you used ResNet-101 it'd work even better again."},{"from":4476.73,"to":4479.63,"location":2,"content":"Um, and so that somehow, you know,"},{"from":4479.63,"to":4481.41,"location":2,"content":"whether it's got to do with the different nature of"},{"from":4481.41,"to":4484.35,"location":2,"content":"language or the amounts of data or something,"},{"from":4484.35,"to":4487.91,"location":2,"content":"you haven't yet gone to the same depth that you can in vision."},{"from":4487.91,"to":4490.62,"location":2,"content":"Um, but other results, um,"},{"from":4490.62,"to":4494.19,"location":2,"content":"so the other thing they're comparing here is that they're comparing"},{"from":4494.19,"to":4499.24,"location":2,"content":"three different ways of sort of stringing things down."},{"from":4499.24,"to":4502.95,"location":2,"content":"So, you could be using, um,"},{"from":4502.95,"to":4506.72,"location":2,"content":"the stride in the Convolution,"},{"from":4506.72,"to":4509.74,"location":2,"content":"you can be using local MaxPooling,"},{"from":4509.74,"to":4512.81,"location":2,"content":"and you could be using KMaxPooling."},{"from":4512.81,"to":4514.35,"location":2,"content":"Um, and they're general,"},{"from":4514.35,"to":4516.94,"location":2,"content":"they're slightly different numbers as you can see."},{"from":4516.94,"to":4519.99,"location":2,"content":"Each one, um, wins and one, uh,"},{"from":4519.99,"to":4523.89,"location":2,"content":"at least one of these datasets or actually at least two of these datasets."},{"from":4523.89,"to":4527.43,"location":2,"content":"But not only does MaxPooling win for four of the datasets,"},{"from":4527.43,"to":4529.89,"location":2,"content":"if you sort of look at the numbers,"},{"from":4529.89,"to":4532.24,"location":2,"content":"MaxPooling always does pretty well."},{"from":4532.24,"to":4534.49,"location":2,"content":"Because MaxPooling does pretty well here,"},{"from":4534.49,"to":4538.14,"location":2,"content":"whereas the convolutional stride works badly,"},{"from":4538.14,"to":4541.6,"location":2,"content":"and over here MaxPooling works pretty well,"},{"from":4541.6,"to":4545.69,"location":2,"content":"and the, um, KMaxPooling works kind of 
badly."},{"from":4545.69,"to":4550.89,"location":2,"content":"So, their recommendation at the end of the day is you should always use, um,"},{"from":4550.89,"to":4553.68,"location":2,"content":"just MaxPooling of a simple kind,"},{"from":4553.68,"to":4555.39,"location":2,"content":"that that seems to be fine,"},{"from":4555.39,"to":4557.34,"location":2,"content":"um, and nothing else."},{"from":4557.34,"to":4561.31,"location":2,"content":"Um, it's actually worth the trouble of thinking about doing."},{"from":4561.31,"to":4570.54,"location":2,"content":"Okay. Um, was there any other conclusions I wanted to say?"},{"from":4570.54,"to":4573.28,"location":2,"content":"Okay. Um, I think that was most of that."},{"from":4573.28,"to":4577.44,"location":2,"content":"I guess their overall message is you can build super good, um,"},{"from":4577.44,"to":4580.47,"location":2,"content":"text classification systems using ConvNets,"},{"from":4580.47,"to":4582.63,"location":2,"content":"and you should take away that message."},{"from":4582.63,"to":4586.17,"location":2,"content":"Okay. So, there are just a couple of minutes left."},{"from":4586.17,"to":4590.15,"location":2,"content":"There was sort of one other thing that I wanted to mention,"},{"from":4590.15,"to":4593.51,"location":2,"content":"but I think I'll just sort of mention it very quickly,"},{"from":4593.51,"to":4596.51,"location":2,"content":"and you can look in more detail if you want to."},{"from":4596.51,"to":4598.78,"location":2,"content":"So, we sort of have this situation"},{"from":4598.78,"to":4604.06,"location":2,"content":"that re- recurrent neural networks are a very standard building block for NLP,"},{"from":4604.06,"to":4609.4,"location":2,"content":"but they have this big problem that they just don't parallelize well."},{"from":4609.4,"to":4613.8,"location":2,"content":"And the way we get fast computation deep learning is we find"},{"from":4613.8,"to":4618.18,"location":2,"content":"things that parallelize well so that we can stick them on GPUs."},{"from":4618.18,"to":4625.43,"location":2,"content":"GPUs only are fast if they can be simultaneously doing the same computation many times,"},{"from":4625.43,"to":4628.44,"location":2,"content":"which is sort of trivial for a convolutional neural network,"},{"from":4628.44,"to":4633.23,"location":2,"content":"because precisely, you're doing the same comput- computation every position."},{"from":4633.23,"to":4637.74,"location":2,"content":"But that's not what's happening in the recurrent neural network because you have to"},{"from":4637.74,"to":4639.99,"location":2,"content":"work out the value of position one"},{"from":4639.99,"to":4642.93,"location":2,"content":"before you can start to calculate the value of position two,"},{"from":4642.93,"to":4646.32,"location":2,"content":"which is used for the value of position three."},{"from":4646.32,"to":4648.98,"location":2,"content":"Um, so this was a piece of work, um,"},{"from":4648.98,"to":4653.03,"location":2,"content":"done by sometimes CS224N co-instructor"},{"from":4653.03,"to":4657.62,"location":2,"content":"Richard Socher and some of his people at Salesforce Research"},{"from":4657.62,"to":4660.11,"location":2,"content":"on saying, how can we get the best of both worlds?"},{"from":4660.11,"to":4663.48,"location":2,"content":"How can we get something that's kind of like a"},{"from":4663.48,"to":4669.65,"location":2,"content":"recurrent neural network, but doesn't have the bad computational 
properties?"},{"from":4669.65,"to":4673.16,"location":2,"content":"And so the idea that they had was, well,"},{"from":4673.16,"to":4680.55,"location":2,"content":"rather than doing the standard LSTM style thing where you're calculating, you know,"},{"from":4680.55,"to":4687.09,"location":2,"content":"an updated candidate value and your gates in terms of the preceding time slice,"},{"from":4687.09,"to":4693.51,"location":2,"content":"maybe what instead we could do is we could stick a relation between time"},{"from":4693.51,"to":4700.15,"location":2,"content":"minus 1 and time into the MaxPooling layer of a convolutional neural network."},{"from":4700.15,"to":4706.26,"location":2,"content":"So, we're sort of calculating a candidate and a forget gate and an output gate."},{"from":4706.26,"to":4710.7,"location":2,"content":"But these, these candidate and the, um,"},{"from":4710.7,"to":4718.5,"location":2,"content":"gated values are done inside the pooling layer via compute,"},{"from":4718.5,"to":4724.35,"location":2,"content":"um, via, um, uh, uh, convolutional operation."},{"from":4724.35,"to":4726.15,"location":2,"content":"So, it sort of get,"},{"from":4726.15,"to":4727.65,"location":2,"content":"it doesn't, it, you know,"},{"from":4727.65,"to":4733.06,"location":2,"content":"if there's no free lunch you can't get true recurrence and not pay the penalty."},{"from":4733.06,"to":4736.76,"location":2,"content":"This is giving you sort of a pseudo-recurrence because you are"},{"from":4736.76,"to":4742.24,"location":2,"content":"modeling an association between adjacent elements at each time slice,"},{"from":4742.24,"to":4746.31,"location":2,"content":"but it's sort of just worked out locally rather than being carried forward,"},{"from":4746.31,"to":4748.2,"location":2,"content":"um, in one layer."},{"from":4748.2,"to":4750.24,"location":2,"content":"But sort of what they found is,"},{"from":4750.24,"to":4754.32,"location":2,"content":"if you made your networks deeper using this idea,"},{"from":4754.32,"to":4755.97,"location":2,"content":"well then, you sort of start to, again,"},{"from":4755.97,"to":4758.01,"location":2,"content":"expand your window of influence."},{"from":4758.01,"to":4762.09,"location":2,"content":"So, you got a certain amount of information being carried forward."},{"from":4762.09,"to":4765.33,"location":2,"content":"Um, so, their conclusions was that you could sort of"},{"from":4765.33,"to":4768.87,"location":2,"content":"build these kind of models and get them to work,"},{"from":4768.87,"to":4772.05,"location":2,"content":"you know, not necessarily better actually on this slide,"},{"from":4772.05,"to":4773.63,"location":2,"content":"um, it says often better."},{"from":4773.63,"to":4777.54,"location":2,"content":"Um, you can get them to work kind of as well as an LSTM does,"},{"from":4777.54,"to":4781.65,"location":2,"content":"but you could get them to work much faster because you're avoiding"},{"from":4781.65,"to":4786.56,"location":2,"content":"the standard recurrent operation and keeping it as something that you can parallelize,"},{"from":4786.56,"to":4789.94,"location":2,"content":"um, in the MaxPooling operations."},{"from":4789.94,"to":4793.03,"location":2,"content":"Um, yes, so that was a kind of"},{"from":4793.03,"to":4797.25,"location":2,"content":"an interesting alternative way of sort of trying to get some of the benefits."},{"from":4797.25,"to":4801.82,"location":2,"content":"I think long-term this isn't the idea that's going to end up winning 
out."},{"from":4801.82,"to":4805.74,"location":2,"content":"And so next week we're going to talk about transformer networks,"},{"from":4805.74,"to":4809.65,"location":2,"content":"which actually seems to be the idea that's gained the most steam at the moment."},{"from":4809.65,"to":4812.83,"location":2,"content":"Okay. I'll stop there for today. Thanks a lot."}]} \ No newline at end of file diff --git a/bcc-en/12.bcc b/bcc-en/12.bcc new file mode 100644 index 0000000000000000000000000000000000000000..604e78c15a7d2882ab48d511f77a94958b5057ac --- /dev/null +++ b/bcc-en/12.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[]} \ No newline at end of file diff --git a/bcc-en/13.bcc b/bcc-en/13.bcc new file mode 100644 index 0000000000000000000000000000000000000000..ccec5c70bd9c3993f01c3f10b5458c608506e64b --- /dev/null +++ b/bcc-en/13.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":5.66,"to":8.14,"location":2,"content":"Okay hi everyone."},{"from":8.14,"to":11.21,"location":2,"content":"Let's get started again."},{"from":11.21,"to":17.01,"location":2,"content":"Um. Okay. So, first of all for a couple of announcements."},{"from":17.01,"to":20.04,"location":2,"content":"Um, first of all thanks to everyone, um,"},{"from":20.04,"to":23.95,"location":2,"content":"who filled in our mid-quarter survey we've actually gotten,"},{"from":23.95,"to":27.32,"location":2,"content":"um, great participation in that."},{"from":27.32,"to":29.86,"location":2,"content":"Here are my two little Pac-Man figures."},{"from":29.86,"to":31.64,"location":2,"content":"So, the Pac-Man figures thinks,"},{"from":31.64,"to":34.58,"location":2,"content":"means that almost everyone thinks the lectures are at"},{"from":34.58,"to":38.39,"location":2,"content":"the right pace and those that don't are pretty much evenly divided."},{"from":38.39,"to":42.68,"location":2,"content":"Um, if we go for how challenging was Assignment three,"},{"from":42.68,"to":46.19,"location":2,"content":"slightly more people thought it was too easy than too hard."},{"from":46.19,"to":48.68,"location":2,"content":"So, I guess we're setting about rectifying that with"},{"from":48.68,"to":53.46,"location":2,"content":"assignments four and five, um, [NOISE]."},{"from":53.46,"to":56.03,"location":2,"content":"So, though there are a whole bunch of other questions and we've"},{"from":56.03,"to":59.18,"location":2,"content":"been trying to absorb all the feedback."},{"from":59.18,"to":63.98,"location":2,"content":"I mean one of the questions was what people wanted most from the remaining lectures."},{"from":63.98,"to":69.26,"location":2,"content":"I guess the good news here is really we're very good at predicting, um,"},{"from":69.26,"to":71.78,"location":2,"content":"what people wanted, that or else everybody"},{"from":71.78,"to":74.66,"location":2,"content":"just looked ahead in the syllabus and wrote down what it said was"},{"from":74.66,"to":79.16,"location":2,"content":"ahead in the syllabus but I guess the most popular four answers to"},{"from":79.16,"to":83.81,"location":2,"content":"topics that they wanted in the remaining lectures were Transformers and BERT,"},{"from":83.81,"to":85.86,"location":2,"content":"both of which are gonna be covered this week."},{"from":85.86,"to":89.78,"location":2,"content":"Uh, question-answering which we talked about last week, 
um,"},{"from":89.78,"to":93.23,"location":2,"content":"and then text generation and summarization"},{"from":93.23,"to":98.28,"location":2,"content":"and you guys get Abby back next week to talk about that."},{"from":98.28,"to":102.38,"location":2,"content":"Um, there are also a lot of people also answered this question"},{"from":102.38,"to":106.4,"location":2,"content":"a different way as to what kind of style of stuff,"},{"from":106.4,"to":110.96,"location":2,"content":"um, some people emphasized new research and the latest updates from the field."},{"from":110.96,"to":113.3,"location":2,"content":"I guess we'll get some of that today as well,"},{"from":113.3,"to":115.23,"location":2,"content":"some people are more interested in"},{"from":115.23,"to":120,"location":2,"content":"successful applications in industry or trying to do a bit of that,"},{"from":120,"to":122.84,"location":2,"content":"um, cool new neural architectures."},{"from":122.84,"to":125.66,"location":2,"content":"Um, the bottom answer wasn't the most popular one,"},{"from":125.66,"to":128.39,"location":2,"content":"I'll admit but at least a few people, um,"},{"from":128.39,"to":131.81,"location":2,"content":"wish that we were teaching more linguistic stuff."},{"from":131.81,"to":134.3,"location":2,"content":"Um, I mean that is something that I actually feel"},{"from":134.3,"to":138.58,"location":2,"content":"a bit awkward about the way things were merged with CS224N,"},{"from":138.58,"to":140.1,"location":2,"content":"with this deep learning,"},{"from":140.1,"to":142.37,"location":2,"content":"I mean the truth of the matter is that sort of seems"},{"from":142.37,"to":144.85,"location":2,"content":"like in the early part of the course,"},{"from":144.85,"to":147.21,"location":2,"content":"there's so much to cover with,"},{"from":147.21,"to":149.66,"location":2,"content":"um, neural networks, backpropagation,"},{"from":149.66,"to":154.25,"location":2,"content":"different, um, neural net architectures and so on that the reality is that we"},{"from":154.25,"to":159.89,"location":2,"content":"teach rather less linguistic stuff than we used to in the class."},{"from":159.89,"to":163.16,"location":2,"content":"I mean, for the last four weeks of the class we really do try and"},{"from":163.16,"to":166.73,"location":2,"content":"cover some more linguistic stuff topics."},{"from":166.73,"to":169.25,"location":2,"content":"Um, so look forward to that."},{"from":169.25,"to":171.25,"location":2,"content":"Um, announcements."},{"from":171.25,"to":174.37,"location":2,"content":"Okay. 
So we've made a couple of deadline changes."},{"from":174.37,"to":177.41,"location":2,"content":"Um, firstly, a number of people have"},{"from":177.41,"to":180.92,"location":2,"content":"mentioned that they think assignment five is a bit tough."},{"from":180.92,"to":184.16,"location":2,"content":"And so, we're giving people one extra day,"},{"from":184.16,"to":186.1,"location":2,"content":"um, to do assignment five."},{"from":186.1,"to":189.83,"location":2,"content":"Um, I'm realizing in one sense that one extra day is not a ton"},{"from":189.83,"to":193.76,"location":2,"content":"but you know there's sort of this complex balance here because on the other hand,"},{"from":193.76,"to":198.69,"location":2,"content":"we don't really want to undermine time that people have available for final projects."},{"from":198.69,"to":203.01,"location":2,"content":"And if you're one of the people who hasn't yet started assignment five,"},{"from":203.01,"to":206.07,"location":2,"content":"um, we do really encourage you to get underway on it."},{"from":206.07,"to":209.96,"location":2,"content":"Um, yeah, in the reverse direction"},{"from":209.96,"to":214.16,"location":2,"content":"we decided that the project milestone was really too late."},{"from":214.16,"to":218.06,"location":2,"content":"If we are going to be able to give you feedback on it that you could usefully make use"},{"from":218.06,"to":222.08,"location":2,"content":"of, so we're moving the project milestone date two days earlier."},{"from":222.08,"to":225.94,"location":2,"content":"And so, we've also gotten everyone's project proposals and our"},{"from":225.94,"to":230.27,"location":2,"content":"planned hope is to get them back to everybody on Friday."},{"from":230.27,"to":232.31,"location":2,"content":"Yes, so, a lot of things moving."},{"from":232.31,"to":236.97,"location":2,"content":"Um, and finally on other announcements I guess, um, on"},{"from":236.97,"to":241.64,"location":2,"content":"this Thursday is our first invited speaker, um, and so,"},{"from":241.64,"to":245.16,"location":2,"content":"if you're in person student you're meant to be here,"},{"from":245.16,"to":248.93,"location":2,"content":"um, and if you're not able to be here,"},{"from":248.93,"to":252.35,"location":2,"content":"you should know about our reaction paragraph policy and"},{"from":252.35,"to":256.46,"location":2,"content":"I actually stuck up on the Piazza pinned posts about, um,"},{"from":256.46,"to":261.38,"location":2,"content":"reaction pieces and attendance, an example of a reaction piece, um,"},{"from":261.38,"to":267.32,"location":2,"content":"from a past class to make it a little bit more concrete what's expected there."},{"from":267.32,"to":271.85,"location":2,"content":"But, you know, the idea is what we're hoping for something that isn't a ton of work."},{"from":271.85,"to":275.86,"location":2,"content":"You can just write 100, 150 words, a few sentences,"},{"from":275.86,"to":280.04,"location":2,"content":"but wanting you to pick out a specific thing that was"},{"from":280.04,"to":282.41,"location":2,"content":"interesting and write a couple of sentences"},{"from":282.41,"to":285.14,"location":2,"content":"about what it was and what your thoughts are about it."},{"from":285.14,"to":290.27,"location":2,"content":"I, not just some very generic statement of this was a lecture about transformers."},{"from":290.27,"to":292.81,"location":2,"content":"He talked about transformers and it was interesting,"},{"from":292.81,"to":298.63,"location":2,"content":"that 
is not what we want for the reaction piece. Um, okay."},{"from":298.63,"to":301.14,"location":2,"content":"So, here's the plan for today."},{"from":301.14,"to":304.95,"location":2,"content":"So, for today's, what I want to talk about is,"},{"from":304.95,"to":309.79,"location":2,"content":"um, the exciting recent work about contextual word representations."},{"from":309.79,"to":315.62,"location":2,"content":"I mean I, I was thinking of what I was gonna say I was wanting to say, oh, this is"},{"from":315.62,"to":318.77,"location":2,"content":"the most exciting thing in deep learning for NLP in"},{"from":318.77,"to":322.04,"location":2,"content":"the last five years then something's just completely wrong,"},{"from":322.04,"to":327.08,"location":2,"content":"because really this is the most exciting thing in deep learning that happened in 2018."},{"from":327.08,"to":329.96,"location":2,"content":"I mean, I guess things move very quickly, um,"},{"from":329.96,"to":333.53,"location":2,"content":"in deep learning at the moment and it's sort of I don't think it's"},{"from":333.53,"to":338.13,"location":2,"content":"really fair to say that you know it's got 5 years of life."},{"from":338.13,"to":340.49,"location":2,"content":"But there's a very exciting thing that happened last year,"},{"from":340.49,"to":342.65,"location":2,"content":"and we'll talk about that."},{"from":342.65,"to":345.72,"location":2,"content":"Okay. So, we'll talk about early stuff,"},{"from":345.72,"to":347.45,"location":2,"content":"the ELMo, ULMfit,"},{"from":347.45,"to":350.63,"location":2,"content":"transformer architectures briefly and then go on to"},{"from":350.63,"to":355.67,"location":2,"content":"talk about the BERT model that's being quite prominent lately."},{"from":355.67,"to":358.21,"location":2,"content":"So, let's just recap,"},{"from":358.21,"to":362.61,"location":2,"content":"let's just go backwards a bit first to think about, um,"},{"from":362.61,"to":368.07,"location":2,"content":"where we've been and where we are now and why we might want something more."},{"from":368.07,"to":369.69,"location":2,"content":"So, up until now,"},{"from":369.69,"to":371.06,"location":2,"content":"we've sort of just had,"},{"from":371.06,"to":376.25,"location":2,"content":"one representation for words which is what we learned at the beginning of class,"},{"from":376.25,"to":382.08,"location":2,"content":"there was a word, you trained a word vector for it and that's what you used in your model."},{"from":382.08,"to":384.77,"location":2,"content":"Um, and you could do that, with algorithms like Word2vec,"},{"from":384.77,"to":388.07,"location":2,"content":"GloVe, or fastText that I mentioned last week."},{"from":388.07,"to":394.46,"location":2,"content":"Um, so some on this sort of progression of ideas in deep learning,"},{"from":394.46,"to":399.05,"location":2,"content":"when deep learning for NLP or the general"},{"from":399.05,"to":402.06,"location":2,"content":"just the resurgence of neural networks for NLP"},{"from":402.06,"to":405.62,"location":2,"content":"came about sort of at the beginning of this decade."},{"from":405.62,"to":410.64,"location":2,"content":"Um, these pre-trained word vectors."},{"from":410.64,"to":414.79,"location":2,"content":"So, pre-trained unsupervised over a large amount of text."},{"from":414.79,"to":418.27,"location":2,"content":"They were completely seen as the secret sauce,"},{"from":418.27,"to":420.81,"location":2,"content":"and they were the thing that 
transformed"},{"from":420.81,"to":424.8,"location":2,"content":"neural networks from NLP to something that didn't really work,"},{"from":424.8,"to":426.65,"location":2,"content":"to something that worked great."},{"from":426.65,"to":429.91,"location":2,"content":"Um, so, this is actually an old slide of mine."},{"from":429.91,"to":432.67,"location":2,"content":"So, this is a slide I guess I first made for"},{"from":432.67,"to":438.49,"location":2,"content":"2012 ACL tutorial and then sort of used in lectures."},{"from":438.49,"to":443,"location":2,"content":"Sort of in 2013, 2014. Um-."},{"from":443,"to":446.46,"location":2,"content":"And so this was sort of the picture in those years."},{"from":446.46,"to":448,"location":2,"content":"So this was looking at two tasks,"},{"from":448,"to":453.12,"location":2,"content":"part of speech tagging and named entity recognition which I'll use quite a bit today."},{"from":453.12,"to":458.27,"location":2,"content":"And, you know, the top line was showing a state of the art which was"},{"from":458.27,"to":462.78,"location":2,"content":"a traditional categorical feature based classifier of the kind"},{"from":462.78,"to":467.44,"location":2,"content":"that dominated NLP in the 2000s decade, in their performance."},{"from":467.44,"to":473.21,"location":2,"content":"And what then the next line showed is that if you took the same data set"},{"from":473.21,"to":479.51,"location":2,"content":"and you trained a supervised neural network on it and said how good is your performance?"},{"from":479.51,"to":481.85,"location":2,"content":"Um, the story was, it wasn't great."},{"from":481.85,"to":486.72,"location":2,"content":"Um, part-of-speech tagging has very high numbers always for various reasons."},{"from":486.72,"to":491.39,"location":2,"content":"So perhaps the more indicative one to look at is these named entity recognition numbers."},{"from":491.39,"to":494.53,"location":2,"content":"So, you know, this was sort of neural net sucked, right?"},{"from":494.53,"to":498.31,"location":2,"content":"The reason why last decade everybody used, um,"},{"from":498.31,"to":500.79,"location":2,"content":"categorical feature based, you know,"},{"from":500.79,"to":503.34,"location":2,"content":"CRF, SVM kind of classifiers."},{"from":503.34,"to":507,"location":2,"content":"Well, if you look, it worked eight percent better than a neural network."},{"from":507,"to":508.33,"location":2,"content":"Why wouldn't anybody?"},{"from":508.33,"to":512.85,"location":2,"content":"But then what had happened was people had come up with this idea that we could"},{"from":512.85,"to":517.51,"location":2,"content":"do unsupervised pre-training of word representations,"},{"from":517.51,"to":520.77,"location":2,"content":"um, to come up with word vectors for words."},{"from":520.77,"to":522.06,"location":2,"content":"And, you know, in those days,"},{"from":522.06,"to":525.62,"location":2,"content":"this was very hard to do the alg- both because of"},{"from":525.62,"to":529.5,"location":2,"content":"the kind of algorithms and the kind of machines that were available, right?"},{"from":529.5,"to":531.99,"location":2,"content":"So Collobert and Weston, 2011,"},{"from":531.99,"to":536.98,"location":2,"content":"spent seven weeks training their unsupervised word representations."},{"from":536.98,"to":538.11,"location":2,"content":"And at the end of the day,"},{"from":538.11,"to":541.79,"location":2,"content":"there are only 100 dimensional, um, word 
representations."},{"from":541.79,"to":543.87,"location":2,"content":"But this was the miracle breakthrough, right?"},{"from":543.87,"to":548.97,"location":2,"content":"You've put in this miracle breakthrough of unsupervised word representations."},{"from":548.97,"to":552.22,"location":2,"content":"And now, the neural net is getting to 88.87."},{"from":552.22,"to":555.38,"location":2,"content":"So it's almost as good as the feature-based classifier,"},{"from":555.38,"to":557.51,"location":2,"content":"and then like any good engineers,"},{"from":557.51,"to":559.5,"location":2,"content":"they did some hacking with some extra features,"},{"from":559.5,"to":561.17,"location":2,"content":"because they had some stuff like that."},{"from":561.17,"to":566.72,"location":2,"content":"And they got a system that was then slightly better than the feature based system."},{"from":566.72,"to":569.52,"location":2,"content":"Okay. So that was sort of our picture that,"},{"from":569.52,"to":573.18,"location":2,"content":"um, having these pre-trained,"},{"from":573.18,"to":576.56,"location":2,"content":"unsuper- and unsupervised manner of word representations,"},{"from":576.56,"to":579,"location":2,"content":"that was sort of the big breakthrough and"},{"from":579,"to":582.28,"location":2,"content":"the secret sauce that gave all the oomph that made,"},{"from":582.28,"to":584.76,"location":2,"content":"um, neural networks competitive."},{"from":584.76,"to":586.2,"location":2,"content":"Um, but, you know,"},{"from":586.2,"to":591.57,"location":2,"content":"it's a sort of a funny thing happened which was after people had sort of had"},{"from":591.57,"to":594.43,"location":2,"content":"some of these initial breakthroughs which were"},{"from":594.43,"to":597.51,"location":2,"content":"all about unsupervised methods for pre-training,"},{"from":597.51,"to":599.03,"location":2,"content":"it was the same in vision."},{"from":599.03,"to":600.63,"location":2,"content":"This was the era in vision,"},{"from":600.63,"to":603.38,"location":2,"content":"where you were building restricted Boltzmann machines and doing"},{"from":603.38,"to":607.41,"location":2,"content":"complicated unsupervised pre-training techniques on them as well."},{"from":607.41,"to":613.42,"location":2,"content":"Some- somehow, after people had kind of discovered that and started to get good on it,"},{"from":613.42,"to":616.26,"location":2,"content":"people sort of started to discover, well,"},{"from":616.26,"to":620.28,"location":2,"content":"actually we have some new technologies for non-linearities,"},{"from":620.28,"to":622.66,"location":2,"content":"regularization, and things like that."},{"from":622.66,"to":625.83,"location":2,"content":"And if we keep using those same technologies,"},{"from":625.83,"to":629.77,"location":2,"content":"we can just go back to good old supervised learning."},{"from":629.77,"to":634.51,"location":2,"content":"And shockingly, it works way better now inside neural networks."},{"from":634.51,"to":637.44,"location":2,"content":"And so if you sort of go ahead to what I will call,"},{"from":637.44,"to":643.62,"location":2,"content":"sort of 2014 to 2018 picture,"},{"from":643.62,"to":646.45,"location":2,"content":"the, the picture is actually very different."},{"from":646.45,"to":648.27,"location":2,"content":"So the picture is, so this,"},{"from":648.27,"to":651.29,"location":2,"content":"the results I'm actually gonna show you this is from the Chen and Manning,"},{"from":651.29,"to":654.55,"location":2,"content":"um, 
neural dependency parser that we talked about weeks ago."},{"from":654.55,"to":656.7,"location":2,"content":"The picture there was, um,"},{"from":656.7,"to":659.07,"location":2,"content":"and you could- despite the fact that"},{"from":659.07,"to":663,"location":2,"content":"this dependency parser is being trained on a pretty small corpus,"},{"from":663,"to":665.64,"location":2,"content":"a million words of supervised data,"},{"from":665.64,"to":669.25,"location":2,"content":"you can just initialize it with random word vectors,"},{"from":669.25,"to":671.83,"location":2,"content":"um, and train a dependency parser."},{"from":671.83,"to":673.74,"location":2,"content":"And to a first approximation,"},{"from":673.74,"to":675.06,"location":2,"content":"it just works fine."},{"from":675.06,"to":677.93,"location":2,"content":"You get, get sort of a 90 percent accuracy,"},{"from":677.93,"to":680.42,"location":2,"content":"E- um, English dependency parser."},{"from":680.42,"to":683.74,"location":2,"content":"Now, it is the case that instead,"},{"from":683.74,"to":687.48,"location":2,"content":"you could use pre-trained word embeddings and you do a bit better."},{"from":687.48,"to":689.23,"location":2,"content":"You do about one percent better."},{"from":689.23,"to":691.55,"location":2,"content":"And so this was sort of the,"},{"from":691.55,"to":695.73,"location":2,"content":"the new world order which was yeah, um,"},{"from":695.73,"to":700.86,"location":2,"content":"these pre-trained unsupervised word embeddings are useful because you can"},{"from":700.86,"to":706.84,"location":2,"content":"train them from a lot more data and they can know about a much larger vocabulary."},{"from":706.84,"to":708.02,"location":2,"content":"That means they are useful."},{"from":708.02,"to":711.8,"location":2,"content":"They help with rare words and things like that and they give you a percent,"},{"from":711.8,"to":715.16,"location":2,"content":"but they're definitely no longer the sort of night and day,"},{"from":715.16,"to":719.91,"location":2,"content":"uh, thing to make neural networks work that we used to believe."},{"from":719.91,"to":724.47,"location":2,"content":"I'm, I'm just gonna deviate here to,"},{"from":724.47,"to":728.46,"location":2,"content":"from the main narrative to just sort of say, um,"},{"from":728.46,"to":733.49,"location":2,"content":"one more tip for dealing with unknown words with word vectors,"},{"from":733.49,"to":736.29,"location":2,"content":"um, just in case it's useful for some people,"},{"from":736.29,"to":739.35,"location":2,"content":"building question answering systems, right?"},{"from":739.35,"to":744.45,"location":2,"content":"So, um, so for sort of word vectors on unknown words, you know,"},{"from":744.45,"to":749.7,"location":2,"content":"the commonest thing historically is you've got your supervised training data,"},{"from":749.7,"to":752.79,"location":2,"content":"you define a vocab which might be words that occur"},{"from":752.79,"to":756.25,"location":2,"content":"five times or more in your supervised training data."},{"from":756.25,"to":759.04,"location":2,"content":"And you treat everything else as an UNK."},{"from":759.04,"to":762.09,"location":2,"content":"And so you also train one vector per UNK."},{"from":762.09,"to":766.14,"location":2,"content":"Um, but that has some problems which you have no way to"},{"from":766.14,"to":771.25,"location":2,"content":"distinguish different UNK words either for identity or 
meaning."},{"from":771.25,"to":774.75,"location":2,"content":"And that tends to be problematic for question answering systems."},{"from":774.75,"to":778.14,"location":2,"content":"And so one way to fix that is what we talked about last week,"},{"from":778.14,"to":780.63,"location":2,"content":"you just say, \"Oh, words are made out of characters."},{"from":780.63,"to":785.65,"location":2,"content":"I can use character representations to learn word vectors for other words.\""},{"from":785.65,"to":786.96,"location":2,"content":"And you can certainly do that."},{"from":786.96,"to":788.23,"location":2,"content":"You might wanna try that."},{"from":788.23,"to":790.21,"location":2,"content":"That adds some complexity."},{"from":790.21,"to":794.38,"location":2,"content":"Um, but especially for things like question answering systems,"},{"from":794.38,"to":796.38,"location":2,"content":"there are a couple of other things that you can do"},{"from":796.38,"to":798.72,"location":2,"content":"that work considerably better and they've been"},{"from":798.72,"to":803.37,"location":2,"content":"explored in this paper by Dhingra et al., um, from 2017."},{"from":803.37,"to":806.7,"location":2,"content":"Um, the first one is to say, well, um,"},{"from":806.7,"to":814.25,"location":2,"content":"when you at test-time encounter new words, probably your unsupervised word,"},{"from":814.25,"to":819.34,"location":2,"content":"pre-trained word embeddings have a much bigger vocabulary than your actual system does."},{"from":819.34,"to":822.09,"location":2,"content":"So anytime you come across a word that isn't in"},{"from":822.09,"to":824.96,"location":2,"content":"your vocab but is in the pre-trained word embeddings,"},{"from":824.96,"to":828.99,"location":2,"content":"just use, get the word vector of that word and start using it."},{"from":828.99,"to":831.75,"location":2,"content":"That'll be a much more useful thing to use."},{"from":831.75,"to":833.85,"location":2,"content":"And then there's a second possible tip that if you"},{"from":833.85,"to":836.3,"location":2,"content":"see something that's still an unknown word,"},{"from":836.3,"to":837.92,"location":2,"content":"rather than treating it as UNK,"},{"from":837.92,"to":840.03,"location":2,"content":"you just assign it on the spot,"},{"from":840.03,"to":841.75,"location":2,"content":"a random word vector."},{"from":841.75,"to":846.81,"location":2,"content":"And so this has the effect that each word does get a unique identity."},{"from":846.81,"to":849.36,"location":2,"content":"Which means if you see the same word in the question,"},{"from":849.36,"to":851.07,"location":2,"content":"and a potential answer,"},{"from":851.07,"to":854.94,"location":2,"content":"they will match together beautifully in an accurate way which you're"},{"from":854.94,"to":859.89,"location":2,"content":"not getting with just UNK matching and those can be kind of useful ideas to try."},{"from":859.89,"to":865.27,"location":2,"content":"Okay, end digression. 
Okay, so up until now,"},{"from":865.27,"to":868.23,"location":2,"content":"we just sort of had this representation of words,"},{"from":868.23,"to":871.6,"location":2,"content":"we ran Word2vec and we got a word vector,"},{"from":871.6,"to":873.73,"location":2,"content":"um, for each word."},{"from":873.73,"to":877.57,"location":2,"content":"Um, so, um, that, that was useful."},{"from":877.57,"to":879.11,"location":2,"content":"It's worked pretty well."},{"from":879.11,"to":881.54,"location":2,"content":"Um, but it had, um,"},{"from":881.54,"to":888.53,"location":2,"content":"some big problems. So what were the big problems of doing that?"},{"from":888.53,"to":890.78,"location":2,"content":"The problems when we,"},{"from":890.78,"to":893.48,"location":2,"content":"of having a word vector in each word, yes."},{"from":893.48,"to":896.83,"location":2,"content":"A lot of words have like one spelling, but a whole bunch of meanings."},{"from":896.83,"to":900.55,"location":2,"content":"Right, so, a word can have- So, typically,"},{"from":900.55,"to":905.62,"location":2,"content":"you have one string of letters which has a whole bunch of meanings."},{"from":905.62,"to":909.22,"location":2,"content":"So, words have a ton of senses."},{"from":909.22,"to":911.35,"location":2,"content":"Um, and yeah, so that's"},{"from":911.35,"to":913.4,"location":2,"content":"the biggest and most obvious problem that we're"},{"from":913.4,"to":915.55,"location":2,"content":"collapsing together all the meanings of words."},{"from":915.55,"to":918.13,"location":2,"content":"So, we talked about a bit where"},{"from":918.13,"to":920.29,"location":2,"content":"one solution to that was you could distinguish"},{"from":920.29,"to":923.68,"location":2,"content":"word senses and to have different word vectors for them."},{"from":923.68,"to":927.7,"location":2,"content":"Um, and I then said something about also you could think of"},{"from":927.7,"to":931.75,"location":2,"content":"this word vector as a sort of a mixture of them and maybe your model could separate it."},{"from":931.75,"to":935.07,"location":2,"content":"But it seems like we might want to take that more seriously."},{"from":935.07,"to":937.42,"location":2,"content":"And one way, um,"},{"from":937.42,"to":943.35,"location":2,"content":"that we could take that more seriously is we could start to say, well,"},{"from":943.35,"to":950.07,"location":2,"content":"really, you know, traditional lists of word senses are themselves a crude approximation."},{"from":950.07,"to":957.92,"location":2,"content":"What we actually want to know is the sense of the word inside a particular context of use."},{"from":957.92,"to":960.4,"location":2,"content":"And sort of what I mean by that is, you know,"},{"from":960.4,"to":964.57,"location":2,"content":"we distinguish different senses of a word, right?"},{"from":964.57,"to":968.38,"location":2,"content":"Say for the word star there's the astronomical sense and"},{"from":968.38,"to":972.41,"location":2,"content":"there's the Hollywood sense and they're clearly different."},{"from":972.41,"to":976.27,"location":2,"content":"But you know, if we then go to this what I'm calling the Hollywood sense,"},{"from":976.27,"to":978.37,"location":2,"content":"I could then say, well, wait a minute."},{"from":978.37,"to":981.52,"location":2,"content":"There are movie stars and there are rock stars,"},{"from":981.52,"to":984.07,"location":2,"content":"and there, uh, are R&B stars,"},{"from":984.07,"to":985.83,"location":2,"content":"and there are 
country stars."},{"from":985.83,"to":989.27,"location":2,"content":"Now, all of those different senses, um,"},{"from":989.27,"to":993.02,"location":2,"content":"in certain contexts, though, one or other of them would be evoked."},{"from":993.02,"to":994.21,"location":2,"content":"And so, you know,"},{"from":994.21,"to":996.88,"location":2,"content":"it's very hard if you're trying to actually enumerate"},{"from":996.88,"to":1000.83,"location":2,"content":"senses of a word as to which ones count as different or the same."},{"from":1000.83,"to":1004.78,"location":2,"content":"So, it's really you sort of wanna know what a word means in a context."},{"from":1004.78,"to":1010.51,"location":2,"content":"There's a second limitation of these word vectors which is,"},{"from":1010.51,"to":1013.71,"location":2,"content":"we haven't really talked about and is less obvious,"},{"from":1013.71,"to":1016.77,"location":2,"content":"but it's also something that we might want to fix, and at least one of"},{"from":1016.77,"to":1020.07,"location":2,"content":"the models we discussed today takes some aim at that,"},{"from":1020.07,"to":1024.05,"location":2,"content":"and that is, we just sort of have one vector for a word."},{"from":1024.05,"to":1027.46,"location":2,"content":"But there are sort of different dimensions of a word."},{"from":1027.46,"to":1030.39,"location":2,"content":"So, words can have different meanings,"},{"from":1030.39,"to":1034.61,"location":2,"content":"some sort of real semantics or words can have"},{"from":1034.61,"to":1039.77,"location":2,"content":"different syntactic behavior like different parts of speech or grammatical behavior."},{"from":1039.77,"to":1043.07,"location":2,"content":"So, in some sense, arrive and arrival,"},{"from":1043.07,"to":1045.67,"location":2,"content":"their semantics are almost the same,"},{"from":1045.67,"to":1048.99,"location":2,"content":"but they're different parts of speech."},{"from":1048.99,"to":1052.41,"location":2,"content":"One is a, um, a verb and one is a noun,"},{"from":1052.41,"to":1055.38,"location":2,"content":"so they can kind of appear in quite different places."},{"from":1055.38,"to":1059.13,"location":2,"content":"And you know, you'd wanna do different things with them in a dependency parser."},{"from":1059.13,"to":1061.29,"location":2,"content":"And there are even other dimensions."},{"from":1061.29,"to":1067.2,"location":2,"content":"So, words also have register and connotation differences."},{"from":1067.2,"to":1072.27,"location":2,"content":"So, you can probably think of lots of different words for a bathroom,"},{"from":1072.27,"to":1076.17,"location":2,"content":"and a lot of those words all means semantically the same,"},{"from":1076.17,"to":1078.33,"location":2,"content":"but have rather different registers and"},{"from":1078.33,"to":1081.22,"location":2,"content":"connotations as to when they're appropriate to use."},{"from":1081.22,"to":1084.9,"location":2,"content":"And so, we might want to distinguish words on that basis as well."},{"from":1084.9,"to":1088.35,"location":2,"content":"And so these are the kinds of soluti- things we want to"},{"from":1088.35,"to":1091.85,"location":2,"content":"solve with our new contextual word embeddings."},{"from":1091.85,"to":1096.24,"location":2,"content":"Um, so I've said up until now, you know,"},{"from":1096.24,"to":1101.67,"location":2,"content":"oh, we just had these word vectors that we use,"},{"from":1101.67,"to":1104.2,"location":2,"content":"words just had one 
vector."},{"from":1104.2,"to":1109.17,"location":2,"content":"Um, but if you actually think about it, maybe that's wrong."},{"from":1109.17,"to":1114.27,"location":2,"content":"I mean, maybe we never had a problem, or at any rate, we solved it six classes ago."},{"from":1114.27,"to":1116.2,"location":2,"content":"Because if you remember back, [NOISE] um,"},{"from":1116.2,"to":1119.46,"location":2,"content":"to when we started talking about neural language models,"},{"from":1119.46,"to":1121.95,"location":2,"content":"well, what did a neural language model do?"},{"from":1121.95,"to":1125.1,"location":2,"content":"At the bottom, you fed into it the word vectors."},{"from":1125.1,"to":1129.6,"location":2,"content":"But then you ran across that one or more recurrent layers,"},{"from":1129.6,"to":1131.57,"location":2,"content":"something like a LSTM layer,"},{"from":1131.57,"to":1137.43,"location":2,"content":"and it was calculating these representations that sit above each word and,"},{"from":1137.43,"to":1140.76,"location":2,"content":"you know, the role of those hidden states is a bit ambivalent."},{"from":1140.76,"to":1142.26,"location":2,"content":"They are used for prediction."},{"from":1142.26,"to":1146.67,"location":2,"content":"And they are used for next hidden state and output states and so on."},{"from":1146.67,"to":1149.2,"location":2,"content":"But in many ways you can think huh,"},{"from":1149.2,"to":1156.05,"location":2,"content":"these representations are actually representations of a word in context."},{"from":1156.05,"to":1158.89,"location":2,"content":"And if you think about what happened with, uh,"},{"from":1158.89,"to":1161.31,"location":2,"content":"the question answering systems,"},{"from":1161.31,"to":1163.62,"location":2,"content":"that's exactly how they were used, right?"},{"from":1163.62,"to":1166.2,"location":2,"content":"We ran LSTM's backwards and forwards,"},{"from":1166.2,"to":1169.32,"location":2,"content":"over a question in the passage, and then we say,"},{"from":1169.32,"to":1173.09,"location":2,"content":"okay those are a good representation of a word's meaning and context."},{"from":1173.09,"to":1176.2,"location":2,"content":"Let's start matching them with attention functions et cetera."},{"from":1176.2,"to":1181.47,"location":2,"content":"So, it sort of seemed like we'd already invented a way to have,"},{"from":1181.47,"to":1187.18,"location":2,"content":"um, context-specific representations of words."},{"from":1187.18,"to":1190.04,"location":2,"content":"And effectively, you know,"},{"from":1190.04,"to":1195.45,"location":2,"content":"the rest of the content of this lecture is sort of basically no more complex than that."},{"from":1195.45,"to":1202.24,"location":2,"content":"Um, that it took a while but sort of people woke up and started to notice, huh,"},{"from":1202.24,"to":1204.6,"location":2,"content":"really when you're running any language model,"},{"from":1204.6,"to":1208.37,"location":2,"content":"you generate a context-specific representation of words."},{"from":1208.37,"to":1211.36,"location":2,"content":"Maybe, if we just took those context-specific"},{"from":1211.36,"to":1216.81,"location":2,"content":"representation of words, they'd be useful for doing other things with them."},{"from":1216.81,"to":1218.6,"location":2,"content":"And that's sort of, you know,"},{"from":1218.6,"to":1219.78,"location":2,"content":"there are a few more details,"},{"from":1219.78,"to":1223.98,"location":2,"content":"but that's really the summary of the entire of 
this lecture."},{"from":1223.98,"to":1234.16,"location":2,"content":"Um, so one of the first things to do that was a paper that Matt Peters wrote in 2017,"},{"from":1234.16,"to":1236.27,"location":2,"content":"um, the year before last."},{"from":1236.27,"to":1241.08,"location":2,"content":"Um, and this was sort of a predecessor to the sort of modern, um,"},{"from":1241.08,"to":1246.72,"location":2,"content":"versions of, um, these context-sensitive word embeddings."},{"from":1246.72,"to":1249.84,"location":2,"content":"So, um, together with co-authors,"},{"from":1249.84,"to":1253.18,"location":2,"content":"he came up with a paper called TagLM,"},{"from":1253.18,"to":1256.92,"location":2,"content":"but it essentially already had all the main ideas."},{"from":1256.92,"to":1261.26,"location":2,"content":"So, what, um, was wanted was okay."},{"from":1261.26,"to":1265.62,"location":2,"content":"We want to do better at tasks such as named-entity recognition."},{"from":1265.62,"to":1270.94,"location":2,"content":"And what we'd like to do is know about the meaning of a word in context."},{"from":1270.94,"to":1274.5,"location":2,"content":"Um, but you know, standardly if we're doing named-entity recognition,"},{"from":1274.5,"to":1278.17,"location":2,"content":"we just train it on half a million words of supervised data."},{"from":1278.17,"to":1280.23,"location":2,"content":"And that's not much of a source of"},{"from":1280.23,"to":1283.95,"location":2,"content":"information to be learning about the meaning of words and context."},{"from":1283.95,"to":1288.81,"location":2,"content":"So, why don't we adopt the semi-supervised approach and so that's what we do."},{"from":1288.81,"to":1292.74,"location":2,"content":"So, we start off with a ton of unlabeled data."},{"from":1292.74,"to":1295.55,"location":2,"content":"Um, and from that unlabeled data,"},{"from":1295.55,"to":1299.85,"location":2,"content":"we can train a conventional word embedding model like Word2vec."},{"from":1299.85,"to":1303.81,"location":2,"content":"But we can also at the same time train a neural language model."},{"from":1303.81,"to":1307.59,"location":2,"content":"So, something like a bi-LSTM language model."},{"from":1307.59,"to":1315.74,"location":2,"content":"Okay. So, then for step two when we're using our supervised data,"},{"from":1315.74,"to":1318.9,"location":2,"content":"um, actually, I guess that's step three."},{"from":1318.9,"to":1325.96,"location":2,"content":"Okay. 
Um, so for then when we want to learn our supervised part-of-speech tagger at the top,"},{"from":1325.96,"to":1329.19,"location":2,"content":"what we're gonna do is say, well,"},{"from":1329.19,"to":1333.42,"location":2,"content":"for the input words New York is located,"},{"from":1333.42,"to":1338.34,"location":2,"content":"we can not only use the word embedding which is context independent,"},{"from":1338.34,"to":1344.51,"location":2,"content":"but we can use our trained recurrent language model and also run it over this input,"},{"from":1344.51,"to":1351.18,"location":2,"content":"and then we'll generate hidden states in our bi-LSTM language model and we can also"},{"from":1351.18,"to":1358.38,"location":2,"content":"feed those in as features into ou- our sequence tagging model,"},{"from":1358.38,"to":1361.34,"location":2,"content":"and those features will let it work better."},{"from":1361.34,"to":1367.1,"location":2,"content":"Here's a second picture that runs this through in much greater detail."},{"from":1367.1,"to":1372.88,"location":2,"content":"So, so, we're assuming that we have trained, uh,"},{"from":1372.88,"to":1376.88,"location":2,"content":"a bi-LSTM language model, um,"},{"from":1376.88,"to":1379.76,"location":2,"content":"on a lot of unsupervised data."},{"from":1379.76,"to":1386.37,"location":2,"content":"Then what we wanna do is we want to do named entity recognition for New York is located."},{"from":1386.37,"to":1389.16,"location":2,"content":"So, the first thing we do is say,"},{"from":1389.16,"to":1396.15,"location":2,"content":"let's just run New York is located through our separately trained neural language model."},{"from":1396.15,"to":1398.92,"location":2,"content":"So, we run it through a forward language model."},{"from":1398.92,"to":1401.49,"location":2,"content":"We run it through a backward language model."},{"from":1401.49,"to":1403.83,"location":2,"content":"We get from that, um,"},{"from":1403.83,"to":1406.52,"location":2,"content":"a hidden state representation,"},{"from":1406.52,"to":1408.75,"location":2,"content":"um, for each word,"},{"from":1408.75,"to":1411.64,"location":2,"content":"we concatenate the forward and backward ones,"},{"from":1411.64,"to":1415.53,"location":2,"content":"and that's going to give a set, a concatenated language model embedding"},{"from":1415.53,"to":1420.09,"location":2,"content":"which we'll use as features in our named entity recognizer."},{"from":1420.09,"to":1423.87,"location":2,"content":"So, then for the named entity recognizer itself that we're gonna"},{"from":1423.87,"to":1428.7,"location":2,"content":"train supervised, well, we have the same sentence,"},{"from":1428.7,"to":1435.39,"location":2,"content":"so we can both look up a Word2vec-style token embedding for it."},{"from":1435.39,"to":1441.32,"location":2,"content":"We can use what we learned about with character level CNNs and RNNs and we can build"},{"from":1441.32,"to":1444.45,"location":2,"content":"a character level representation for it which we also"},{"from":1444.45,"to":1447.8,"location":2,"content":"concatenate to have two representations."},{"from":1447.8,"to":1455.68,"location":2,"content":"So, we feed these representations into a bi-LSTM layer."},{"from":1455.68,"to":1459.94,"location":2,"content":"But then when we get the output of the, this bi-LSTM layer,"},{"from":1459.94,"to":1462.18,"location":2,"content":"as well as this normal output,"},{"from":1462.18,"to":1468.29,"location":2,"content":"we can concatenate with each output what was- 
what we get from our,"},{"from":1468.29,"to":1470.73,"location":2,"content":"um, neural language model."},{"from":1470.73,"to":1473.37,"location":2,"content":"So, each of these things becomes a pair of states."},{"from":1473.37,"to":1476.49,"location":2,"content":"One that's spit out from the first bi-LSTM layer and"},{"from":1476.49,"to":1479.76,"location":2,"content":"then it's concatenated with something from the neural language model."},{"from":1479.76,"to":1486.45,"location":2,"content":"And so that concatenated representation is then fed into a second layer of bi-LSTM."},{"from":1486.45,"to":1488.27,"location":2,"content":"And then from the output of that,"},{"from":1488.27,"to":1491.31,"location":2,"content":"we do the usual kind of softmax classification"},{"from":1491.31,"to":1494.79,"location":2,"content":"where we're then giving tags like beginning of location,"},{"from":1494.79,"to":1499.38,"location":2,"content":"end of location, say New York is a location and then for is, we'll get"},{"from":1499.38,"to":1507.86,"location":2,"content":"another tag to say it's not a location. Does that make sense?"},{"from":1507.86,"to":1514.31,"location":2,"content":"Yeah so, um, so the central thing is"},{"from":1514.31,"to":1520.45,"location":2,"content":"sort of having seen that these sort of representations that we get from Bi-LSTMs are useful."},{"from":1520.45,"to":1524.58,"location":2,"content":"We're just going to feed them into supervised models as we train them,"},{"from":1524.58,"to":1528.6,"location":2,"content":"and the idea is that will give us better features of words."},{"from":1528.6,"to":1532.31,"location":2,"content":"Some kind of representation of their meaning in context,"},{"from":1532.31,"to":1539.61,"location":2,"content":"which will allow us to learn better named entity recognizers or what it- whatever it is."},{"from":1539.61,"to":1542.58,"location":2,"content":"Maybe I should have put this slide earlier,"},{"from":1542.58,"to":1545.95,"location":2,"content":"but this slide was meant to remind you what a named entity recognizer is."},{"from":1545.95,"to":1547.31,"location":2,"content":"I hope you remember that,"},{"from":1547.31,"to":1550.61,"location":2,"content":"something where we are going to find and label"},{"from":1550.61,"to":1554.85,"location":2,"content":"entities for things like person, location, date, organization."},{"from":1554.85,"to":1557.63,"location":2,"content":"So anyway, doing this worked."},{"from":1557.63,"to":1559.9,"location":2,"content":"So, here's a little bit of a history."},{"from":1559.9,"to":1567.29,"location":2,"content":"So the most famous Named Entity Recognition dataset is this CoNLL 2003 dataset,"},{"from":1567.29,"to":1570.18,"location":2,"content":"which actually exists in multiple languages."},{"from":1570.18,"to":1574.77,"location":2,"content":"But whenever people say CoNLL 2003 and don't mention a language,"},{"from":1574.77,"to":1577.66,"location":2,"content":"they mean the English version of it."},{"from":1577.66,"to":1580.41,"location":2,"content":"That's the way the world works."},{"from":1580.41,"to":1584.43,"location":2,"content":"Um, okay so on this dataset- yeah."},{"from":1584.43,"to":1589.04,"location":2,"content":"So, it's sort of been around for whatever, 15 years roughly now."},{"from":1589.04,"to":1592.92,"location":2,"content":"So, in the- so it was originally a competition, right?"},{"from":1592.92,"to":1596.23,"location":2,"content":"So, this is- in 2003 was the original 
bake-off."},{"from":1596.23,"to":1598.45,"location":2,"content":"My group actually took place in that."},{"from":1598.45,"to":1602.06,"location":2,"content":"Took part in it. I think we got third or fourth place or something,"},{"from":1602.06,"to":1606.72,"location":2,"content":"and our F1 score was 86."},{"from":1606.72,"to":1612.81,"location":2,"content":"The people who won were from IBM Research Labs,"},{"from":1612.81,"to":1615.87,"location":2,"content":"and they got 88 almost 89."},{"from":1615.87,"to":1620.49,"location":2,"content":"But a difference between these two things is our system was"},{"from":1620.49,"to":1625.29,"location":2,"content":"a single clean machine-learning model categorical,"},{"from":1625.29,"to":1628.8,"location":2,"content":"whereas the IBM one was not only an ensemble"},{"from":1628.8,"to":1633.6,"location":2,"content":"of four different machine learning models, plus gazetteers."},{"from":1633.6,"to":1636.09,"location":2,"content":"It also fit in the output of"},{"from":1636.09,"to":1642.45,"location":2,"content":"two other old NER systems that IBM people were trained years ago on different data."},{"from":1642.45,"to":1645.03,"location":2,"content":"So it was- I guess it worked for them but,"},{"from":1645.03,"to":1647.1,"location":2,"content":"it was a fairly complex system."},{"from":1647.1,"to":1649.17,"location":2,"content":"Here's another system from Stanford."},{"from":1649.17,"to":1653.91,"location":2,"content":"So this was our classic Stanford NER system that is widely used."},{"from":1653.91,"to":1659.47,"location":2,"content":"So, this was then using a conditional random field model which generally dominated"},{"from":1659.47,"to":1666.93,"location":2,"content":"sort of the second half of the 2000s and the first half of the 2010s for doing NER,"},{"from":1666.93,"to":1672.8,"location":2,"content":"and it was sort of, you know, a bit but not usually better than the 2003 system."},{"from":1672.8,"to":1679.91,"location":2,"content":"This system here was sort of the best ever built categorical CRF system."},{"from":1679.91,"to":1686.11,"location":2,"content":"But rather than only using the training data to build the model as this system did,"},{"from":1686.11,"to":1691.07,"location":2,"content":"it threw in Wikipedia and other stuff to make it work better,"},{"from":1691.07,"to":1694.72,"location":2,"content":"and that got you to about 90.8 F1."},{"from":1694.72,"to":1703.77,"location":2,"content":"So, essentially, once sort of BiLSTM style models started to be known and used in NLP."},{"from":1703.77,"to":1708.06,"location":2,"content":"That was when people were able to train, build training"},{"from":1708.06,"to":1713.17,"location":2,"content":"just on the training data systems that worked a lot better."},{"from":1713.17,"to":1718.44,"location":2,"content":"Because essentially you're going from the same data from this system to that system."},{"from":1718.44,"to":1721.53,"location":2,"content":"So, you're getting about 4 percent gain on it,"},{"from":1721.53,"to":1725.84,"location":2,"content":"because it's not- wasn't making use of Wikipedia and things like that;"},{"from":1725.84,"to":1731.81,"location":2,"content":"and so this Ma and Hovy system is pretty well-known getting about 91.21."},{"from":1731.81,"to":1736.14,"location":2,"content":"Okay, but if we then go to this TagLM system, um,"},{"from":1736.14,"to":1740.61,"location":2,"content":"that Matt Peters and Co have a system that"},{"from":1740.61,"to":1745.59,"location":2,"content":"was 
sort of similar to the Ma and Hovy system but a little bit worse."},{"from":1745.59,"to":1752.67,"location":2,"content":"But the point is that this BiLSTM uses sorry- using the neural language model,"},{"from":1752.67,"to":1757.08,"location":2,"content":"is just a useful oomph giver which sort of takes the results up."},{"from":1757.08,"to":1758.61,"location":2,"content":"Yeah, not night and day but,"},{"from":1758.61,"to":1764.16,"location":2,"content":"slightly over a percent, and then gives them the best NER system that was then available."},{"from":1764.16,"to":1765.99,"location":2,"content":"So that sort of proved these sort of"},{"from":1765.99,"to":1773.66,"location":2,"content":"contextual word representations really had some power and started to be useful,"},{"from":1773.66,"to":1778.62,"location":2,"content":"and then there's a white space at the top because we'll get back to more of this later."},{"from":1778.62,"to":1783.24,"location":2,"content":"Um, there's some details on their language model."},{"from":1783.24,"to":1786.33,"location":2,"content":"Some of their details are that it's useful to have"},{"from":1786.33,"to":1789.29,"location":2,"content":"a bidirectional language model, not unidirectional."},{"from":1789.29,"to":1791.64,"location":2,"content":"It's useful to have a big, um,"},{"from":1791.64,"to":1795.51,"location":2,"content":"language model to get much in the way of gains,"},{"from":1795.51,"to":1801.96,"location":2,"content":"um, and you need to train this language model over much more data."},{"from":1801.96,"to":1808.16,"location":2,"content":"It doesn't work if you're just sort of training it over your supervised training data."},{"from":1808.16,"to":1811.14,"location":2,"content":"Another model that was around was CoVe,"},{"from":1811.14,"to":1812.61,"location":2,"content":"but I think I'll skip that."},{"from":1812.61,"to":1815.89,"location":2,"content":"Okay. 
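Pulling the tagger wiring described a moment ago into one place, here is a hedged reconstruction (mine, not the paper's code): the token embedding and the character-level representation are concatenated, run through a first biLSTM, joined with the frozen language model's embedding at each position, run through a second biLSTM, and scored per token.

```python
# Illustrative reconstruction of the TagLM tagger wiring (names/sizes assumed).
import torch
import torch.nn as nn

class TagLMTagger(nn.Module):
    def __init__(self, word_dim, char_dim, lm_dim, hidden_dim, num_tags):
        super().__init__()
        self.bilstm1 = nn.LSTM(word_dim + char_dim, hidden_dim,
                               batch_first=True, bidirectional=True)
        self.bilstm2 = nn.LSTM(2 * hidden_dim + lm_dim, hidden_dim,
                               batch_first=True, bidirectional=True)
        self.scorer = nn.Linear(2 * hidden_dim, num_tags)

    def forward(self, word_emb, char_emb, lm_emb):
        x = torch.cat([word_emb, char_emb], dim=-1)  # the two input representations
        h1, _ = self.bilstm1(x)
        h1 = torch.cat([h1, lm_emb], dim=-1)         # inject the frozen LM features
        h2, _ = self.bilstm2(h1)
        return self.scorer(h2)                       # per-token logits over BIO tags
```

Here lm_emb stands for the concatenated forward and backward language model states for each position, computed under torch.no_grad() so nothing updates the pretrained language model.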
So, then the next year, um,"},{"from":1815.89,"to":1818.87,"location":2,"content":"Matt Peters and a different set of colleagues"},{"from":1818.87,"to":1823.41,"location":2,"content":"then came up with an improved system called ELMo,"},{"from":1823.41,"to":1827.61,"location":2,"content":"and effectively this was the breakthrough system."},{"from":1827.61,"to":1830.96,"location":2,"content":"That this was sort of just the system that everybody"},{"from":1830.96,"to":1835.88,"location":2,"content":"noticed and said \"Wow these contextual word vectors are great."},{"from":1835.88,"to":1837.68,"location":2,"content":"Everyone should be using them,"},{"from":1837.68,"to":1841.79,"location":2,"content":"not traditional word vectors.\" Yes?"},{"from":1841.79,"to":1859.33,"location":2,"content":"I have a simple question, imagine re-training a system, what exactly"},{"from":1859.33,"to":1862.91,"location":2,"content":"what measure [inaudible]"},{"from":1862.91,"to":1866.25,"location":2,"content":"It's pre-trained because this piece over here;"},{"from":1866.25,"to":1871.04,"location":2,"content":"a big neural language model is trained first,"},{"from":1871.04,"to":1873.27,"location":2,"content":"and there's an important thing I forgot to say."},{"from":1873.27,"to":1875.28,"location":2,"content":"So, thank you for the question."},{"from":1875.28,"to":1880.02,"location":2,"content":"The main reason why it's- in some sense pre-trained,"},{"from":1880.02,"to":1881.67,"location":2,"content":"is this was trained first."},{"from":1881.67,"to":1886.24,"location":2,"content":"But the main reason why people think of this as pre-training"},{"from":1886.24,"to":1890.98,"location":2,"content":"is after you've trained this, it is frozen."},{"from":1890.98,"to":1895.68,"location":2,"content":"So, this is just something that you can run with parameters which will give"},{"from":1895.68,"to":1900.84,"location":2,"content":"you a vector which is your contextual word representation each position,"},{"from":1900.84,"to":1903.96,"location":2,"content":"and then that's just going to be used in this system."},{"from":1903.96,"to":1906.42,"location":2,"content":"So, when you're training this system,"},{"from":1906.42,"to":1908.58,"location":2,"content":"there's no gradient flowing back into"},{"from":1908.58,"to":1912.88,"location":2,"content":"this neural language model that's changing and updating it; it's just fixed."},{"from":1912.88,"to":1916.26,"location":2,"content":"And so that's sort of the sense when people are talking about pre-training."},{"from":1916.26,"to":1919.18,"location":2,"content":"It's sort of normally a model that you trained"},{"from":1919.18,"to":1922.68,"location":2,"content":"somewhere else and that you're using to give features,"},{"from":1922.68,"to":1926.28,"location":2,"content":"but isn't part of the model that you are now training. Yeah?"},{"from":1926.28,"to":1932.06,"location":2,"content":"[inaudible]"},{"from":1932.06,"to":1936.65,"location":2,"content":"Well, I guess that's, I wouldn't quite call it reconstruction."},{"from":1936.65,"to":1940.19,"location":2,"content":"Yeah, it's unsupervised in the sense that this is a language model,"},{"from":1940.19,"to":1942.47,"location":2,"content":"you're training it to predict the next word."},{"from":1942.47,"to":1948.34,"location":2,"content":"So here are words one to k. 
What is the k plus oneth word, using a cross-entropy loss,"},{"from":1948.34,"to":1950.15,"location":2,"content":"and repeat over for each position."},{"from":1950.15,"to":1957.53,"location":2,"content":"[NOISE] Yes, so I mean,"},{"from":1957.53,"to":1965.24,"location":2,"content":"having gone through TagLM in some detail, I mean,"},{"from":1965.24,"to":1972.35,"location":2,"content":"in some sense, the difference between TagLM and ELMo is kind of small,"},{"from":1972.35,"to":1974.09,"location":2,"content":"it's sort of in the details."},{"from":1974.09,"to":1976.38,"location":2,"content":"So I mean, to a first approximation,"},{"from":1976.38,"to":1978.89,"location":2,"content":"they're doing exactly the same again,"},{"from":1978.89,"to":1980.67,"location":2,"content":"but a little bit better."},{"from":1980.67,"to":1986.36,"location":2,"content":"Um, so, um, I sort of hope it made sense the last time,"},{"from":1986.36,"to":1989.02,"location":2,"content":"I mean, what are the things that are different?"},{"from":1989.02,"to":1993.71,"location":2,"content":"Um, they do the bidirectional language model a bit differently,"},{"from":1993.71,"to":1996.8,"location":2,"content":"and actually one of their concerns was to try and come up with"},{"from":1996.8,"to":2001.43,"location":2,"content":"a compact language model that would be easy for people to use,"},{"from":2001.43,"to":2007.39,"location":2,"content":"um, in other tasks even if they don't have the beefiest computer hardware in the world."},{"from":2007.39,"to":2009.94,"location":2,"content":"And so they decided to dispense with having"},{"from":2009.94,"to":2014.18,"location":2,"content":"word representations altogether and just use, um,"},{"from":2014.18,"to":2018.61,"location":2,"content":"character CNNs to build word representations,"},{"from":2018.61,"to":2022.05,"location":2,"content":"because that lessens the number of parameters you have to store,"},{"from":2022.05,"to":2025.51,"location":2,"content":"the big matrices you have to, um, use."},{"from":2025.51,"to":2030.28,"location":2,"content":"Um, they expanded the hidden dimension to 4,096,"},{"from":2030.28,"to":2032.02,"location":2,"content":"but then they project it down to"},{"from":2032.02,"to":2037.45,"location":2,"content":"512 dimensions with a sort of feed-forward projection layer,"},{"from":2037.45,"to":2040.3,"location":2,"content":"and that's a fairly common technique to again reduce"},{"from":2040.3,"to":2043.36,"location":2,"content":"the parameterization of the model so that you have a lot of"},{"from":2043.36,"to":2046.06,"location":2,"content":"parameters going in the recurrent direction but you"},{"from":2046.06,"to":2049.32,"location":2,"content":"need much smaller matrices for including,"},{"from":2049.32,"to":2051.4,"location":2,"content":"um, the input at the next level."},{"from":2051.4,"to":2053.53,"location":2,"content":"Um, between the layers,"},{"from":2053.53,"to":2058.3,"location":2,"content":"they now use a residual connection and they do a bit of parameter tying."},{"from":2058.3,"to":2061.61,"location":2,"content":"So it's sort of all in the little details there."},{"from":2061.61,"to":2065.2,"location":2,"content":"Um, but there's another interesting thing"},{"from":2065.2,"to":2068.89,"location":2,"content":"that they did which was an important innovation of ELMo,"},{"from":2068.89,"to":2070.41,"location":2,"content":"so we should get this bit."},{"from":2070.41,"to":2072.4,"location":2,"content":"So in 
TagLM,"},{"from":2072.4,"to":2076.93,"location":2,"content":"what was fed from the pre-trained LM into"},{"from":2076.93,"to":2083.7,"location":2,"content":"the main model was just the top level of the neural language model stack,"},{"from":2083.7,"to":2087.04,"location":2,"content":"and that was completely standard de rigueur in those days,"},{"from":2087.04,"to":2089.8,"location":2,"content":"that you might have had three layers of"},{"from":2089.8,"to":2093.79,"location":2,"content":"neural language model that you regard at the top-level as your sort"},{"from":2093.79,"to":2097.12,"location":2,"content":"of one that's really captured the meaning of"},{"from":2097.12,"to":2101.18,"location":2,"content":"the sentence and the lower layers for processing that led up to it."},{"from":2101.18,"to":2105.3,"location":2,"content":"Um, and they had the idea that maybe"},{"from":2105.3,"to":2109.78,"location":2,"content":"it would be useful to actually use all layers of the,"},{"from":2109.78,"to":2112.96,"location":2,"content":"biLSTM of the neural language models."},{"from":2112.96,"to":2116.93,"location":2,"content":"So maybe not just the top layer but all layers would be kind of useful."},{"from":2116.93,"to":2120.76,"location":2,"content":"So, um, there are these kind of complex equations,"},{"from":2120.76,"to":2124.48,"location":2,"content":"uh, but essentially the point of it over here is,"},{"from":2124.48,"to":2127.36,"location":2,"content":"we going- for a particular position,"},{"from":2127.36,"to":2129.51,"location":2,"content":"word seven in the language model,"},{"from":2129.51,"to":2133.93,"location":2,"content":"we're going to take the hidden state at each level of our,"},{"from":2133.93,"to":2136.6,"location":2,"content":"our neural language model stack,"},{"from":2136.6,"to":2140.55,"location":2,"content":"we're going to give- learn a weight for that level,"},{"from":2140.55,"to":2142.54,"location":2,"content":"we go in to sort of sum them,"},{"from":2142.54,"to":2147.19,"location":2,"content":"so this is sort of a weighted average of the hidden layers at each position,"},{"from":2147.19,"to":2151.22,"location":2,"content":"and that will be used as our basic representation."},{"from":2151.22,"to":2155.78,"location":2,"content":"Um, and so, they found that that gave quite a bit"},{"from":2155.78,"to":2160.48,"location":2,"content":"of extra usefulness for- and different tasks could prefer different layers."},{"from":2160.48,"to":2163.05,"location":2,"content":"There's one other bit here which is,"},{"from":2163.05,"to":2168.63,"location":2,"content":"they learn a global scaling factor Gamma for a particular task."},{"from":2168.63,"to":2173.66,"location":2,"content":"And this allows them to control that for some tasks, the, um,"},{"from":2173.66,"to":2176.08,"location":2,"content":"contextual word embeddings might be really"},{"from":2176.08,"to":2179.51,"location":2,"content":"useful and for other tasks they might not be so useful,"},{"from":2179.51,"to":2181.45,"location":2,"content":"so you're just sort of learning a specific,"},{"from":2181.45,"to":2185.09,"location":2,"content":"um, usefulness for the entire task."},{"from":2185.09,"to":2190.28,"location":2,"content":"Okay. 
So, um, that's the sort of new version of language model."},{"from":2190.28,"to":2193.39,"location":2,"content":"But this, this is allowing this idea of well,"},{"from":2193.39,"to":2196.75,"location":2,"content":"maybe there's sort of more syntactic meanings"},{"from":2196.75,"to":2199.86,"location":2,"content":"of a word and more semantic meanings of a word,"},{"from":2199.86,"to":2203.38,"location":2,"content":"possibly those could be represented at different layers of"},{"from":2203.38,"to":2205.51,"location":2,"content":"your neural language model and then for"},{"from":2205.51,"to":2208.33,"location":2,"content":"different tasks you can differentially weight them."},{"from":2208.33,"to":2211.33,"location":2,"content":"Um, so that's the basic model."},{"from":2211.33,"to":2216.85,"location":2,"content":"So you run your biLSTM before to get representations of each word."},{"from":2216.85,"to":2219.61,"location":2,"content":"And then the generic ELMo recipe was,"},{"from":2219.61,"to":2223.22,"location":2,"content":"well, with that frozen language model,"},{"from":2223.22,"to":2228.54,"location":2,"content":"you want to feed it into some supervised model depending on what the task was,"},{"from":2228.54,"to":2230.07,"location":2,"content":"and they sort of say in the paper, well,"},{"from":2230.07,"to":2232.5,"location":2,"content":"how you do this maybe depends on the task."},{"from":2232.5,"to":2235.97,"location":2,"content":"You might want to kind of concatenate it to the intermediate layer,"},{"from":2235.97,"to":2237.66,"location":2,"content":"just as TagLM did,"},{"from":2237.66,"to":2239.09,"location":2,"content":"that might be fine."},{"from":2239.09,"to":2242.22,"location":2,"content":"But you know it might also be useful to make use of"},{"from":2242.22,"to":2245.7,"location":2,"content":"these ELMo representations when producing outputs,"},{"from":2245.7,"to":2248.91,"location":2,"content":"so if you're doing something like a"},{"from":2248.91,"to":2255.21,"location":2,"content":"generation system, or you might just sort of feed in the ELMo representation again,"},{"from":2255.21,"to":2258.63,"location":2,"content":"be- before you sort of do the softmax to find the output,"},{"from":2258.63,"to":2261.58,"location":2,"content":"they sort of left it flexible as to how it was used,"},{"from":2261.58,"to":2262.96,"location":2,"content":"but the general picture,"},{"from":2262.96,"to":2265.96,"location":2,"content":"you know, was kinda like we saw before."},{"from":2265.96,"to":2269.59,"location":2,"content":"Indeed I'm reusing the same picture: that you've calculated"},{"from":2269.59,"to":2274.11,"location":2,"content":"an ELMo representation for each position as a weighted average,"},{"from":2274.11,"to":2277.36,"location":2,"content":"and then you're sort of concatenating that to the hidden state of"},{"from":2277.36,"to":2281.13,"location":2,"content":"your supervised system and generating your output."},{"from":2281.13,"to":2284.89,"location":2,"content":"And anyway, um, one way or another,"},{"from":2284.89,"to":2287.92,"location":2,"content":"um, they were able to do this, uh,"},{"from":2287.92,"to":2291.93,"location":2,"content":"and that with the little improvements gave them about an extra"},{"from":2291.93,"to":2296.77,"location":2,"content":"0.3 percent in Named Entity Recognition."},{"from":2296.77,"to":2301.16,"location":2,"content":"Um, now, that sort of sounds like not very much."},{"from":2301.16,"to":2306.05,"location":2,"content":"And you might conclude 
from this why the excitement [LAUGHTER] and,"},{"from":2306.05,"to":2308.7,"location":2,"content":"you know, in some sense, um,"},{"from":2308.7,"to":2313.72,"location":2,"content":"that's right, because sort of to the extent that there was an interesting idea here, really"},{"from":2313.72,"to":2319.06,"location":2,"content":"that was come up with in the TagLM paper, which gave a much better gain."},{"from":2319.06,"to":2325.25,"location":2,"content":"But, you know, why everyone got really excited was that in the ELMo paper,"},{"from":2325.25,"to":2328.03,"location":2,"content":"they then showed this isn't something that you can"},{"from":2328.03,"to":2330.91,"location":2,"content":"do one-off to improve a Named Entity Recognizer,"},{"from":2330.91,"to":2338.03,"location":2,"content":"you can take these ELMo representations and use them for pretty much any NLP task,"},{"from":2338.03,"to":2341.7,"location":2,"content":"and they can be very useful and give good gains."},{"from":2341.7,"to":2348.34,"location":2,"content":"And so, essentially why people got excited was because of the data that's in this table."},{"from":2348.34,"to":2351.25,"location":2,"content":"So here we're taking a whole bunch of very different tasks,"},{"from":2351.25,"to":2353.62,"location":2,"content":"so there's SQuAD question-answering, uh,"},{"from":2353.62,"to":2356.38,"location":2,"content":"there's natural language inference,"},{"from":2356.38,"to":2358.34,"location":2,"content":"there's semantic role labeling,"},{"from":2358.34,"to":2363.76,"location":2,"content":"there's co-reference, there's Named Entity Recognition, doing sentiment analysis,"},{"from":2363.76,"to":2366.73,"location":2,"content":"so a wide range of different NLP tasks,"},{"from":2366.73,"to":2370.32,"location":2,"content":"and they have a previous state of the art system."},{"from":2370.32,"to":2374.86,"location":2,"content":"They produced their own baseline um, which is,"},{"from":2374.86,"to":2380.08,"location":2,"content":"you know, commonly sort of similar to the previous state of the art,"},{"from":2380.08,"to":2383.62,"location":2,"content":"but usually actually a bit worse than"},{"from":2383.62,"to":2385.36,"location":2,"content":"the current state of the art because it's"},{"from":2385.36,"to":2388.32,"location":2,"content":"whatever simpler, cleaner system that they came up with,"},{"from":2388.32,"to":2391.34,"location":2,"content":"but then they could say in each case,"},{"from":2391.34,"to":2395.26,"location":2,"content":"oh, just take this system and add"},{"from":2395.26,"to":2399.99,"location":2,"content":"ELMo vectors into the hidden representations in the middle,"},{"from":2399.99,"to":2402.04,"location":2,"content":"and have those help you predict."},{"from":2402.04,"to":2404.71,"location":2,"content":"And in general, in all cases,"},{"from":2404.71,"to":2408.97,"location":2,"content":"that's giving you about a three percent or so gain absolute"},{"from":2408.97,"to":2413.47,"location":2,"content":"which was then producing this huge performance increase,"},{"from":2413.47,"to":2418.45,"location":2,"content":"which in all cases was moving the performance well above the previous,"},{"from":2418.45,"to":2420.04,"location":2,"content":"um, state of the art system."},{"from":2420.04,"to":2424,"location":2,"content":"So you know, this sort of then made it seem like magic pixie dust,"},{"from":2424,"to":2428.05,"location":2,"content":"because, you know, in the stakes of NLP conference land, you 
know,"},{"from":2428.05,"to":2430.96,"location":2,"content":"a lot of people use to try and to come up"},{"from":2430.96,"to":2434.5,"location":2,"content":"with a paper for the next year that's one percent better"},{"from":2434.5,"to":2437.08,"location":2,"content":"on one task and writing it up and that's"},{"from":2437.08,"to":2441.72,"location":2,"content":"their big breakthrough for the year to get their new paper out."},{"from":2441.72,"to":2444.36,"location":2,"content":"And the idea that there's just well this set of"},{"from":2444.36,"to":2448.05,"location":2,"content":"this way of creating context sensitive, um,"},{"from":2448.05,"to":2451.66,"location":2,"content":"word representations and you just use them in any task,"},{"from":2451.66,"to":2455.24,"location":2,"content":"and they'll give you around three percent and take you past the state of the art,"},{"from":2455.24,"to":2458.39,"location":2,"content":"this seemed like it was really great stuff."},{"from":2458.39,"to":2461.8,"location":2,"content":"And so people got very excited about this and that won"},{"from":2461.8,"to":2466.39,"location":2,"content":"the Best Paper Award at the NAACL 2018 conference."},{"from":2466.39,"to":2470.59,"location":2,"content":"Ah, and then, a- as I sort of vaguely mentioned,"},{"from":2470.59,"to":2474.37,"location":2,"content":"um, so the model that they actually used wasn't a deep stack,"},{"from":2474.37,"to":2477.52,"location":2,"content":"there were actually only two layers of biLSTMs,"},{"from":2477.52,"to":2482.62,"location":2,"content":"but they do show this interesting result that the lower level better captures"},{"from":2482.62,"to":2486.79,"location":2,"content":"low-level syntax word properties"},{"from":2486.79,"to":2490.39,"location":2,"content":"and its most useful things like part-of-speech tagging, syntactic"},{"from":2490.39,"to":2493.21,"location":2,"content":"dependencies, NER, where the top layer of"},{"from":2493.21,"to":2495.31,"location":2,"content":"their language model is better for"},{"from":2495.31,"to":2498.94,"location":2,"content":"higher level semantics that is more useful for things like sentiments,"},{"from":2498.94,"to":2502.49,"location":2,"content":"semantic role labeling and question answering."},{"from":2502.49,"to":2505.15,"location":2,"content":"Um, so that seemed interesting,"},{"from":2505.15,"to":2507.94,"location":2,"content":"though it'll actually be interesting to see how that panned"},{"from":2507.94,"to":2512.1,"location":2,"content":"out more if you had sort of more layers to play with."},{"from":2512.1,"to":2515.88,"location":2,"content":"Okay. 
ELMo, done."},{"from":2515.88,"to":2518.59,"location":2,"content":"Um, so I'm moving right ahead."},{"from":2518.59,"to":2525.55,"location":2,"content":"Um, here's something else that I just thought I should mention a little bit about,"},{"from":2525.55,"to":2529.27,"location":2,"content":"another piece of work that came out around the same time,"},{"from":2529.27,"to":2532.45,"location":2,"content":"a few months later maybe or maybe not,"},{"from":2532.45,"to":2534.43,"location":2,"content":"came out around the same time, uh,"},{"from":2534.43,"to":2538.42,"location":2,"content":"in, in 2018, was this work on"},{"from":2538.42,"to":2543.03,"location":2,"content":"Universal Language Model Fine-tuning for text classification,"},{"from":2543.03,"to":2545.99,"location":2,"content":"um, or ULMfit, by Howard and Ruder."},{"from":2545.99,"to":2551.34,"location":2,"content":"And essentially this had the same general idea of saying, Well,"},{"from":2551.34,"to":2560.56,"location":2,"content":"what we want to do is transfer learning where we could learn a big language model, um."},{"from":2560.56,"to":2563.07,"location":2,"content":"A big language model,"},{"from":2563.07,"to":2568.22,"location":2,"content":"and then for our target task which might be named entity recognition."},{"from":2568.22,"to":2570.2,"location":2,"content":"But here's text classification,"},{"from":2570.2,"to":2575.69,"location":2,"content":"we can transfer this language model information and help us to do better with the task."},{"from":2575.69,"to":2578.69,"location":2,"content":"And so, they proposed an architecture to do that."},{"from":2578.69,"to":2580.64,"location":2,"content":"And so, their architecture was,"},{"from":2580.64,"to":2587.96,"location":2,"content":"you have a big unsupervised corpus from which you train a neural language model."},{"from":2587.96,"to":2592.78,"location":2,"content":"They used the deeper neural language model with three hidden layers."},{"from":2592.78,"to":2594.92,"location":2,"content":"Um, you then fine tune"},{"from":2594.92,"to":2599.66,"location":2,"content":"your neural language model on the actual domain that you're interested in working in."},{"from":2599.66,"to":2602.26,"location":2,"content":"So, this was sort of an extra stage that they did."},{"from":2602.26,"to":2604.73,"location":2,"content":"And then finally, um,"},{"from":2604.73,"to":2608.96,"location":2,"content":"you now introduce your classification objectives."},{"from":2608.96,"to":2611.93,"location":2,"content":"So, what they're going to be doing is making text classifiers."},{"from":2611.93,"to":2613.53,"location":2,"content":"So, we're now wanting to,"},{"from":2613.53,"to":2619.28,"location":2,"content":"take this model and turn it from a language model into a text classifier."},{"from":2619.28,"to":2622.34,"location":2,"content":"Um, but there's something that they did differently, um,"},{"from":2622.34,"to":2623.72,"location":2,"content":"which is in some sense,"},{"from":2623.72,"to":2626.84,"location":2,"content":"foreshadows the later work in transformers."},{"from":2626.84,"to":2632.21,"location":2,"content":"So, rather than just feeding features from this into a completely different network,"},{"from":2632.21,"to":2638.71,"location":2,"content":"they keep using the same network but they introduce a different objective at the top."},{"from":2638.71,"to":2641.71,"location":2,"content":"So, one thing you could do with this network is use"},{"from":2641.71,"to":2645.01,"location":2,"content":"it to predict the 
next word as a language model."},{"from":2645.01,"to":2646.46,"location":2,"content":"And so at this point,"},{"from":2646.46,"to":2649.82,"location":2,"content":"they freeze the parameters of that softmax at the top,"},{"from":2649.82,"to":2651.45,"location":2,"content":"that's why it's shown in black."},{"from":2651.45,"to":2654.93,"location":2,"content":"Um, but instead, they could stick on"},{"from":2654.93,"to":2659.82,"location":2,"content":"a different prediction unit where it's predicting stuff for a particular task."},{"from":2659.82,"to":2661.61,"location":2,"content":"So, it might be predicting"},{"from":2661.61,"to":2666.68,"location":2,"content":"positive or negative sentiment in a text classification task or something like that."},{"from":2666.68,"to":2667.76,"location":2,"content":"So, in their model,"},{"from":2667.76,"to":2671.91,"location":2,"content":"they're sort of reusing the same network but sticking on the top of that,"},{"from":2671.91,"to":2676.2,"location":2,"content":"a different layer, to do the new classification task."},{"from":2676.2,"to":2679.7,"location":2,"content":"Um, they were also interested in something small,"},{"from":2679.7,"to":2683.61,"location":2,"content":"the sort of one GPU model of research, um,"},{"from":2683.61,"to":2687.62,"location":2,"content":"the paper has a lot of detail, the sort of tricks"},{"from":2687.62,"to":2692.15,"location":2,"content":"and care and feeding of your neural models to maximize performance."},{"from":2692.15,"to":2696.24,"location":2,"content":"If you're interested in that, you could sort of look up some of the details about that."},{"from":2696.24,"to":2700.25,"location":2,"content":"Um, but what they were able to show again,"},{"from":2700.25,"to":2703.82,"location":2,"content":"was making use of this language model pre-training was"},{"from":2703.82,"to":2707.49,"location":2,"content":"a very effective way to improve performance,"},{"from":2707.49,"to":2709.86,"location":2,"content":"this time for text classification."},{"from":2709.86,"to":2712.52,"location":2,"content":"So, these are text classification datasets,"},{"from":2712.52,"to":2714.26,"location":2,"content":"IMDb is for sentiment,"},{"from":2714.26,"to":2718.97,"location":2,"content":"um, TREC is for topical text classification, and again,"},{"from":2718.97,"to":2722.78,"location":2,"content":"there are preceding systems that other people have developed and they"},{"from":2722.78,"to":2726.62,"location":2,"content":"are showing that by making use of this language model pre-training,"},{"from":2726.62,"to":2731.39,"location":2,"content":"they're able to significantly improve on the state of the art of these error rates,"},{"from":2731.39,"to":2733.9,"location":2,"content":"so that low is good."},{"from":2733.9,"to":2739.72,"location":2,"content":"They also showed another interesting result which is kind of,"},{"from":2739.72,"to":2744.39,"location":2,"content":"um, what you would expect or hope from doing this kind of transfer learning,"},{"from":2744.39,"to":2746.33,"location":2,"content":"that what they were able to show is,"},{"from":2746.33,"to":2751.2,"location":2,"content":"if you can train this neural language model on a big amount of data,"},{"from":2751.2,"to":2754.43,"location":2,"content":"that that means you will then be able to do well on"},{"from":2754.43,"to":2759.11,"location":2,"content":"your supervised task even when trained on pretty little data."},{"from":2759.11,"to":2761.78,"location":2,"content":"Um, so, here this is error 
rate,"},{"from":2761.78,"to":2763.36,"location":2,"content":"so low is good."},{"from":2763.36,"to":2765.17,"location":2,"content":"So, what the- and here's the number of"},{"from":2765.17,"to":2768.82,"location":2,"content":"training examples which has being done on a log scale."},{"from":2768.82,"to":2771.71,"location":2,"content":"And so the blue line is if you're just training"},{"from":2771.71,"to":2775.73,"location":2,"content":"a text classifier from scratch on supervised data."},{"from":2775.73,"to":2779.76,"location":2,"content":"So, you need a lot of data to start to do pretty well."},{"from":2779.76,"to":2784.72,"location":2,"content":"Um, but if you're making use of this transfer learning, um,"},{"from":2784.72,"to":2787.89,"location":2,"content":"from a pre-trained language model,"},{"from":2787.89,"to":2790.31,"location":2,"content":"you can get to that you're sort of doing pretty"},{"from":2790.31,"to":2793.7,"location":2,"content":"well with way less, um, training examples."},{"from":2793.7,"to":2795.89,"location":2,"content":"Essentially, an order of magnitude,"},{"from":2795.89,"to":2799.66,"location":2,"content":"less training examples will give you the same amount of performance."},{"from":2799.66,"to":2804.02,"location":2,"content":"And the difference between these two lines corresponds to the extra,"},{"from":2804.02,"to":2808.67,"location":2,"content":"um, phase that they had in the middle of theirs, um, which is,"},{"from":2808.67,"to":2813.92,"location":2,"content":"whether you're doing this sort of extra fine tuning on your target domain,"},{"from":2813.92,"to":2818.69,"location":2,"content":"um, it's part of your process and they found that to be pretty helpful."},{"from":2818.69,"to":2825.22,"location":2,"content":"Okay. 
So, that, um, is another precursor."},{"from":2825.22,"to":2831.55,"location":2,"content":"Um, and so, one big part of what has happened since then,"},{"from":2831.55,"to":2835.82,"location":2,"content":"is effectively people said this is a good idea, uh,"},{"from":2835.82,"to":2841.91,"location":2,"content":"maybe it'll become a really really good idea if we just make things way bigger."},{"from":2841.91,"to":2844.25,"location":2,"content":"Um, so, ULMfit, um,"},{"from":2844.25,"to":2848.05,"location":2,"content":"was something that you could train in one GPU day,"},{"from":2848.05,"to":2851.87,"location":2,"content":"sounds appealing for CS224N final projects,"},{"from":2851.87,"to":2854.93,"location":2,"content":"remember that, um, and but well,"},{"from":2854.93,"to":2859.11,"location":2,"content":"then the people at OpenAI decided, well,"},{"from":2859.11,"to":2863.3,"location":2,"content":"we could build a pretrained language model and train it on"},{"from":2863.3,"to":2867.59,"location":2,"content":"a much larger amount of data on a much larger amount of compute,"},{"from":2867.59,"to":2874.13,"location":2,"content":"and use about 242 GPU days and that will get a lot better, and it did."},{"from":2874.13,"to":2877.19,"location":2,"content":"Um, and then the people at Google said,"},{"from":2877.19,"to":2880.45,"location":2,"content":"well we could train a model, um,"},{"from":2880.45,"to":2884.66,"location":2,"content":"in 256 TPU days,"},{"from":2884.66,"to":2887.64,"location":2,"content":"which means maybe about double the amount of computation."},{"from":2887.64,"to":2889.57,"location":2,"content":"It's hard to figure out exactly,"},{"from":2889.57,"to":2892.18,"location":2,"content":"and that might be able to do exciting things,"},{"from":2892.18,"to":2894.95,"location":2,"content":"and that was the BERT model, and it did."},{"from":2894.95,"to":2898.37,"location":2,"content":"Um, and then if you're following along these things, um,"},{"from":2898.37,"to":2900.11,"location":2,"content":"just last week, um,"},{"from":2900.11,"to":2902.27,"location":2,"content":"the OpenAI people said,"},{"from":2902.27,"to":2906.84,"location":2,"content":"well we can go much bigger again and we can train a model, um,"},{"from":2906.84,"to":2912.83,"location":2,"content":"for approximately 2,000 TPU version three days."},{"from":2912.83,"to":2916.34,"location":2,"content":"Um, and it will be able to,"},{"from":2916.34,"to":2919.29,"location":2,"content":"um, go much bigger again,"},{"from":2919.29,"to":2921.08,"location":2,"content":"and be much better again,"},{"from":2921.08,"to":2924.41,"location":2,"content":"um, and so, this is this, uh,"},{"from":2924.41,"to":2927.8,"location":2,"content":"GPT-2 language model, um,"},{"from":2927.8,"to":2930.68,"location":2,"content":"which OpenAI released last week."},{"from":2930.68,"to":2936.74,"location":2,"content":"Um, and there are, there are actually very impressive results, um,"},{"from":2936.74,"to":2940.73,"location":2,"content":"where they're showing that if you're sort of building a really,"},{"from":2940.73,"to":2945.16,"location":2,"content":"really huge language model over a very large amount of data."},{"from":2945.16,"to":2949.74,"location":2,"content":"And then you say, language model, go off and generate some text"},{"from":2949.74,"to":2951.8,"location":2,"content":"on this particular topic,"},{"from":2951.8,"to":2955.1,"location":2,"content":"that it can actually just do a great job of producing 
text."},{"from":2955.1,"to":2957.13,"location":2,"content":"So, the way this was being do- done,"},{"from":2957.13,"to":2959.93,"location":2,"content":"was a humanist writing a couple of sentences;"},{"from":2959.93,"to":2961.19,"location":2,"content":"in a shocking finding,"},{"from":2961.19,"to":2963.51,"location":2,"content":"scientists discovered a herd of unicorns,"},{"from":2963.51,"to":2967.7,"location":2,"content":"living in remote previously unexplored valley in the Andes Mountains."},{"from":2967.7,"to":2969.91,"location":2,"content":"Um, and so, we then,"},{"from":2969.91,"to":2973.7,"location":2,"content":"using our neural language model and chugging through that,"},{"from":2973.7,"to":2975.68,"location":2,"content":"so that gives us context,"},{"from":2975.68,"to":2977.76,"location":2,"content":"and then say generate more text,"},{"from":2977.76,"to":2979.76,"location":2,"content":"and it starts to generate the scientist"},{"from":2979.76,"to":2982.16,"location":2,"content":"named the population after their distinctive horn,"},{"from":2982.16,"to":2984.32,"location":2,"content":"Ovid's Unicorn, these four-horned,"},{"from":2984.32,"to":2987.82,"location":2,"content":"silver-white Uni four corns were previously unknown to science."},{"from":2987.82,"to":2990.08,"location":2,"content":"Um, it produces remarkably,"},{"from":2990.08,"to":2992.74,"location":2,"content":"um, good text or at least in the,"},{"from":2992.74,"to":2997.22,"location":2,"content":"in the hand-picked examples [LAUGHTER] that they showed in the tech news,"},{"from":2997.22,"to":2999.92,"location":2,"content":"um, it produces extremely good text."},{"from":2999.92,"to":3004.96,"location":2,"content":"Um, yeah so, I think one should be a little bit cautious about, um,"},{"from":3004.96,"to":3007.93,"location":2,"content":"that and sort of some of its random outputs actually"},{"from":3007.93,"to":3010.9,"location":2,"content":"aren't nearly as good but nevertheless you know,"},{"from":3010.9,"to":3012.89,"location":2,"content":"I think is is actually dramatic"},{"from":3012.89,"to":3016.54,"location":2,"content":"how good language models are becoming once you are training"},{"from":3016.54,"to":3023.28,"location":2,"content":"them on long contexts as we can do with modern models on vast amounts of data, um-."},{"from":3023.28,"to":3027.43,"location":2,"content":"So then, um, the OpenAI people decided"},{"from":3027.43,"to":3031.72,"location":2,"content":"this language model was so good that they weren't gonna release it to the world, um,"},{"from":3031.72,"to":3034.48,"location":2,"content":"which then got transformed into headlines of,"},{"from":3034.48,"to":3039.26,"location":2,"content":"Elon Musk's OpenAI builds artificial intelligence so powerful,"},{"from":3039.26,"to":3041.98,"location":2,"content":"it must be kept locked up for the good of humanity."},{"from":3041.98,"to":3046.66,"location":2,"content":"[LAUGHTER] Um, with the suitable pictures that always turn off at"},{"from":3046.66,"to":3052.07,"location":2,"content":"these moments down the bottom of the screen, um, and,"},{"from":3052.07,"to":3057.52,"location":2,"content":"um, yeah I guess that was the leading even Elon Musk to be wanting to clarify and say"},{"from":3057.52,"to":3063.02,"location":2,"content":"that it's not actually really that he's directing what's happening at OpenAI anymore."},{"from":3063.02,"to":3066.36,"location":2,"content":"Um, anyway, moving right along."},{"from":3066.36,"to":3069.76,"location":2,"content":"Um, so, 
part of the story here is"},{"from":3069.76,"to":3074.64,"location":2,"content":"just a scaling thing that these things have been getting bigger and bigger,"},{"from":3074.64,"to":3078.76,"location":2,"content":"um, but the other part of the story is that all three of"},{"from":3078.76,"to":3083.78,"location":2,"content":"these are then systems that use the transformer architecture."},{"from":3083.78,"to":3087.7,"location":2,"content":"And transformer architectures have not only been very powerful,"},{"from":3087.7,"to":3092.57,"location":2,"content":"but technically had allowed scaling to much bigger sizes."},{"from":3092.57,"to":3095.57,"location":2,"content":"So to understand some of the rest of these, um,"},{"from":3095.57,"to":3099.05,"location":2,"content":"we should learn more about transformers."},{"from":3099.05,"to":3102.61,"location":2,"content":"And so, I'm sort of gonna do that, um,"},{"from":3102.61,"to":3106.49,"location":2,"content":"but I mean, um, in a mix of orders,"},{"from":3106.49,"to":3110.2,"location":2,"content":"um, our invited speaker coming Thursday uh, is, um,"},{"from":3110.2,"to":3112.42,"location":2,"content":"one of the authors of the transformer paper,"},{"from":3112.42,"to":3114.49,"location":2,"content":"and he's gonna talk about transformers."},{"from":3114.49,"to":3117.43,"location":2,"content":"So I think what I'm gonna do is, um,"},{"from":3117.43,"to":3121,"location":2,"content":"say a little bit about transformers quickly,"},{"from":3121,"to":3124.09,"location":2,"content":"but not really dwell on all the details, um,"},{"from":3124.09,"to":3126.26,"location":2,"content":"but hope that it's a bit of an introduction,"},{"from":3126.26,"to":3130.36,"location":2,"content":"and you can find out more on Thursday about the details and"},{"from":3130.36,"to":3135.19,"location":2,"content":"then talk some more about the BERT model before finishing."},{"from":3135.19,"to":3139.45,"location":2,"content":"So the motivation for transformers is essentially"},{"from":3139.45,"to":3143.24,"location":2,"content":"we want things to go faster so we can build bigger models,"},{"from":3143.24,"to":3146.13,"location":2,"content":"and the problem as we mentioned for these, um,"},{"from":3146.13,"to":3151.06,"location":2,"content":"LSTMs or in general any of the recurrent models is the fact that they're recurrent."},{"from":3151.06,"to":3156.19,"location":2,"content":"You have to generate sort of one to n states, one at a time, chugging through,"},{"from":3156.19,"to":3161.28,"location":2,"content":"and that means you just can't do the same kind of parallel computation, um,"},{"from":3161.28,"to":3166.97,"location":2,"content":"that GPUs love, that you can do in things like convolutional neural networks."},{"from":3166.97,"to":3168.86,"location":2,"content":"But, you know, on the other hand,"},{"from":3168.86,"to":3171.21,"location":2,"content":"we discovered that even though, um,"},{"from":3171.21,"to":3176.01,"location":2,"content":"these gated recurrent units like LSTMs and GRUs are great,"},{"from":3176.01,"to":3180.07,"location":2,"content":"that to get really great performance out of these recurrent models,"},{"from":3180.07,"to":3185.68,"location":2,"content":"we found that we wanted to- we had a problem with these long sequence lengths,"},{"from":3185.68,"to":3189.01,"location":2,"content":"and we can improve things by adding attention mechanisms."},{"from":3189.01,"to":3192.07,"location":2,"content":"And so that led to the idea of- 
well,"},{"from":3192.07,"to":3194.43,"location":2,"content":"since attention works so great,"},{"from":3194.43,"to":3197.44,"location":2,"content":"maybe we can just use attention,"},{"from":3197.44,"to":3202.2,"location":2,"content":"and we can actually get rid of the recurrent part of the model [NOISE] altogether."},{"from":3202.2,"to":3207.63,"location":2,"content":"And so that actually then leads to the idea of these transformer architectures,"},{"from":3207.63,"to":3212.55,"location":2,"content":"and the original paper on this is actually called attention is all you need,"},{"from":3212.55,"to":3216.7,"location":2,"content":"which reflects this idea of we're gonna keep the attention part,"},{"from":3216.7,"to":3220,"location":2,"content":"and we're getting- going to get rid of the, um,"},{"from":3220,"to":3223.96,"location":2,"content":"recurrent part, and we'll be able to build a great model."},{"from":3223.96,"to":3225.31,"location":2,"content":"So in the initial work,"},{"from":3225.31,"to":3228.79,"location":2,"content":"what they're doing is machine translation kind of like"},{"from":3228.79,"to":3232.72,"location":2,"content":"the Neural Machine Translation with attention we described,"},{"from":3232.72,"to":3236.18,"location":2,"content":"but what they're wanting to do is build"},{"from":3236.18,"to":3243.63,"location":2,"content":"a complex encoder and a complex decoder that works non-recurrently,"},{"from":3243.63,"to":3247.66,"location":2,"content":"and, um, nevertheless is able to translate sentences"},{"from":3247.66,"to":3253.07,"location":2,"content":"well by making use of lots of attention distributions."},{"from":3253.07,"to":3258.07,"location":2,"content":"And so, I wanted to say a little bit more quickly about that,"},{"from":3258.07,"to":3260.97,"location":2,"content":"and hopefully we'll get more of this on Thursday."},{"from":3260.97,"to":3264.68,"location":2,"content":"Um, first as a- as a recommended resource,"},{"from":3264.68,"to":3266.55,"location":2,"content":"if you wanna look at, um,"},{"from":3266.55,"to":3269.7,"location":2,"content":"home and learn more about, um,"},{"from":3269.7,"to":3274,"location":2,"content":"the transformer architecture, there's this really great, um,"},{"from":3274,"to":3279.1,"location":2,"content":"bit of work by Sasha Rush called The Annotated Transformer that goes through"},{"from":3279.1,"to":3285.03,"location":2,"content":"the entire transformer paper accompanied by PyTorch code in a Jupyter Notebook,"},{"from":3285.03,"to":3288.22,"location":2,"content":"and so that can actually be a really useful thing,"},{"from":3288.22,"to":3294.24,"location":2,"content":"but I'll go through a little bit of the basics now of how we do things."},{"from":3294.24,"to":3297.46,"location":2,"content":"So the basic idea, um,"},{"from":3297.46,"to":3303.39,"location":2,"content":"is that they're going to use attention everywhere to calculate things."},{"from":3303.39,"to":3307.54,"location":2,"content":"And, um, we talked before about the different kinds of"},{"from":3307.54,"to":3312.52,"location":2,"content":"attention of the sort of multiplicative by linear attention and the little,"},{"from":3312.52,"to":3315.49,"location":2,"content":"um, feed-forward network additive attention."},{"from":3315.49,"to":3318.67,"location":2,"content":"They kind of go for the simplest kind of attention,"},{"from":3318.67,"to":3323.03,"location":2,"content":"where the attention is just dot-products between two 
things."},{"from":3323.03,"to":3326.86,"location":2,"content":"Um, but they sort of do the more comp- for various purposes,"},{"from":3326.86,"to":3332.83,"location":2,"content":"they do the more complicated version of dot-product between two things where they have,"},{"from":3332.83,"to":3336.28,"location":2,"content":"um, when the- the things that they're looking up are"},{"from":3336.28,"to":3340.38,"location":2,"content":"assumed to be key-value pairs, keys and values,"},{"from":3340.38,"to":3346.76,"location":2,"content":"and so you're calculating the similarity as a dot-product between a query and the key,"},{"from":3346.76,"to":3348.41,"location":2,"content":"and then based on that,"},{"from":3348.41,"to":3352.06,"location":2,"content":"you're going to be using the vector for the corresponding value."},{"from":3352.06,"to":3355.8,"location":2,"content":"So our equation here for what we're calculating is where you are"},{"from":3355.8,"to":3360.13,"location":2,"content":"looking using the softmax over query, um,"},{"from":3360.13,"to":3363.61,"location":2,"content":"key similarities and using that to give"},{"from":3363.61,"to":3368.68,"location":2,"content":"the weightings as an attention based weighting over the corresponding values."},{"from":3368.68,"to":3372.22,"location":2,"content":"Um, so that's the basic attention model."},{"from":3372.22,"to":3375.99,"location":2,"content":"Um, so that add- saying it that way, um,"},{"from":3375.99,"to":3378.1,"location":2,"content":"adds a little bit of complexity,"},{"from":3378.1,"to":3381.14,"location":2,"content":"but sort of for the simplest part for their encoder."},{"from":3381.14,"to":3386.07,"location":2,"content":"Actually, all of the query keys and values are exactly the same."},{"from":3386.07,"to":3388.22,"location":2,"content":"They are the words, um,"},{"from":3388.22,"to":3392.62,"location":2,"content":"that they're using as their source language, um, things."},{"from":3392.62,"to":3398.34,"location":2,"content":"So, it sort of adds some complexity that isn't really there."},{"from":3398.34,"to":3402.28,"location":2,"content":"Um, okay. Um, I'll skip that."},{"from":3402.28,"to":3408.18,"location":2,"content":"Um, so, there are a couple of other things that they do."},{"from":3408.18,"to":3412.16,"location":2,"content":"One thing that they note is that, um,"},{"from":3412.16,"to":3417.74,"location":2,"content":"the- the values you get from, um, QTK, um,"},{"from":3417.74,"to":3423.28,"location":2,"content":"very, in variances the dimension gets large"},{"from":3423.28,"to":3428.23,"location":2,"content":"so that they sort of do some normalization by the size of the hidden state dimension,"},{"from":3428.23,"to":3432.28,"location":2,"content":"but I'll leave that out as well for details, right."},{"from":3432.28,"to":3433.95,"location":2,"content":"So in the encoder, um,"},{"from":3433.95,"to":3437.02,"location":2,"content":"everything is just our word vectors,"},{"from":3437.02,"to":3440.38,"location":2,"content":"there are the queries, the keys, and the values."},{"from":3440.38,"to":3443.78,"location":2,"content":"Um, and we're gonna use attention everywhere in the system."},{"from":3443.78,"to":3449.86,"location":2,"content":"Oops. Okay. 
So the second new idea is, well,"},{"from":3449.86,"to":3456.11,"location":2,"content":"attention is great but maybe it's bad if you only have one attention distribution,"},{"from":3456.11,"to":3459.19,"location":2,"content":"because you're gonna only attend to things one way."},{"from":3459.19,"to":3462.41,"location":2,"content":"Maybe for various users it would be great"},{"from":3462.41,"to":3465.76,"location":2,"content":"if you could attend from one position to various things."},{"from":3465.76,"to":3471.19,"location":2,"content":"So, if you're thinking about syntax and what we did with dependency parsers."},{"from":3471.19,"to":3474.97,"location":2,"content":"If you're a word, you might want to attend to your headword,"},{"from":3474.97,"to":3479.16,"location":2,"content":"but you might also wanna attend- attend to your dependent words."},{"from":3479.16,"to":3481.69,"location":2,"content":"And if you happen to be a pronoun,"},{"from":3481.69,"to":3486.01,"location":2,"content":"you might want to attend to what the pronoun refers to you."},{"from":3486.01,"to":3487.86,"location":2,"content":"You might want to have lots of attention."},{"from":3487.86,"to":3492.01,"location":2,"content":"So they introduced this idea of multi-head attention."},{"from":3492.01,"to":3496.36,"location":2,"content":"And so what you're doing with multi-head attention is you have,"},{"from":3496.36,"to":3498.13,"location":2,"content":"um, your hidden states,"},{"from":3498.13,"to":3500.17,"location":2,"content":"um, in your system,"},{"from":3500.17,"to":3503.8,"location":2,"content":"and you map them via projection layers, um,"},{"from":3503.8,"to":3507.67,"location":2,"content":"which are just multiplications by different W matrices as"},{"from":3507.67,"to":3512.35,"location":2,"content":"linear projections into sort of different lower dimensional spaces,"},{"from":3512.35,"to":3517.03,"location":2,"content":"and then you use each of those to calculate dot-product attention,"},{"from":3517.03,"to":3520.27,"location":2,"content":"and so you can attend to different things at the same time."},{"from":3520.27,"to":3522.67,"location":2,"content":"And this multi-head attention was one of"},{"from":3522.67,"to":3528.66,"location":2,"content":"the very successful ideas of transformers that made them a more powerful architecture."},{"from":3528.66,"to":3534.72,"location":2,"content":"Okay. Um, so, then for our complete transformer block,"},{"from":3534.72,"to":3540.51,"location":2,"content":"it's sort of then starting to build complex architectures like we sort of started seeing,"},{"from":3540.51,"to":3542.2,"location":2,"content":"um, the other week."},{"from":3542.2,"to":3545.32,"location":2,"content":"Um, so- okay."},{"from":3545.32,"to":3546.97,"location":2,"content":"Yeah. 
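Here is a minimal PyTorch sketch of that multi-head idea, assuming self-attention over a single sequence; the sizes (512-dimensional model, 8 heads) follow the original transformer paper, but the class itself is an illustration rather than reference code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    # Project the hidden states into n_heads lower-dimensional subspaces
    # via different W matrices, run dot-product attention in each head
    # in parallel, then concatenate the heads and project back.
    def __init__(self, d_model=512, n_heads=8):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads, self.d_head = n_heads, d_model // n_heads
        self.w_q = nn.Linear(d_model, d_model)  # query projection
        self.w_k = nn.Linear(d_model, d_model)  # key projection
        self.w_v = nn.Linear(d_model, d_model)  # value projection
        self.w_o = nn.Linear(d_model, d_model)  # output projection

    def forward(self, x):
        # Self-attention: queries, keys, and values all come from x.
        n, d = x.shape                          # (sequence length, d_model)
        def split(t):                           # (n, d) -> (n_heads, n, d_head)
            return t.view(n, self.n_heads, self.d_head).transpose(0, 1)
        q, k, v = split(self.w_q(x)), split(self.w_k(x)), split(self.w_v(x))
        scores = q @ k.transpose(-2, -1) / self.d_head ** 0.5
        heads = F.softmax(scores, dim=-1) @ v          # attend in each head
        heads = heads.transpose(0, 1).reshape(n, d)    # concatenate the heads
        return self.w_o(heads)

out = MultiHeadAttention()(torch.randn(10, 512))  # 10 positions, d_model 512
```

Because each head attends in its own lower-dimensional subspace, one position can attend to several different things at once for roughly the cost of a single full-width attention.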
So, starting,"},{"from":3546.97,"to":3550.06,"location":2,"content":"um, from our word vectors,"},{"from":3550.06,"to":3556.91,"location":2,"content":"we're kind of going to do attention to multiple different things,"},{"from":3556.91,"to":3559.9,"location":2,"content":"um, and we're simultaneously gonna have"},{"from":3559.9,"to":3563.53,"location":2,"content":"a residual connection that short-circuits around them."},{"from":3563.53,"to":3568.05,"location":2,"content":"Um, we're then going to sort of sum the two of these,"},{"from":3568.05,"to":3573.11,"location":2,"content":"and then they're going to do a normalization at that point."},{"from":3573.11,"to":3576.4,"location":2,"content":"Um, I talked previously about batch normalization,"},{"from":3576.4,"to":3578.02,"location":2,"content":"they don't do batch normalization,"},{"from":3578.02,"to":3581.2,"location":2,"content":"they do another variant which is layer normalization,"},{"from":3581.2,"to":3583.86,"location":2,"content":"which is a different way of doing normalization,"},{"from":3583.86,"to":3585.63,"location":2,"content":"but I'll skip that for now."},{"from":3585.63,"to":3589,"location":2,"content":"And then they sort of for one transformer block,"},{"from":3589,"to":3592.05,"location":2,"content":"you then go after the multi-head attention,"},{"from":3592.05,"to":3596.76,"location":2,"content":"you put things through a feed-forward layer which also has a residual connection,"},{"from":3596.76,"to":3598.81,"location":2,"content":"you sum the output of those,"},{"from":3598.81,"to":3603.79,"location":2,"content":"and you then again do another, um, layer normalization."},{"from":3603.79,"to":3608.97,"location":2,"content":"So this is the basic transformer block that they're gonna use everywhere."},{"from":3608.97,"to":3611.32,"location":2,"content":"And to make their complete architectures,"},{"from":3611.32,"to":3613.21,"location":2,"content":"they're then gonna sort of start stacking"},{"from":3613.21,"to":3617.05,"location":2,"content":"these transformer blocks to produce a very deep network."},{"from":3617.05,"to":3618.16,"location":2,"content":"And in some sense,"},{"from":3618.16,"to":3622.78,"location":2,"content":"what has been found is that transformers performed very well."},{"from":3622.78,"to":3625,"location":2,"content":"But, you know, there's no free lunch,"},{"from":3625,"to":3626.44,"location":2,"content":"um, you kind of can't."},{"from":3626.44,"to":3628.15,"location":2,"content":"You're- now, no longer getting"},{"from":3628.15,"to":3631.45,"location":2,"content":"recurrent information actually being carried along a sequence."},{"from":3631.45,"to":3636.28,"location":2,"content":"You've got a word at some position which can be casting attention,"},{"from":3636.28,"to":3638.03,"location":2,"content":"uh, on other words."},{"from":3638.03,"to":3641.56,"location":2,"content":"So if you'd like to have information carried along in a chain,"},{"from":3641.56,"to":3644.98,"location":2,"content":"you've sort of first of all gotta walk the first step of the chain,"},{"from":3644.98,"to":3646.69,"location":2,"content":"and then you need to have another layer"},{"from":3646.69,"to":3649.69,"location":2,"content":"vertically which can walk the next step of the chain,"},{"from":3649.69,"to":3653.8,"location":2,"content":"and then you need to have another layer vertically that walks the next step of the chain."},{"from":3653.8,"to":3657.52,"location":2,"content":"So, you're getting rid of the recurrence along the 
sequence,"},{"from":3657.52,"to":3663.22,"location":2,"content":"but you're substituting some depth to allow things to walk along multiple hops."},{"from":3663.22,"to":3667.89,"location":2,"content":"But nevertheless, that's highly advantageous in GPU architectures"},{"from":3667.89,"to":3673.3,"location":2,"content":"because it allows you to use parallelization to calculate everything at each,"},{"from":3673.3,"to":3679.29,"location":2,"content":"um, depth at the same time. Um."},{"from":3679.29,"to":3682.9,"location":2,"content":"Maybe I'll go light on explaining this as well."},{"from":3682.9,"to":3685.42,"location":2,"content":"Um, so they use byte-pair encodings."},{"from":3685.42,"to":3687.49,"location":2,"content":"But if you do nothing else,"},{"from":3687.49,"to":3690.85,"location":2,"content":"you just have words fed in this word vectors and you have"},{"from":3690.85,"to":3694.76,"location":2,"content":"no idea whether you're at the beginning of the sentence or at the end of the sentence."},{"from":3694.76,"to":3698.68,"location":2,"content":"Though, they have a message of- method of doing positional encoding which gives"},{"from":3698.68,"to":3702.86,"location":2,"content":"you some ideas to pro- position your word has in the sentence."},{"from":3702.86,"to":3707.95,"location":2,"content":"Okay. Um, so that's sort of the, um, encoder system."},{"from":3707.95,"to":3709.54,"location":2,"content":"So from the words,"},{"from":3709.54,"to":3711.55,"location":2,"content":"they have an initial word embedding,"},{"from":3711.55,"to":3714.09,"location":2,"content":"you add in their positional encoding,"},{"from":3714.09,"to":3718.11,"location":2,"content":"you go into one of these transformer blocks,"},{"from":3718.11,"to":3721.03,"location":2,"content":"and you then repeat it n times."},{"from":3721.03,"to":3723.84,"location":2,"content":"So you'll have a stack of these transformer blocks."},{"from":3723.84,"to":3726.78,"location":2,"content":"So you're multiple times doing, um,"},{"from":3726.78,"to":3731.59,"location":2,"content":"multi-head attention to other parts of the sentence, calculating values,"},{"from":3731.59,"to":3732.94,"location":2,"content":"feeding forward a value,"},{"from":3732.94,"to":3734.86,"location":2,"content":"putting it through a fully-connected layer,"},{"from":3734.86,"to":3739.74,"location":2,"content":"and then you just sort of repeat, do attention to different places in the sentence."},{"from":3739.74,"to":3741.31,"location":2,"content":"Get all your information,"},{"from":3741.31,"to":3743.28,"location":2,"content":"put it through a fully connected layer,"},{"from":3743.28,"to":3746.76,"location":2,"content":"and go up, um, proceeding up deeply."},{"from":3746.76,"to":3751,"location":2,"content":"And and that sounds a little mysterious,"},{"from":3751,"to":3754.22,"location":2,"content":"but it turns out to work just great."},{"from":3754.22,"to":3756.6,"location":2,"content":"And the way to think about,"},{"from":3756.6,"to":3759.9,"location":2,"content":"I think is that at each stage,"},{"from":3759.9,"to":3764.76,"location":2,"content":"you can look with your multi-headed attention and various other places in the sentence,"},{"from":3764.76,"to":3768.21,"location":2,"content":"accumulate information, push it up to the next layer."},{"from":3768.21,"to":3771.26,"location":2,"content":"And if you do that sort of half a dozen times,"},{"from":3771.26,"to":3775.53,"location":2,"content":"you can be starting to progressively push information 
along"},{"from":3775.53,"to":3781.45,"location":2,"content":"the sequence in either direction to calculate values that are of interest."},{"from":3781.45,"to":3788.61,"location":2,"content":"Um, and the interesting thing is that these models turn out to work"},{"from":3788.61,"to":3795.97,"location":2,"content":"really well at sort of learning to attend the interesting things in linguistic structure."},{"from":3795.97,"to":3799.81,"location":2,"content":"Um, so these are just sort of suggestive diagrams,"},{"from":3799.81,"to":3804.19,"location":2,"content":"but this is looking at layer five of the transformer stack and"},{"from":3804.19,"to":3808.95,"location":2,"content":"seeing what words are being attended to by different attention heads."},{"from":3808.95,"to":3813.01,"location":2,"content":"So these different colors correspond to different attention heads."},{"from":3813.01,"to":3815.05,"location":2,"content":"And so the sentence is,"},{"from":3815.05,"to":3819.01,"location":2,"content":"um, it is, \"In this spirit,"},{"from":3819.01,"to":3822.31,"location":2,"content":"that a majority of American governments have passed new laws since"},{"from":3822.31,"to":3827.06,"location":2,"content":"2009 making the registration or voting process more difficult.\""},{"from":3827.06,"to":3833.28,"location":2,"content":"And so what we see is sort of most of the attention heads,"},{"from":3833.28,"to":3838.84,"location":2,"content":"uh, looking from making to making more difficult and that seems to be useful."},{"from":3838.84,"to":3843.7,"location":2,"content":"One of the attention heads seems to be looking at the word itself might be okay."},{"from":3843.7,"to":3850.57,"location":2,"content":"Um, then the other ones are sort of looking a bit at laws and at 2009."},{"from":3850.57,"to":3854.53,"location":2,"content":"So it's sort of picking out the arguments, um,"},{"from":3854.53,"to":3858.91,"location":2,"content":"and modifiers and making in a syntax kind of like way."},{"from":3858.91,"to":3861.88,"location":2,"content":"Um, interestingly, for pronouns,"},{"from":3861.88,"to":3866.77,"location":2,"content":"attention heads appear to learn to be able to look back to reference."},{"from":3866.77,"to":3868.8,"location":2,"content":"So the law will never be perfect,"},{"from":3868.8,"to":3875.18,"location":2,"content":"but its application should be just that one attention head it for its,"},{"from":3875.18,"to":3879.05,"location":2,"content":"is looking at what its is modifying in the application."},{"from":3879.05,"to":3880.93,"location":2,"content":"But another attention head,"},{"from":3880.93,"to":3885.64,"location":2,"content":"the its is looking strongly at what its refers back to as the law."},{"from":3885.64,"to":3887.74,"location":2,"content":"So that seems kind of cool."},{"from":3887.74,"to":3889.81,"location":2,"content":"Um, yeah."},{"from":3889.81,"to":3892.87,"location":2,"content":"Um, okay."},{"from":3892.87,"to":3896.03,"location":2,"content":"And so then, for the rest of the model, um,"},{"from":3896.03,"to":3898.99,"location":2,"content":"there's then some more complexity for how to use"},{"from":3898.99,"to":3905.02,"location":2,"content":"the transformers decoder to give you a full neural machine translation system."},{"from":3905.02,"to":3908.77,"location":2,"content":"But I think maybe I will skip that and go"},{"from":3908.77,"to":3913.75,"location":2,"content":"on and say a bit about BERT in my remaining 
minutes."},{"from":3913.75,"to":3918.49,"location":2,"content":"Okay. So, um, the latest and greatest contextual"},{"from":3918.49,"to":3923.59,"location":2,"content":"word representations to help you flow your tasks have been these BERT vectors,"},{"from":3923.59,"to":3929.97,"location":2,"content":"where BERT is Bidirectional Encoder Representations from Transformers."},{"from":3929.97,"to":3935.09,"location":2,"content":"And so essentially, it's using the encoder from a transformer network."},{"from":3935.09,"to":3940.2,"location":2,"content":"Uh, this deep multi-headed attention stack to calculate, um,"},{"from":3940.2,"to":3943.61,"location":2,"content":"a representation of a sentence and saying,"},{"from":3943.61,"to":3949.75,"location":2,"content":"\"That's a great all-purpose representation of a sentence that you can use for tasks."},{"from":3949.75,"to":3954.05,"location":2,"content":"Be it named entity recognition or SQuAD question answering.\""},{"from":3954.05,"to":3959.32,"location":2,"content":"And so there's actually an interesting new idea that these people had."},{"from":3959.32,"to":3964.99,"location":2,"content":"And that well, their idea was well standard language models are"},{"from":3964.99,"to":3968.23,"location":2,"content":"unidirectional and that's useful"},{"from":3968.23,"to":3971.76,"location":2,"content":"because it gives you a probability distribution of a language model."},{"from":3971.76,"to":3976.21,"location":2,"content":"But it's bad because you'd like to be able to do"},{"from":3976.21,"to":3981.19,"location":2,"content":"prediction from both sides to understand word meaning and context."},{"from":3981.19,"to":3983.72,"location":2,"content":"There's a second choice, um,"},{"from":3983.72,"to":3989.18,"location":2,"content":"which is you can kind of do bidirectional models when you incorporate,"},{"from":3989.18,"to":3991.7,"location":2,"content":"um, information in both ways."},{"from":3991.7,"to":3995.05,"location":2,"content":"But that sort of has problems as well,"},{"from":3995.05,"to":3997.48,"location":2,"content":"because then you get crosstalk."},{"from":3997.48,"to":4000.61,"location":2,"content":"Um, and so if you run a BiLSTM,"},{"from":4000.61,"to":4003.09,"location":2,"content":"and then you merge the representations by"},{"from":4003.09,"to":4006.76,"location":2,"content":"concatenation and then feed them into the next layer."},{"from":4006.76,"to":4008.66,"location":2,"content":"When you're running the next layer,"},{"from":4008.66,"to":4011.43,"location":2,"content":"the forward LSTM will have already gotten"},{"from":4011.43,"to":4014.39,"location":2,"content":"information about the future from the first layer."},{"from":4014.39,"to":4016.55,"location":2,"content":"Um, so it sort of, um,"},{"from":4016.55,"to":4020.49,"location":2,"content":"ends up with words that have already seen the future themselves."},{"from":4020.49,"to":4023.68,"location":2,"content":"So you have this sort of complex non-generative model."},{"from":4023.68,"to":4028.01,"location":2,"content":"Um, so somehow, they wanted to do things a bit differently,"},{"from":4028.01,"to":4033.6,"location":2,"content":"so they can have bidirectional context without words being able to see themselves."},{"from":4033.6,"to":4036.91,"location":2,"content":"And the idea that they came up with is well,"},{"from":4036.91,"to":4041.43,"location":2,"content":"we're gonna train things with a transformer encoder."},{"from":4041.43,"to":4046.51,"location":2,"content":"But what we're 
gonna do is mask out some of the words in the sentence,"},{"from":4046.51,"to":4050.16,"location":2,"content":"like, maybe we'll mask here store and gallon."},{"from":4050.16,"to":4054.18,"location":2,"content":"And then, so our language mod- our language modelling like"},{"from":4054.18,"to":4056.13,"location":2,"content":"objective will no longer be"},{"from":4056.13,"to":4060.09,"location":2,"content":"a true language model that's sort of generating a probability of a sentence,"},{"from":4060.09,"to":4063.7,"location":2,"content":"um, which is standardly done by working from left to right,"},{"from":4063.7,"to":4069.39,"location":2,"content":"but it will instead be a Mad Libs style fill in the blank objective."},{"from":4069.39,"to":4072.12,"location":2,"content":"So you'll see this context,"},{"from":4072.12,"to":4073.8,"location":2,"content":"which will be literally,"},{"from":4073.8,"to":4076.97,"location":2,"content":"\"The man went to the mask to buy a mask of milk.\""},{"from":4076.97,"to":4080.79,"location":2,"content":"And your, what's your training objective is to say,"},{"from":4080.79,"to":4083.43,"location":2,"content":"try and predict what this word is,"},{"from":4083.43,"to":4088.03,"location":2,"content":"which you can do with a cross entropy loss to the extent that you don't guess store."},{"from":4088.03,"to":4092.88,"location":2,"content":"And then, it will be trying to guess what this word is and you want to let guess gallon."},{"from":4092.88,"to":4094.99,"location":2,"content":"So you're training a model,"},{"from":4094.99,"to":4097.92,"location":2,"content":"um, to fill in these blanks."},{"from":4097.92,"to":4102.84,"location":2,"content":"Um, and the rate at which they blank words is essentially one word in seven,"},{"from":4102.84,"to":4105.23,"location":2,"content":"and they discuss how this is a trade-off."},{"from":4105.23,"to":4108.54,"location":2,"content":"Because if you blank too few words,"},{"from":4108.54,"to":4110.7,"location":2,"content":"it gets very expensive to train."},{"from":4110.7,"to":4112.59,"location":2,"content":"And if you blank many words,"},{"from":4112.59,"to":4115.55,"location":2,"content":"well you've blanked out most of the context of a word,"},{"from":4115.55,"to":4118.06,"location":2,"content":"and that means it's not very useful for training,"},{"from":4118.06,"to":4122.32,"location":2,"content":"and they found about sort of one in seven seemed to work pretty well for them."},{"from":4122.32,"to":4126.59,"location":2,"content":"But what they want to argue is, um,"},{"from":4126.59,"to":4131.22,"location":2,"content":"that for the OpenAI's GPT,"},{"from":4131.22,"to":4133.47,"location":2,"content":"which is also a transformer model."},{"from":4133.47,"to":4136.85,"location":2,"content":"It's a sort of a classic language model working from"},{"from":4136.85,"to":4140.7,"location":2,"content":"left to right and so you only get left context."},{"from":4140.7,"to":4143.81,"location":2,"content":"Um, for the BERT language model,"},{"from":4143.81,"to":4147.28,"location":2,"content":"sorry, the ELMo language model that's shown up at the top."},{"from":4147.28,"to":4151.68,"location":2,"content":"Um, well, they're running a left to right language model and they're running,"},{"from":4151.68,"to":4153.99,"location":2,"content":"um, right to left language models."},{"from":4153.99,"to":4156.03,"location":2,"content":"So in some sense, um,"},{"from":4156.03,"to":4158.3,"location":2,"content":"they have context from both 
sides."},{"from":4158.3,"to":4162.69,"location":2,"content":"But these two language models are trained completely independently"},{"from":4162.69,"to":4167.27,"location":2,"content":"and then you're just sort of concatenating their representations, um, together."},{"from":4167.27,"to":4172.17,"location":2,"content":"So there's no sense in which we're actually kind of having a model that's jointly"},{"from":4172.17,"to":4177.93,"location":2,"content":"using context from both sides at the time though that the pre-trained,"},{"from":4177.93,"to":4180.93,"location":2,"content":"um, contextual word representations are built."},{"from":4180.93,"to":4185.94,"location":2,"content":"So their hope is using inside a transformer model"},{"from":4185.94,"to":4187.98,"location":2,"content":"this trick of blanking out words,"},{"from":4187.98,"to":4193.29,"location":2,"content":"and predicting it using the entire context will allow them to use two-sided context,"},{"from":4193.29,"to":4195.54,"location":2,"content":"and be much more effective."},{"from":4195.54,"to":4200.02,"location":2,"content":"And that's what they seem to show, um."},{"from":4200.02,"to":4203.84,"location":2,"content":"There's one other complication and,"},{"from":4203.84,"to":4205.48,"location":2,"content":"I mean, I'll show later."},{"from":4205.48,"to":4209.84,"location":2,"content":"Um, this last complication is a bit useful,"},{"from":4209.84,"to":4213,"location":2,"content":"but it's sort of not really essential to their main idea,"},{"from":4213,"to":4214.85,"location":2,"content":"was that they thought,"},{"from":4214.85,"to":4218.55,"location":2,"content":"one of the, one of the goals in their head was clearly to be able to"},{"from":4218.55,"to":4222.66,"location":2,"content":"have this be useful for things like question answering,"},{"from":4222.66,"to":4225.08,"location":2,"content":"um, tasks, or, um,"},{"from":4225.08,"to":4226.77,"location":2,"content":"natural language inference tasks,"},{"from":4226.77,"to":4230.64,"location":2,"content":"and their relationships between, um, two sentences."},{"from":4230.64,"to":4232.26,"location":2,"content":"So, their idea was, well,"},{"from":4232.26,"to":4236.43,"location":2,"content":"one good objective is this fill in the blank word objective which is,"},{"from":4236.43,"to":4239.09,"location":2,"content":"sort of, like language modeling objective."},{"from":4239.09,"to":4242.31,"location":2,"content":"But they thought it would be useful to have a second objective"},{"from":4242.31,"to":4245.93,"location":2,"content":"where you're predicting relationships between sentences."},{"from":4245.93,"to":4251.41,"location":2,"content":"So, they secondly have a loss function which is, um,"},{"from":4251.41,"to":4254.67,"location":2,"content":"let's have two sentences where"},{"from":4254.67,"to":4258.36,"location":2,"content":"the sentences might be two successive sentences in the text,"},{"from":4258.36,"to":4262.65,"location":2,"content":"or a sentence followed by a random sentence from somewhere else."},{"from":4262.65,"to":4266.48,"location":2,"content":"And we want to train the system to predict when you've,"},{"from":4266.48,"to":4270.93,"location":2,"content":"seeing an- a correct next sentence versus a random sentence."},{"from":4270.93,"to":4276.33,"location":2,"content":"And so you're also training a loss based on this next sentence prediction task."},{"from":4276.33,"to":4279.66,"location":2,"content":"And so it'll be something like: The man went to the 
store."},{"from":4279.66,"to":4281.43,"location":2,"content":"He bought a gallon of milk."},{"from":4281.43,"to":4284.61,"location":2,"content":"You're meant to predict true is the next sentence,"},{"from":4284.61,"to":4286.74,"location":2,"content":"um: The man went to the store."},{"from":4286.74,"to":4288.09,"location":2,"content":"Penguins are flightless."},{"from":4288.09,"to":4289.52,"location":2,"content":"You're meant to say false."},{"from":4289.52,"to":4291.28,"location":2,"content":"This isn't the next sentence."},{"from":4291.28,"to":4293.58,"location":2,"content":"And so they're simultaneously also,"},{"from":4293.58,"to":4296.32,"location":2,"content":"um, training with this representation."},{"from":4296.32,"to":4300.35,"location":2,"content":"So, what they end up looks, looks like this."},{"from":4300.35,"to":4304.24,"location":2,"content":"Um, so, they have,"},{"from":4304.24,"to":4305.49,"location":2,"content":"um, for the input,"},{"from":4305.49,"to":4307.17,"location":2,"content":"they'll have a pair of sentences."},{"from":4307.17,"to":4308.7,"location":2,"content":"My dog is cute."},{"from":4308.7,"to":4310.1,"location":2,"content":"Um, separator."},{"from":4310.1,"to":4311.93,"location":2,"content":"He likes playing."},{"from":4311.93,"to":4317.95,"location":2,"content":"Um, the words are represented as word pieces like we talked about last week."},{"from":4317.95,"to":4321.57,"location":2,"content":"Um, so there's a token embedding for each word piece."},{"from":4321.57,"to":4325.35,"location":2,"content":"Um, then there's a positional embedding for"},{"from":4325.35,"to":4329.53,"location":2,"content":"each word piece which is gonna be summed with the token embedding."},{"from":4329.53,"to":4334.47,"location":2,"content":"And then finally, there's a segment embedding for each word piece which is simply"},{"from":4334.47,"to":4337.05,"location":2,"content":"whether it comes from the first sentence or"},{"from":4337.05,"to":4339.91,"location":2,"content":"the second sentence before or after the separator."},{"from":4339.91,"to":4344.94,"location":2,"content":"So, you're summing those three things together to get the token representations."},{"from":4344.94,"to":4348.91,"location":2,"content":"And then you're going to use those in a transformer model"},{"from":4348.91,"to":4353.84,"location":2,"content":"where you will have losses to the extent that you can't predict the masked words."},{"from":4353.84,"to":4358.41,"location":2,"content":"And then your binary prediction function as to whether there's"},{"from":4358.41,"to":4363.52,"location":2,"content":"a correct next sentence or not which is the training architecture."},{"from":4363.52,"to":4367.48,"location":2,"content":"Okay. 
So, it's a transformer as before,"},{"from":4367.48,"to":4370.74,"location":2,"content":"it's trained on Wikipedia plus the BookCorpus."},{"from":4370.74,"to":4372.72,"location":2,"content":"And they built two models."},{"from":4372.72,"to":4377.18,"location":2,"content":"Um, the Base-BERT model was a twelve layer transformer."},{"from":4377.18,"to":4382.47,"location":2,"content":"And so this corresponded to what the previous transformer paper had used, right?"},{"from":4382.47,"to":4389.19,"location":2,"content":"Those two layer transformer blocks repeated six times gave you 12 layers with 768 hidden,"},{"from":4389.19,"to":4394.66,"location":2,"content":"um, dimension hidden states and 12 heads for the multi-head attention."},{"from":4394.66,"to":4396.48,"location":2,"content":"And then they went bigger,"},{"from":4396.48,"to":4398.61,"location":2,"content":"um, and trained BERT-Large which is,"},{"from":4398.61,"to":4400.62,"location":2,"content":"sort of, double the number of layers,"},{"from":4400.62,"to":4403.48,"location":2,"content":"bigger hidden states, even more attention heads."},{"from":4403.48,"to":4406.41,"location":2,"content":"Um, and training these on,"},{"from":4406.41,"to":4409.19,"location":2,"content":"um, pods of TPUs."},{"from":4409.19,"to":4413.85,"location":2,"content":"Um, so, first of all, you're training, um,"},{"from":4413.85,"to":4418.26,"location":2,"content":"on this basis for masked words and,"},{"from":4418.26,"to":4420.38,"location":2,"content":"um, next sentence or not."},{"from":4420.38,"to":4425.94,"location":2,"content":"Um, so then what they wanted to say was this pre-trained model,"},{"from":4425.94,"to":4431.69,"location":2,"content":"um, evaluated on these losses and masked language model and next sentence prediction."},{"from":4431.69,"to":4434.93,"location":2,"content":"Um, we could then take this model,"},{"from":4434.93,"to":4439.05,"location":2,"content":"fr- freeze most of its what weak. 
No, sorry, that's wrong."},{"from":4439.05,"to":4441.27,"location":2,"content":"We could take this model, um,"},{"from":4441.27,"to":4446.61,"location":2,"content":"pre-trained and it would be incredibly useful for various different tasks."},{"from":4446.61,"to":4448.8,"location":2,"content":"We could use it for named entity recognition,"},{"from":4448.8,"to":4452.31,"location":2,"content":"question answering, natural language inference et cetera."},{"from":4452.31,"to":4454.89,"location":2,"content":"And the way we're going to do it, is kind of,"},{"from":4454.89,"to":4458.55,"location":2,"content":"doing the same thing as the ULMFit model did."},{"from":4458.55,"to":4460.76,"location":2,"content":"We're not just going to say here's our,"},{"from":4460.76,"to":4465.24,"location":2,"content":"here's a contextual word representation like ELMo did."},{"from":4465.24,"to":4469.56,"location":2,"content":"Instead, what we're gonna say is just keep on using this,"},{"from":4469.56,"to":4472.23,"location":2,"content":"keep on using this um,"},{"from":4472.23,"to":4476.88,"location":2,"content":"transformer network that we trained as a, sort of,"},{"from":4476.88,"to":4482.53,"location":2,"content":"language model, but fine tune it for a particular task."},{"from":4482.53,"to":4485.19,"location":2,"content":"So, you're now going to run this transformer"},{"from":4485.19,"to":4489.18,"location":2,"content":"calculating representations for a particular task."},{"from":4489.18,"to":4495.99,"location":2,"content":"And what we're going to change is we're going to remove the very top-level prediction."},{"from":4495.99,"to":4500.41,"location":2,"content":"The bits that predict the mass language model and next sentence prediction."},{"from":4500.41,"to":4502.77,"location":2,"content":"And we're going to substitute on it,"},{"from":4502.77,"to":4508.08,"location":2,"content":"on top, um, a final prediction layer that's appropriate for the task."},{"from":4508.08,"to":4511.01,"location":2,"content":"So, if our task is SQuAD question answering,"},{"from":4511.01,"to":4516.34,"location":2,"content":"our final prediction layer will be predicting start of span and end of span,"},{"from":4516.34,"to":4520.74,"location":2,"content":"kind of, like when we saw DrQA a couple of weeks ago."},{"from":4520.74,"to":4523.98,"location":2,"content":"If what we're doing is the NER task,"},{"from":4523.98,"to":4526.89,"location":2,"content":"our final prediction layer will be predicting"},{"from":4526.89,"to":4533.9,"location":2,"content":"the net- named entity recognition class of each token just like a standard NER system."},{"from":4533.9,"to":4542.77,"location":2,"content":"Okay, um, and so they built this system and tested it on a whole bunch of data sets."},{"from":4542.77,"to":4545.61,"location":2,"content":"Um, one of the main things they tested on was"},{"from":4545.61,"to":4548.63,"location":2,"content":"this GLUE data set which has a whole bunch of tasks."},{"from":4548.63,"to":4550.17,"location":2,"content":"A lot of the tasks, they're,"},{"from":4550.17,"to":4553.53,"location":2,"content":"uh, natural language inference tasks."},{"from":4553.53,"to":4557.2,"location":2,"content":"And I've kept saying that phrase all of this lecture but I haven't really defined it."},{"from":4557.2,"to":4560.82,"location":2,"content":"So, with a natural language inference you're given two sentences"},{"from":4560.82,"to":4565.94,"location":2,"content":"like: Hills and mountains are especially sanctified in 
Jainism."},{"from":4565.94,"to":4569.55,"location":2,"content":"And then you can write a hypothesis on: Jainism hates nature."},{"from":4569.55,"to":4571.53,"location":2,"content":"And what you're meant to say is,"},{"from":4571.53,"to":4573.57,"location":2,"content":"whether the hypothesis, um,"},{"from":4573.57,"to":4575.51,"location":2,"content":"follows from the premise,"},{"from":4575.51,"to":4579.24,"location":2,"content":"contradicts the premise, or has no relation to the premise."},{"from":4579.24,"to":4581.27,"location":2,"content":"So, that's a three-way classification."},{"from":4581.27,"to":4583.85,"location":2,"content":"And so here it contradicts the premise."},{"from":4583.85,"to":4590.11,"location":2,"content":"Um, there are various other tasks such as this linguistic acceptability task."},{"from":4590.11,"to":4593.55,"location":2,"content":"Um, but if we look at these, um, GLUE tasks."},{"from":4593.55,"to":4597.73,"location":2,"content":"Um, these are showing the Pre-OpenAI State Of The Art."},{"from":4597.73,"to":4600.73,"location":2,"content":"How well, um, ELMo works."},{"from":4600.73,"to":4603.9,"location":2,"content":"How well OpenAI GPT works,"},{"from":4603.9,"to":4608.41,"location":2,"content":"and then how well do small and large BERT models work."},{"from":4608.41,"to":4613.29,"location":2,"content":"And effectively, what you're finding is,"},{"from":4613.29,"to":4617.37,"location":2,"content":"um, that the OpenAI GPT was so,"},{"from":4617.37,"to":4618.49,"location":2,"content":"you know, pretty good."},{"from":4618.49,"to":4622.45,"location":2,"content":"It showed actually good advances on most of these tasks."},{"from":4622.45,"to":4625.89,"location":2,"content":"For many, but not all of them that broke the previous state of the art,"},{"from":4625.89,"to":4628.99,"location":2,"content":"showing the power of these contextual language models."},{"from":4628.99,"to":4635.2,"location":2,"content":"But the bidirectional form of BERT's prediction just seemed much better again."},{"from":4635.2,"to":4639.18,"location":2,"content":"So, going from this line to this line you're getting depending on"},{"from":4639.18,"to":4643.19,"location":2,"content":"the task about two percent better performance."},{"from":4643.19,"to":4647.01,"location":2,"content":"And so the BERT people actually did their experiments carefully."},{"from":4647.01,"to":4650.43,"location":2,"content":"So, these models are pretty comparable in terms of size,"},{"from":4650.43,"to":4653.77,"location":2,"content":"but the bidirectional context seems to really help."},{"from":4653.77,"to":4655.47,"location":2,"content":"And then what they found was,"},{"from":4655.47,"to":4657.57,"location":2,"content":"well, by going to just a bigger model,"},{"from":4657.57,"to":4661.55,"location":2,"content":"again, you could get another big lift in performance."},{"from":4661.55,"to":4664.74,"location":2,"content":"And so you're getting for many of the tasks about"},{"from":4664.74,"to":4668.15,"location":2,"content":"another two percent lift in performance going into the bigger model."},{"from":4668.15,"to":4671.01,"location":2,"content":"So, this really produced super-strong results."},{"from":4671.01,"to":4674.09,"location":2,"content":"And in general, um, people have found,"},{"from":4674.09,"to":4677.4,"location":2,"content":"um, that BERT continues to give super strong results."},{"from":4677.4,"to":4681.48,"location":2,"content":"So, if I return back to my ConLL NER 
task,"},{"from":4681.48,"to":4685.26,"location":2,"content":"we had ELMo giving you 92.2,"},{"from":4685.26,"to":4686.64,"location":2,"content":"um, and you, sort of,"},{"from":4686.64,"to":4688.05,"location":2,"content":"continue to get gains."},{"from":4688.05,"to":4693.9,"location":2,"content":"So, BERT Base gets you to 92.4 and BERT Large takes you to 92.8."},{"from":4693.9,"to":4697.65,"location":2,"content":"Though in, um, truth in, truth in description,"},{"from":4697.65,"to":4703.13,"location":2,"content":"there is now a system of beats BERT Large on NER which is actually a character-level,"},{"from":4703.13,"to":4705.99,"location":2,"content":"um, transformer language model from Flair."},{"from":4705.99,"to":4707.84,"location":2,"content":"Um, but, you know,"},{"from":4707.84,"to":4710.79,"location":2,"content":"this continued over to a lot of other things."},{"from":4710.79,"to":4713.86,"location":2,"content":"So, on SQuAD 1.1, um,"},{"from":4713.86,"to":4716.37,"location":2,"content":"BERT immediately just outperformed"},{"from":4716.37,"to":4719.74,"location":2,"content":"everything else that people have been working on for SQuAD for ages."},{"from":4719.74,"to":4722.61,"location":2,"content":"In particular, what was especially dramatic, um,"},{"from":4722.61,"to":4725.98,"location":2,"content":"was the sing- a single BERT model, um,"},{"from":4725.98,"to":4730.77,"location":2,"content":"beat everything else that had been done previously on SQuAD version 1.1,"},{"from":4730.77,"to":4733.57,"location":2,"content":"even though they could also show that an"},{"from":4733.57,"to":4739.81,"location":2,"content":"ensemble of BERT models could give further good, um, performance gains."},{"from":4739.81,"to":4743.06,"location":2,"content":"Um, and as I've mentioned before,"},{"from":4743.06,"to":4745.98,"location":2,"content":"essentially if you look at the SQuAD 2.0, um,"},{"from":4745.98,"to":4748.94,"location":2,"content":"leaderboard, all of the top ranked systems,"},{"from":4748.94,"to":4752.28,"location":2,"content":"um, are using BERT one place or another."},{"from":4752.28,"to":4754.59,"location":2,"content":"Um, and so that,"},{"from":4754.59,"to":4756.06,"location":2,"content":"sort of, led into this,"},{"from":4756.06,"to":4759.57,"location":2,"content":"sort of, new world order, um, that, okay,"},{"from":4759.57,"to":4762.73,"location":2,"content":"it seems like the state of NLP now is to,"},{"from":4762.73,"to":4765.24,"location":2,"content":"if you want to have the best performance,"},{"from":4765.24,"to":4766.41,"location":2,"content":"you want to be using"},{"from":4766.41,"to":4771.85,"location":2,"content":"these deep pre-trained transformer stacks to get the best performance."},{"from":4771.85,"to":4773.22,"location":2,"content":"And so this is, sort of, making,"},{"from":4773.22,"to":4775.41,"location":2,"content":"um, NLP more like vision."},{"from":4775.41,"to":4778.56,"location":2,"content":"Because really vision for five years has had"},{"from":4778.56,"to":4782.73,"location":2,"content":"these deep pre-trained neural network stacks, um, like ResNets."},{"from":4782.73,"to":4787.12,"location":2,"content":"Where for most vision tasks what you do is you take a pre-trained ResNet,"},{"from":4787.12,"to":4789.87,"location":2,"content":"and then you fine tune a layer at the top to"},{"from":4789.87,"to":4792.87,"location":2,"content":"do some classification tasks you're interested in."},{"from":4792.87,"to":4794.97,"location":2,"content":"And this is, sort of, now, 
um,"},{"from":4794.97,"to":4797.52,"location":2,"content":"starting to be what's happening in NLP as well."},{"from":4797.52,"to":4800.28,"location":2,"content":"That you can do the same thing by downloading"},{"from":4800.28,"to":4805.88,"location":2,"content":"your pre-trained BERT and fine tuning it to do some particular performance task."},{"from":4805.88,"to":4809.4,"location":2,"content":"Okay, um, that's it for today and more on"},{"from":4809.4,"to":4818.33,"location":2,"content":"transformers on Thursday [NOISE]."}]} \ No newline at end of file diff --git a/bcc-en/14.bcc b/bcc-en/14.bcc new file mode 100644 index 0000000000000000000000000000000000000000..c17c659cb3c9ccc8f4079070e009db7922d5a1cc --- /dev/null +++ b/bcc-en/14.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.52,"to":8.67,"location":2,"content":"Okay. So I'm delighted to introduce,"},{"from":8.67,"to":11.36,"location":2,"content":"um, our first lot of invited speakers."},{"from":11.36,"to":14.46,"location":2,"content":"And so we're gonna have two invited speakers, um, today."},{"from":14.46,"to":16.26,"location":2,"content":"So starting off, um,"},{"from":16.26,"to":18.96,"location":2,"content":"we go and have Ashish Vaswani who's gonna be"},{"from":18.96,"to":23.11,"location":2,"content":"talking about self attention for generative models and in particular,"},{"from":23.11,"to":25.53,"location":2,"content":"um, we'll introduce some of the work on"},{"from":25.53,"to":29.19,"location":2,"content":"transformers that he is well-known for along with his colleagues."},{"from":29.19,"to":32.22,"location":2,"content":"Um and then as a sort of, um,"},{"from":32.22,"to":35.23,"location":2,"content":"a special edition then we're also going to have"},{"from":35.23,"to":39.44,"location":2,"content":"Anna Huang talking about some applications of this work."},{"from":39.44,"to":41.54,"location":2,"content":"There are actually at least a couple of people in the class who are"},{"from":41.54,"to":43.79,"location":2,"content":"actually interested in music applications."},{"from":43.79,"to":48.74,"location":2,"content":"So this will be your one chance in the course to see music applications of deep learning."},{"from":48.74,"to":51.89,"location":2,"content":"Okay, um, so I'll hand it over to Ashish."},{"from":51.89,"to":53.36,"location":2,"content":"Thanks, Chris and, uh, thanks, Evie."},{"from":53.36,"to":55.94,"location":2,"content":"Uh, Anna is actually here to make the class less dull."},{"from":55.94,"to":57.83,"location":2,"content":"So [LAUGHTER] she's the highlight on this one."},{"from":57.83,"to":60.84,"location":2,"content":"So uh, so, uh, hi everyone."},{"from":60.84,"to":63.4,"location":2,"content":"Um, um, uh excited to be here."},{"from":63.4,"to":66.2,"location":2,"content":"This is a very large class."},{"from":66.2,"to":67.92,"location":2,"content":"Uh, first invited speaker,"},{"from":67.92,"to":69.92,"location":2,"content":"no pressure, so hopefully this will all go well."},{"from":69.92,"to":74.97,"location":2,"content":"Uh, so yes, so the talk is going to be about, uh, self attention."},{"from":74.97,"to":78.34,"location":2,"content":"Um, and so the purpose is,"},{"from":78.34,"to":82.69,"location":2,"content":"is not going to be just to talk about a particular model, but, as,"},{"from":82.69,"to":85.85,"location":2,"content":"as, as, as empiricists and, and,"},{"from":85.85,"to":87.83,"location":2,"content":"like, 
well, I'm an empiricist and I"},{"from":87.83,"to":90.38,"location":2,"content":"consume machine learning to apply it to various tasks."},{"from":90.38,"to":95.21,"location":2,"content":"And, and, and, well, starting point always is to ask this question, you know,"},{"from":95.21,"to":96.8,"location":2,"content":"what are the- what's the structure in"},{"from":96.8,"to":98.72,"location":2,"content":"my dataset or what are the symmetries in my dataset,"},{"from":98.72,"to":101.88,"location":2,"content":"and is there a model that exists that that's a very good- that,"},{"from":101.88,"to":106.01,"location":2,"content":"that has the inductive biases to model these properties that exist in my dataset."},{"from":106.01,"to":108.45,"location":2,"content":"So hopefully, over the course of this, uh,"},{"from":108.45,"to":111.68,"location":2,"content":"this, this lecture Anna and I will convince you that, uh,"},{"from":111.68,"to":114.41,"location":2,"content":"self attention indeed does have some- has"},{"from":114.41,"to":116.17,"location":2,"content":"the ability models and inductive biases that"},{"from":116.17,"to":118.69,"location":2,"content":"potentially could be useful for the problems that you care about."},{"from":118.69,"to":124.79,"location":2,"content":"Um, so, um, this talk is going to be our learning representations primarily of,"},{"from":124.79,"to":127.44,"location":2,"content":"uh, variable length data where we have images but,"},{"from":127.44,"to":130.19,"location":2,"content":"uh, most of it is going to be variable length data."},{"from":130.19,"to":131.96,"location":2,"content":"And, uh, and, and,"},{"from":131.96,"to":134.97,"location":2,"content":"and all of us care about this problem because we- in"},{"from":134.97,"to":137.99,"location":2,"content":"deep learning, and deep learning is all about representation learning."},{"from":137.99,"to":142.66,"location":2,"content":"And if- and building the right tools for learning representations as,"},{"from":142.66,"to":144.83,"location":2,"content":"as, as, as sort of- is an important factor in,"},{"from":144.83,"to":146.6,"location":2,"content":"in achieving empirical success."},{"from":146.6,"to":149.81,"location":2,"content":"Um, now, uh, the models of choice,"},{"from":149.81,"to":152.16,"location":2,"content":"the primary workhorse for"},{"from":152.16,"to":155.6,"location":2,"content":"perhaps even now and or up to this point had been recurrent neural networks."},{"from":155.6,"to":160.73,"location":2,"content":"Um, um, how, how many people here are familiar with RNNs?"},{"from":160.73,"to":163.06,"location":2,"content":"[LAUGHTER] Okay."},{"from":163.06,"to":165.26,"location":2,"content":"So definitely up to this point,"},{"from":165.26,"to":167.69,"location":2,"content":"the primary workhorse have been recurrent neural networks,"},{"from":167.69,"to":170.47,"location":2,"content":"and some of the more, uh, some, uh,"},{"from":170.47,"to":174.54,"location":2,"content":"some gated variants that explicitly add multiplicative interactions like LSTMs,"},{"from":174.54,"to":178.18,"location":2,"content":"they also, they also have mechanisms that allow for better gradient transfer."},{"from":178.18,"to":180.62,"location":2,"content":"And some recent variants like gated, uh,"},{"from":180.62,"to":182.3,"location":2,"content":"recurrent units that are simplification,"},{"from":182.3,"to":186.86,"location":2,"content":"they're kind of the- they're- they dominate this, this recurrent 
landscape."},{"from":186.86,"to":189.6,"location":2,"content":"Um, and typically how did recurrent neural networks, uh,"},{"from":189.6,"to":192.68,"location":2,"content":"learn or, um, produce representations?"},{"from":192.68,"to":195.76,"location":2,"content":"They consume a string or a sentence, um,"},{"from":195.76,"to":197.63,"location":2,"content":"even an image, imagine, you know,"},{"from":197.63,"to":201.13,"location":2,"content":"in a particular- in sequentially and, uh, at each,"},{"from":201.13,"to":202.53,"location":2,"content":"at each, uh, position,"},{"from":202.53,"to":204.9,"location":2,"content":"at each timestep they produce, they produce a,"},{"from":204.9,"to":206.9,"location":2,"content":"a continuous representation that's"},{"from":206.9,"to":210.84,"location":2,"content":"summarization of, of everything that they've actually crunched through."},{"from":210.84,"to":216.96,"location":2,"content":"Um, now, so in, in, in the,"},{"from":216.96,"to":219.48,"location":2,"content":"in the realm of large data, uh,"},{"from":219.48,"to":221.31,"location":2,"content":"par- having parallel models is,"},{"from":221.31,"to":222.89,"location":2,"content":"is quite, is quite beneficial."},{"from":222.89,"to":225.14,"location":2,"content":"In fact, I was actually reading Oliver Selfridge."},{"from":225.14,"to":226.37,"location":2,"content":"Uh, he was a,"},{"from":226.37,"to":228.77,"location":2,"content":"he was a professor at MIT and, uh, he had this,"},{"from":228.77,"to":233.36,"location":2,"content":"uh, sorry, he wrote the precursor to deep nets its it's called Pandemoniums."},{"from":233.36,"to":234.59,"location":2,"content":"I would recommend everybody to read it."},{"from":234.59,"to":236.45,"location":2,"content":"And he has this fascinating note that, you know,"},{"from":236.45,"to":237.96,"location":2,"content":"if you give me more parallel computation,"},{"from":237.96,"to":239.9,"location":2,"content":"I'll just add more data and make it slower."},{"from":239.9,"to":242.18,"location":2,"content":"So you can consume more data."},{"from":242.18,"to":247.26,"location":2,"content":"Um, and, and recurrence, uh, recurrence sort of just by construction, um,"},{"from":247.26,"to":248.91,"location":2,"content":"limits parallelization because you have to,"},{"from":248.91,"to":251.1,"location":2,"content":"you have to wait until- your wait un-"},{"from":251.1,"to":254.03,"location":2,"content":"for a particular time point to produce a representation."},{"from":254.03,"to":256.27,"location":2,"content":"Um, but if there's any questions,"},{"from":256.27,"to":257.32,"location":2,"content":"please raise your hands, I'll"},{"from":257.32,"to":258.9,"location":2,"content":"hopefully look around and, and,"},{"from":258.9,"to":261.23,"location":2,"content":"uh, be able to attend to your question."},{"from":261.23,"to":264.92,"location":2,"content":"Um, and again, and, and now because we're actually producing these representations,"},{"from":264.92,"to":266.15,"location":2,"content":"we're sort of summarizing,"},{"from":266.15,"to":267.74,"location":2,"content":"you know, if you want to pass information,"},{"from":267.74,"to":269.76,"location":2,"content":"if you want to pass co-reference information,"},{"from":269.76,"to":272.35,"location":2,"content":"then we kind of have to shove all of this inside"},{"from":272.35,"to":276.11,"location":2,"content":"this fixed size vector, so it could potentially be difficult to model."},{"from":276.11,"to":279.63,"location":2,"content":"And, uh, 
while they have been successful in language, uh,"},{"from":279.63,"to":282.53,"location":2,"content":"explicit they don't have- the architecture"},{"from":282.53,"to":285.89,"location":2,"content":"doesn't have a very clear explicit way to model hierarchy which is,"},{"from":285.89,"to":288.15,"location":2,"content":"which is something that's very important in language."},{"from":288.15,"to":294.39,"location":2,"content":"Um, now, um, so there has been devel- there has been excellent work on,"},{"from":294.39,"to":298.64,"location":2,"content":"a precursor to self attention that actually surmounted some of these difficulties."},{"from":298.64,"to":301.55,"location":2,"content":"And what surmounted these difficulties basically is convolutional sequence models"},{"from":301.55,"to":305.18,"location":2,"content":"where you have these limited receptive field convolutions that,"},{"from":305.18,"to":307.22,"location":2,"content":"again, consumed the sentence now not,"},{"from":307.22,"to":309.59,"location":2,"content":"not sequentially but in depth."},{"from":309.59,"to":312.13,"location":2,"content":"And they produce representations for every-"},{"from":312.13,"to":314.72,"location":2,"content":"they produce representations of your variable length sequences."},{"from":314.72,"to":317.72,"location":2,"content":"Um, and, uh, they're trivial to"},{"from":317.72,"to":321.11,"location":2,"content":"parallelize because you can apply these convolutions simultaneously at every position."},{"from":321.11,"to":323.03,"location":2,"content":"Each layer is trivial to parallelize."},{"from":323.03,"to":326.17,"location":2,"content":"Uh, the, the, the serial dependencies are only in the number of layers."},{"from":326.17,"to":328.24,"location":2,"content":"Um, you can get, uh,"},{"from":328.24,"to":329.96,"location":2,"content":"you can- you can get"},{"from":329.96,"to":332.75,"location":2,"content":"these local dependencies efficiently because a single application of"},{"from":332.75,"to":337.48,"location":2,"content":"a convolution can consume all the information inside its local receptive field."},{"from":337.48,"to":339.32,"location":2,"content":"Um, now if you want to have"},{"from":339.32,"to":342.17,"location":2,"content":"these really long distance interactions while you"},{"from":342.17,"to":345.02,"location":2,"content":"don't have to pass through a linear number of steps,"},{"from":345.02,"to":346.06,"location":2,"content":"you still because these,"},{"from":346.06,"to":349.67,"location":2,"content":"because these receptive fields are local you might need something like linear"},{"from":349.67,"to":353.52,"location":2,"content":"in depth, or logarithmic if you're doing something like dilated convolutions."},{"from":353.52,"to":356.03,"location":2,"content":"So there's still need- the number of layers that are needed are"},{"from":356.03,"to":359.21,"location":2,"content":"still a function of the length of, of your string."},{"from":359.21,"to":361.07,"location":2,"content":"Uh, but they're a great development and they"},{"from":361.07,"to":363.32,"location":2,"content":"actually pushed a lot of research like WaveNet, for example,"},{"from":363.32,"to":365.23,"location":2,"content":"is a classic sort of success story of"},{"from":365.23,"to":368.82,"location":2,"content":"convolutio- convolutional sequence models, or even ByteNet."},{"from":368.82,"to":375.08,"location":2,"content":"Um, now, so far attention has been like one of the most important 
components,"},{"from":375.08,"to":376.81,"location":2,"content":"the sort of content-based,"},{"from":376.81,"to":379.06,"location":2,"content":"you know, memory retrieval mechanism."},{"from":379.06,"to":383.56,"location":2,"content":"And it's content-based because you have your decoder that attends to all this content,"},{"from":383.56,"to":386.63,"location":2,"content":"that's your encoder and then just sort of decides what to wha- what,"},{"from":386.63,"to":388.58,"location":2,"content":"what information to absorb based on how similar"},{"from":388.58,"to":390.98,"location":2,"content":"this content is to every position in the memory."},{"from":390.98,"to":393.44,"location":2,"content":"So this has been a very critical mechanism in,"},{"from":393.44,"to":394.95,"location":2,"content":"uh, in neural machine translation."},{"from":394.95,"to":396.95,"location":2,"content":"So now the question that we asked was, like, why,"},{"from":396.95,"to":400.46,"location":2,"content":"why not just use attention for representations and, uh,"},{"from":400.46,"to":403.79,"location":2,"content":"now here's what sort of a rough framework of this,"},{"from":403.79,"to":406.44,"location":2,"content":"this representation mechanism would look like, uh,"},{"from":406.44,"to":409.63,"location":2,"content":"the way- just sort of repeating what attention is essentially."},{"from":409.63,"to":412.36,"location":2,"content":"Now imagine you have- you want to represent the word,"},{"from":412.36,"to":415.73,"location":2,"content":"re-represent the word representing, you want to construct its new representation."},{"from":415.73,"to":418.61,"location":2,"content":"And then first, uh, you, you attend or you,"},{"from":418.61,"to":420.71,"location":2,"content":"you compare yourself, you compare your content,"},{"from":420.71,"to":422.76,"location":2,"content":"and in the beginning it could just be a word embedding."},{"from":422.76,"to":425.54,"location":2,"content":"Your compare content with all your words, and with all,"},{"from":425.54,"to":427.34,"location":2,"content":"with all the embeddings and based on these,"},{"from":427.34,"to":429.9,"location":2,"content":"based on these compatibilities or these comparisons,"},{"from":429.9,"to":434.18,"location":2,"content":"you produce, uh, you produce a weighted combination of your entire neighborhood,"},{"from":434.18,"to":436.18,"location":2,"content":"and based on that weighted combination you,"},{"from":436.18,"to":437.87,"location":2,"content":"you summarize all that information."},{"from":437.87,"to":440.15,"location":2,"content":"So it's, like, you're re-expressing yourself in certain terms"},{"from":440.15,"to":442.73,"location":2,"content":"of a weighted combination of your entire neighborhood."},{"from":442.73,"to":443.93,"location":2,"content":"That's what attention does,"},{"from":443.93,"to":448.95,"location":2,"content":"and you can add feed-forward layers to basically sort of compute new features for you."},{"from":448.95,"to":454.7,"location":2,"content":"Um, now, um so the first part is going to be about how, like,"},{"from":454.7,"to":457.76,"location":2,"content":"some of the properties of self attention actually help us in text generation, like,"},{"from":457.76,"to":459.32,"location":2,"content":"what inductive biases are actually useful,"},{"from":459.32,"to":460.95,"location":2,"content":"and we empirically showed that indeed they,"},{"from":460.95,"to":463.06,"location":2,"content":"they move the needle in text 
generation."},{"from":463.06,"to":464.99,"location":2,"content":"And this is going to be about machine translation,"},{"from":464.99,"to":467.42,"location":2,"content":"but there were other work also that we'll talk about later."},{"from":467.42,"to":469.88,"location":2,"content":"So [NOISE] now with this, uh,"},{"from":469.88,"to":471.88,"location":2,"content":"with this sort of, uh,"},{"from":471.88,"to":475.47,"location":2,"content":"with this attention mechanism you get this- we get a constant path length."},{"from":475.47,"to":478,"location":2,"content":"So all pairs- a word can in- any"},{"from":478,"to":481.1,"location":2,"content":"position can interact with any position, every position simultaneously."},{"from":481.1,"to":484.25,"location":2,"content":"Um, hopefully if the number of positions is not too many."},{"from":484.25,"to":486.41,"location":2,"content":"Uh, attention just by virtue of, like,"},{"from":486.41,"to":488.06,"location":2,"content":"its construction, you have a softmax,"},{"from":488.06,"to":490.2,"location":2,"content":"you have these gating and multiplicative interactions."},{"from":490.2,"to":492.68,"location":2,"content":"And again, I'm not gonna be able to explain why,"},{"from":492.68,"to":494.19,"location":2,"content":"but it's, it's interesting, like,"},{"from":494.19,"to":495.29,"location":2,"content":"you've seen these models, like,"},{"from":495.29,"to":496.4,"location":2,"content":"even, even the, uh,"},{"from":496.4,"to":499.98,"location":2,"content":"even Pixel, PixelCNN, uh, or, um,"},{"from":499.98,"to":501.66,"location":2,"content":"when it was actually modeling images,"},{"from":501.66,"to":504.96,"location":2,"content":"they explicitly had to add these multiplicative interactions inside the model to,"},{"from":504.96,"to":506.88,"location":2,"content":"to basically beat RNNs,"},{"from":506.88,"to":509.39,"location":2,"content":"and attention just by construction gets this because you're,"},{"from":509.39,"to":513.03,"location":2,"content":"you're multiplying the attention probabilities with your, with your activations."},{"from":513.03,"to":514.58,"location":2,"content":"It's trivial to parallelize, why?"},{"from":514.58,"to":519.44,"location":2,"content":"Because you can just do attention with matmuls, especially the variant that we use in our paper,"},{"from":519.44,"to":520.87,"location":2,"content":"uh, in our work."},{"from":520.87,"to":523.89,"location":2,"content":"And, uh, so now the question is"},{"from":523.89,"to":529.16,"location":2,"content":"convolutional sequence to- convolutional sequence models have been very successful in,"},{"from":529.16,"to":532.33,"location":2,"content":"in, in, in ge- generative tasks for text."},{"from":532.33,"to":534.83,"location":2,"content":"Can we actually do the same or achieve the same with, uh,"},{"from":534.83,"to":538.58,"location":2,"content":"with, uh, attention as our primary workhorse for representation learning."},{"from":538.58,"to":543.49,"location":2,"content":"Um, so just to sort of add some context and there's been some,"},{"from":543.49,"to":547.43,"location":2,"content":"there's been some- up to- up to the transformer there have been a lot of"},{"from":547.43,"to":552.02,"location":2,"content":"great work on using self attention primarily for classification."},{"from":552.02,"to":555.29,"location":2,"content":"There was, there was work on self attention within the confines of,"},{"from":555.29,"to":556.61,"location":2,"content":"like, recurrent neural 
networks."},{"from":556.61,"to":559.37,"location":2,"content":"Um, perhaps the closest to us is the,"},{"from":559.37,"to":560.91,"location":2,"content":"is the memory networks,"},{"from":560.91,"to":562.82,"location":2,"content":"uh, by Weston, Sukhbaatar,"},{"from":562.82,"to":565.72,"location":2,"content":"where they actually had a version of recurrent attention,"},{"from":565.72,"to":567.29,"location":2,"content":"but they didn't have, uh,"},{"from":567.29,"to":570.71,"location":2,"content":"but they didn't actually- empirically,"},{"from":570.71,"to":573.5,"location":2,"content":"they didn't show it to work on sort of conditional modeling, like,"},{"from":573.5,"to":577.37,"location":2,"content":"uh, translation and their mechanism was, uh, like,"},{"from":577.37,"to":581.55,"location":2,"content":"they were using sort of a fixed- they were using a fixed query at every step."},{"from":581.55,"to":583.5,"location":2,"content":"So there's- it, it leaves something to be desired."},{"from":583.5,"to":587.06,"location":2,"content":"They still had this question, is it actually going to work, um, on,"},{"from":587.06,"to":590.87,"location":2,"content":"on, on large scale machine translation systems or large-scale text generation systems."},{"from":590.87,"to":594.1,"location":2,"content":"So this is sort of the, the culmination of, um,"},{"from":594.1,"to":597.43,"location":2,"content":"of the, the self attention, our self attention work."},{"from":597.43,"to":600.5,"location":2,"content":"This is the tran- the- and we put it together in the transformer model."},{"from":600.5,"to":603.2,"location":2,"content":"And, uh, so what does this look like?"},{"from":603.2,"to":605.98,"location":2,"content":"So we're going to use attention pri- we're going to use"},{"from":605.98,"to":609.39,"location":2,"content":"attention primarily for computing representations so- of your input."},{"from":609.39,"to":611.48,"location":2,"content":"Imagine you're doing English to German translation."},{"from":611.48,"to":614.03,"location":2,"content":"So you have your words, and notice that,"},{"from":614.03,"to":616.61,"location":2,"content":"uh, attention is, uh, permutation invariant."},{"from":616.61,"to":619.22,"location":2,"content":"So if you just change the order of your positions-"},{"from":619.22,"to":620.91,"location":2,"content":"you change the order of your words- and, and,"},{"from":620.91,"to":623.32,"location":2,"content":"uh, it's not going to affect the actual output."},{"from":623.32,"to":625.34,"location":2,"content":"So in ord- in order to maintain order we add,"},{"from":625.34,"to":626.99,"location":2,"content":"we add position representations."},{"from":626.99,"to":629.71,"location":2,"content":"And, uh, there's two kinds that we tried in the paper,"},{"from":629.71,"to":633.18,"location":2,"content":"these, these fantastic sinusoids that Noam invented."},{"from":633.18,"to":635.63,"location":2,"content":"And we also use learned representations which are"},{"from":635.63,"to":638.09,"location":2,"content":"very plain vanilla- both of them work equally well."},{"from":638.09,"to":640.42,"location":2,"content":"Um, and, uh, so,"},{"from":640.42,"to":642.89,"location":2,"content":"so first we have- so the encoder looks as follows, right?"},{"from":642.89,"to":646.97,"location":2,"content":"So we have a self attention layer that just recomputes the representation, uh,"},{"from":646.97,"to":650.09,"location":2,"content":"for every position simultaneously using 
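The sinusoidal position representations mentioned here follow the published formula from the "Attention Is All You Need" paper, PE[pos, 2i] = sin(pos / 10000^(2i/d)) and PE[pos, 2i+1] = cos(pos / 10000^(2i/d)). A small sketch, assuming an even d_model:

```python
import numpy as np

def sinusoidal_positions(n_positions, d_model):
    # PE[pos, 2i]   = sin(pos / 10000**(2i / d_model))
    # PE[pos, 2i+1] = cos(pos / 10000**(2i / d_model))
    pos = np.arange(n_positions)[:, None]
    i = np.arange(0, d_model, 2)[None, :]
    angles = pos / np.power(10000.0, i / d_model)
    pe = np.zeros((n_positions, d_model))
    pe[:, 0::2] = np.sin(angles)
    pe[:, 1::2] = np.cos(angles)
    return pe

# Added to the word embeddings once, at the input:
# x = word_embeddings + sinusoidal_positions(seq_len, d_model)
print(sinusoidal_positions(4, 8).shape)  # (4, 8)
```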
attention,"},{"from":650.09,"to":651.54,"location":2,"content":"then we have a feed-forward layer."},{"from":651.54,"to":652.82,"location":2,"content":"And we also have residual,"},{"from":652.82,"to":654.38,"location":2,"content":"residual connections and I'll,"},{"from":654.38,"to":656.6,"location":2,"content":"I'll sort of give you a glimpse of what these residual connections"},{"from":656.6,"to":659.09,"location":2,"content":"might be bringing. That is, between every,"},{"from":659.09,"to":662.99,"location":2,"content":"every layer and the input we have a skip connection that just adds the activations."},{"from":662.99,"to":665.33,"location":2,"content":"Uh, and then this tuple of, uh,"},{"from":665.33,"to":668.13,"location":2,"content":"self attention and feed-forward layer just essentially repeats."},{"from":668.13,"to":670.22,"location":2,"content":"Now, on the decoder side, uh,"},{"from":670.22,"to":673.92,"location":2,"content":"we've- we, we have a sort of standard encoder decoder architecture."},{"from":673.92,"to":677.5,"location":2,"content":"On the decoder side, we mimic a language model using self attention,"},{"from":677.5,"to":680.3,"location":2,"content":"and the way to mimic a language model using self attention is to impose"},{"from":680.3,"to":683.54,"location":2,"content":"causality by just masking out the positions that you can look at."},{"from":683.54,"to":685.66,"location":2,"content":"So basically, uh,"},{"from":685.66,"to":689.28,"location":2,"content":"the first position it's- it can't look forward, it's illegal to look forward."},{"from":689.28,"to":692.08,"location":2,"content":"It can look at itself because we actually shift the input."},{"from":692.08,"to":694.75,"location":2,"content":"Um, so it's not copying, uh."},{"from":694.75,"to":697.24,"location":2,"content":"It's kind of surprising that parti- with these models,"},{"from":697.24,"to":698.79,"location":2,"content":"it's very easy to copy at one point,"},{"from":698.79,"to":701.59,"location":2,"content":"when early on it was even harder to ge- you know,"},{"from":701.59,"to":703.36,"location":2,"content":"do copying with recurrent models."},{"from":703.36,"to":704.86,"location":2,"content":"But now, at least, you can copy really well,"},{"from":704.86,"to":706.85,"location":2,"content":"which is a positive sign, I think overall."},{"from":706.85,"to":709.83,"location":2,"content":"Um, but, uh, so now on the decoder side, uh,"},{"from":709.83,"to":711.15,"location":2,"content":"we have, uh, we have"},{"from":711.15,"to":714.38,"location":2,"content":"this causal self attention layer followed by encoder-decoder attention,"},{"from":714.38,"to":716.18,"location":2,"content":"where we actually attend to the, uh,"},{"from":716.18,"to":719.45,"location":2,"content":"last layer of the encoder and a feed-forward layer, and this triple"},{"from":719.45,"to":720.67,"location":2,"content":"repeats a mul- a few times,"},{"from":720.67,"to":722.95,"location":2,"content":"and at the end we have the standard cross-entropy loss."},{"from":722.95,"to":728.47,"location":2,"content":"Um, and, um, so, um, sort of,"},{"from":728.47,"to":730.65,"location":2,"content":"staring at the- at,"},{"from":730.65,"to":732.74,"location":2,"content":"at our parti- at the particular variant of the self-"},{"from":732.74,"to":735.21,"location":2,"content":"of the attention mechanis- mechanism that we use,"},{"from":735.21,"to":737.97,"location":2,"content":"we went for both- we went for simplicity and 
speed."},{"from":737.97,"to":741.63,"location":2,"content":"So, um, so how do you actually compute attention?"},{"from":741.63,"to":744.47,"location":2,"content":"So imagine you want to re-represent the position e2."},{"from":744.47,"to":746.93,"location":2,"content":"And, uh, we're going to first linearly,"},{"from":746.93,"to":750.22,"location":2,"content":"linearly transform it into, uh, a query,"},{"from":750.22,"to":752.15,"location":2,"content":"and then we're gonna linearly transform"},{"from":752.15,"to":754.52,"location":2,"content":"every position in your neighborhood"},{"from":754.52,"to":756.51,"location":2,"content":"or let's say every position at the input because this is the,"},{"from":756.51,"to":757.8,"location":2,"content":"uh, uh, the encoder side,"},{"from":757.8,"to":759.17,"location":2,"content":"to, uh, a key."},{"from":759.17,"to":761.68,"location":2,"content":"And these linear transformations can actually be thought of as features,"},{"from":761.68,"to":763.1,"location":2,"content":"and I'll talk more about it later on."},{"from":763.1,"to":765.5,"location":2,"content":"So it's like- it's, it's basically a bilinear form."},{"from":765.5,"to":768.36,"location":2,"content":"You're projecting these vectors into a space where dot product is"},{"from":768.36,"to":771.65,"location":2,"content":"a good- where just a dot product is a good proxy for similarity."},{"from":771.65,"to":773.2,"location":2,"content":"Okay? So now, you have your logit,"},{"from":773.2,"to":775.81,"location":2,"content":"so you just do a so- softmax to compute a convex combination."},{"from":775.81,"to":777.93,"location":2,"content":"And now based on this convex combination,"},{"from":777.93,"to":781.48,"location":2,"content":"you're going to then re-express e2, uh, in"},{"from":781.48,"to":785.38,"location":2,"content":"terms of this convex combination of all the vectors of all these positions."},{"from":785.38,"to":788.11,"location":2,"content":"And before doing- before doing the convex combination,"},{"from":788.11,"to":790.5,"location":2,"content":"we again do a linear transformation to produce values."},{"from":790.5,"to":793.94,"location":2,"content":"And then we do a second linear transformation just to"},{"from":793.94,"to":797.62,"location":2,"content":"mix this information and pass it through a- pass it through a feedforward layer."},{"from":797.62,"to":799.08,"location":2,"content":"And this is- um,"},{"from":799.08,"to":801.91,"location":2,"content":"and all of this can be expressed basically"},{"from":801.91,"to":804.9,"location":2,"content":"in two- in two- in two matrix multiplications,"},{"from":804.9,"to":807.62,"location":2,"content":"and the square root factor is just to make sure that these,"},{"from":807.62,"to":809.08,"location":2,"content":"these dot products don't blow up."},{"from":809.08,"to":810.42,"location":2,"content":"It's just a scaling factor."},{"from":810.42,"to":812.14,"location":2,"content":"And, uh, and, and,"},{"from":812.14,"to":813.61,"location":2,"content":"wha- why is this particular- why is"},{"from":813.61,"to":815.74,"location":2,"content":"this mechanism attractive? 
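That computation, queries, keys, values, a softmax over scaled dot products, is scaled dot-product attention, and it really is just two matmuls and a softmax. A minimal sketch; the projection matrices are random stand-ins for learned parameters:

```python
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
n, d_model, d_k = 5, 8, 8
X = rng.normal(size=(n, d_model))                   # one vector per position
Wq, Wk, Wv = (rng.normal(size=(d_model, d_k)) for _ in range(3))

Q, K, V = X @ Wq, X @ Wk, X @ Wv    # linear transforms: queries, keys, values
logits = Q @ K.T / np.sqrt(d_k)     # dot products, scaled so they don't blow up
A = softmax(logits)                 # convex combination weights per position
out = A @ V                         # every position re-expressed at once
print(out.shape)                    # (5, 8)
```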
Well, it's just really fast."},{"from":815.74,"to":817.34,"location":2,"content":"You can do this very quickly on a GPU,"},{"from":817.34,"to":819.01,"location":2,"content":"and simul- you can do it simultaneously for"},{"from":819.01,"to":823.04,"location":2,"content":"all positions with just two matmuls and a softmax."},{"from":823.04,"to":825.32,"location":2,"content":"Um, on the decoder side it's,"},{"from":825.32,"to":826.64,"location":2,"content":"it's exactly the same,"},{"from":826.64,"to":834.59,"location":2,"content":"except we impose causality by just adding, uh, minus 1e9 to the logits."},{"from":834.59,"to":838.13,"location":2,"content":"So it basi- it's just- you just get zero probabilities on those positions."},{"from":838.13,"to":840.82,"location":2,"content":"So we just impose causality by, by adding these,"},{"from":840.82,"to":844.41,"location":2,"content":"uh, highly negative values on the attention- on the attention logits."},{"from":844.41,"to":846.75,"location":2,"content":"Um, is, is everything-"},{"from":846.75,"to":847.41,"location":2,"content":"[LAUGHTER]"},{"from":847.41,"to":853.6,"location":2,"content":"I thought that was a question."},{"from":853.6,"to":858.46,"location":2,"content":"So, um, [LAUGHTER] okay so attention is really, uh, attention is cheap."},{"from":858.46,"to":860.89,"location":2,"content":"So because it's- because this variant of"},{"from":860.89,"to":863.9,"location":2,"content":"attention just involve two- involves two matrix multiplications,"},{"from":863.9,"to":866.51,"location":2,"content":"it's quadratic in the length of your sequence."},{"from":866.51,"to":870.82,"location":2,"content":"And now what's the computational profile of RNNs or convolutions?"},{"from":870.82,"to":872.37,"location":2,"content":"They're quadratic in the dimension."},{"from":872.37,"to":875.04,"location":2,"content":"Because, basically, you can just think of a convolution as just flattening"},{"from":875.04,"to":878.17,"location":2,"content":"your input or just applying a linear transformation on top of it, right?"},{"from":878.17,"to":880.84,"location":2,"content":"So- and when does this actually become very attractive?"},{"from":880.84,"to":884.14,"location":2,"content":"This becomes very, very attractive when your dimension is,"},{"from":884.14,"to":886.63,"location":2,"content":"uh, much larger than your length."},{"from":886.63,"to":888.46,"location":2,"content":"Which is the case for machine translation."},{"from":888.46,"to":891.34,"location":2,"content":"Now, we will talk about cases when there's- when the- when this is not true,"},{"from":891.34,"to":894.28,"location":2,"content":"and we have to- we have to do a- we have to make other model developments."},{"from":894.28,"to":896.44,"location":2,"content":"Um, but, uh, but for"},{"from":896.44,"to":898.02,"location":2,"content":"short sequences or sequences where"},{"from":898.02,"to":900.02,"location":2,"content":"your length does- where your dimension dominates length,"},{"from":900.02,"to":902.89,"location":2,"content":"attention has a very- has a very favorable computation profile."},{"from":902.89,"to":906.36,"location":2,"content":"And as you can see, it's about four times faster than an RNN."},{"from":906.36,"to":908.92,"location":2,"content":"Um, um, and, and faster than"},{"from":908.92,"to":913,"location":2,"content":"a convolutional model where you have a kernel- like a filter width of, uh, three."},{"from":913,"to":919.32,"location":2,"content":"So, so there's still one 
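The causal masking just described can be sketched the same way; the minus 1e9 constant is the one mentioned in the talk, everything else here is illustrative:

```python
import numpy as np

def causal_mask(n):
    # 0 where position j <= i (legal to attend), -1e9 where j > i
    # (looking forward, which is illegal on the decoder side).
    m = np.zeros((n, n))
    m[np.triu_indices(n, k=1)] = -1e9
    return m

n = 4
logits = np.zeros((n, n))                    # stand-in attention logits
masked = logits + causal_mask(n)
probs = np.exp(masked - masked.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)
print(np.round(probs, 2))                    # row i attends only to positions <= i
```

After the softmax, exp(-1e9) underflows to zero, which is exactly the "zero probabilities on those positions" effect described.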
problem."},{"from":919.32,"to":921.46,"location":2,"content":"Now, here's something- so in language,"},{"from":921.46,"to":922.6,"location":2,"content":"typically, we want to know, like,"},{"from":922.6,"to":923.9,"location":2,"content":"who did what to whom, right?"},{"from":923.9,"to":925.77,"location":2,"content":"So now, imagine you applied a convolutional filter."},{"from":925.77,"to":926.83,"location":2,"content":"Because you actually have"},{"from":926.83,"to":930.45,"location":2,"content":"different linear transformations based on re- relative distances,"},{"from":930.45,"to":931.77,"location":2,"content":"like this, this, this, this,"},{"from":931.77,"to":935.32,"location":2,"content":"linear transformation on the word who, uh, o- o- on the concept,"},{"from":935.32,"to":937.69,"location":2,"content":"we can have- can learn this concept of who and, and, and,"},{"from":937.69,"to":940.36,"location":2,"content":"pick out different information from this embedding of the word I."},{"from":940.36,"to":941.93,"location":2,"content":"And this linear transformation,"},{"from":941.93,"to":944.53,"location":2,"content":"the red- the red linear transformation can pick out different information"},{"from":944.53,"to":947.76,"location":2,"content":"from kicked and the blue linear transformation can pick out different,"},{"from":947.76,"to":949.45,"location":2,"content":"different information from ball."},{"from":949.45,"to":953.23,"location":2,"content":"Now, when you have a single attention layer, this is difficult."},{"from":953.23,"to":955.33,"location":2,"content":"Because all- because they're just a convex combination"},{"from":955.33,"to":957.32,"location":2,"content":"where you have the same linear transformation everywhere."},{"from":957.32,"to":960.22,"location":2,"content":"All that's available to you is just a- is just mixing proportions."},{"from":960.22,"to":963.67,"location":2,"content":"So you can't pick out different pieces of information from different places."},{"from":963.67,"to":970.27,"location":2,"content":"Well, what if we had one attention layer for who?"},{"from":970.27,"to":973.6,"location":2,"content":"So you can think of an attention layer as something like a feature detector almost,"},{"from":973.6,"to":975.04,"location":2,"content":"like, because a particular- it,"},{"from":975.04,"to":978.34,"location":2,"content":"it might try to- it might- because it carries with it a linear transformation,"},{"from":978.34,"to":981.7,"location":2,"content":"so it's projecting them in a space that- which starts caring maybe about syntax,"},{"from":981.7,"to":984.49,"location":2,"content":"or it's projecting in this space which starts caring about who or what."},{"from":984.49,"to":988.94,"location":2,"content":"Uh, then we can have another attention layer or, an attention head, for what,"},{"from":988.94,"to":991.49,"location":2,"content":"did what, and other- another attention head for,"},{"from":991.49,"to":994.12,"location":2,"content":"for, for whom- to whom."},{"from":994.12,"to":997.3,"location":2,"content":"And all of this can actually be done in parallel,"},{"from":997.3,"to":999.21,"location":2,"content":"and that's actually- and that's exactly what we do."},{"from":999.21,"to":1001.25,"location":2,"content":"And for efficiency, instead of actually"},{"from":1001.25,"to":1004.23,"location":2,"content":"having these dimensions operating in a large space,"},{"from":1004.23,"to":1006.99,"location":2,"content":"we just- we just reduce the dimensionality of all these 
heads"},{"from":1006.99,"to":1010.23,"location":2,"content":"and we operate these attention layers in parallel, sort of bridging the gap."},{"from":1010.23,"to":1011.67,"location":2,"content":"Now, here's a, uh,"},{"from":1011.67,"to":1013.66,"location":2,"content":"perhaps, well, here's a little quiz."},{"from":1013.66,"to":1016.4,"location":2,"content":"I mean, can you actually- is there"},{"from":1016.4,"to":1021.11,"location":2,"content":"a combination of heads or is there a configuration in which you can,"},{"from":1021.11,"to":1024.26,"location":2,"content":"actually, exactly simulate a convolution probably with more parameters?"},{"from":1024.26,"to":1026.39,"location":2,"content":"I think there should be a simple way to show that if you"},{"from":1026.39,"to":1029.65,"location":2,"content":"had mo- more heads or heads are a function of positions,"},{"from":1029.65,"to":1031.88,"location":2,"content":"you could probably just simulate a convolution,"},{"from":1031.88,"to":1033.38,"location":2,"content":"but- although with a lot of parameters."},{"from":1033.38,"to":1035.15,"location":2,"content":"Uh, so it can- in, in,"},{"from":1035.15,"to":1037.14,"location":2,"content":"in the limit, it can actually simulate a convolution."},{"from":1037.14,"to":1041.28,"location":2,"content":"Uh, and it also- we can al- we can continue to enjoy the benefits of parallelism,"},{"from":1041.28,"to":1043.05,"location":2,"content":"but we did increase the number of softmaxes"},{"from":1043.05,"to":1044.82,"location":2,"content":"because each head then carries with it a softmax."},{"from":1044.82,"to":1047.19,"location":2,"content":"But the amount of FLOPS didn't change because we-"},{"from":1047.19,"to":1050.01,"location":2,"content":"instead of actually having these heads operating in very large dimensions,"},{"from":1050.01,"to":1052.22,"location":2,"content":"they're operating in very small dimensions."},{"from":1052.22,"to":1055.11,"location":2,"content":"Um, so, uh, when we applied this on, on,"},{"from":1055.11,"to":1057.54,"location":2,"content":"on machine translation, um,"},{"from":1057.54,"to":1060.3,"location":2,"content":"we were able to drama- uh, dramatically outperform,"},{"from":1060.3,"to":1063.64,"location":2,"content":"uh, previous results on English-German and English-French translation."},{"from":1063.64,"to":1067.08,"location":2,"content":"So we had a pretty standard setup: 32,000-word vocabularies,"},{"from":1067.08,"to":1070.32,"location":2,"content":"WordPiece encodings, WMT14-, uh,"},{"from":1070.32,"to":1072.54,"location":2,"content":"WMT 2014, uh, was our test set,"},{"from":1072.54,"to":1073.97,"location":2,"content":"2013 did the dev set."},{"from":1073.97,"to":1079.12,"location":2,"content":"And, uh, and some of these results were much stronger than even our previous ensemble models."},{"from":1079.12,"to":1082.69,"location":2,"content":"And, um, and on English-French also,"},{"from":1082.69,"to":1085.4,"location":2,"content":"we had some- we had some very favorabl- favorable results."},{"from":1085.4,"to":1086.63,"location":2,"content":"Uh, and we- and we are,"},{"from":1086.63,"to":1088.42,"location":2,"content":"we, we, we achieved state of the art."},{"from":1088.42,"to":1091.26,"location":2,"content":"Now, ste- stepping back a bit, uh,"},{"from":1091.26,"to":1093.9,"location":2,"content":"I- I'm not claiming that we,"},{"from":1093.9,"to":1097.11,"location":2,"content":"we arrived at an architecture that has better expressivity than an 
LSTM."},{"from":1097.11,"to":1098.6,"location":2,"content":"I mean, there's, there's, there's,"},{"from":1098.6,"to":1102.88,"location":2,"content":"there's theorems that are- that say that LSTMs can model any function."},{"from":1102.88,"to":1107.87,"location":2,"content":"Um, perhaps, all we did was just build an architecture that was good for SGD."},{"from":1107.87,"to":1110.89,"location":2,"content":"Because stochastic gradient descent could just train this architecture really well,"},{"from":1110.89,"to":1113.4,"location":2,"content":"because the gradient dynamics in attention are very simple- attention is,"},{"from":1113.4,"to":1114.62,"location":2,"content":"just a linear combination."},{"from":1114.62,"to":1117.82,"location":2,"content":"And, uh, um, I think that's- I,"},{"from":1117.82,"to":1119.56,"location":2,"content":"I think that's actually favorable."},{"from":1119.56,"to":1122.23,"location":2,"content":"But hopefully, uh, as we- as we go on,"},{"from":1122.23,"to":1123.48,"location":2,"content":"but the- well, I'd,"},{"from":1123.48,"to":1124.86,"location":2,"content":"I'd also like to point out that, you know,"},{"from":1124.86,"to":1127.77,"location":2,"content":"we do explicit mo- we do explicitly model all,"},{"from":1127.77,"to":1129.44,"location":2,"content":"all pairwise connection, all, all,"},{"from":1129.44,"to":1134.07,"location":2,"content":"all pairwise connections and it has its adva- advantage of very clearly modeling,"},{"from":1134.07,"to":1136.58,"location":2,"content":"very clear relationships directly between, between any two words."},{"from":1136.58,"to":1140.64,"location":2,"content":"Um, and, like, hopefully we'll be able to also"},{"from":1140.64,"to":1142.26,"location":2,"content":"show that there are other inductive biases."},{"from":1142.26,"to":1145.61,"location":2,"content":"That it's not just like building more architectures that,"},{"from":1145.61,"to":1148.72,"location":2,"content":"that are good for- that are good inductive biases for SGD."},{"from":1148.72,"to":1153,"location":2,"content":"So frameworks, a lot of our work was initially pushed out in tensor2tensor."},{"from":1153,"to":1155.98,"location":2,"content":"Maybe that might change in the future with the arrival of JAX."},{"from":1155.98,"to":1158.79,"location":2,"content":"There's ano- there's a framework also from Amazon called Sockeye."},{"from":1158.79,"to":1160.81,"location":2,"content":"There's also Fairseq, uh, the se- the"},{"from":1160.81,"to":1163.64,"location":2,"content":"convolutional sequence-to-sequence toolkit from Facebook that the,"},{"from":1163.64,"to":1166.29,"location":2,"content":"they prob- I'm actually not sure if it has a transformer implementation,"},{"from":1166.29,"to":1169.48,"location":2,"content":"but they have some really good sequence-to-sequence models as well."},{"from":1169.48,"to":1171.7,"location":2,"content":"Um, okay."},{"from":1171.7,"to":1172.85,"location":2,"content":"So the importance of residuals."},{"from":1172.85,"to":1177.88,"location":2,"content":"So, uh, we have these resil- residual connections, uh, between, um,"},{"from":1177.88,"to":1181.8,"location":2,"content":"so we have these residual connections that go from here to- here to here,"},{"from":1181.8,"to":1185.24,"location":2,"content":"here to here, like between every pair of layers, and it's interesting."},{"from":1185.24,"to":1187.89,"location":2,"content":"So we, um, we- so what we do is we just"},{"from":1187.89,"to":1191.03,"location":2,"content":"add the position information 
at the input to the model."},{"from":1191.03,"to":1193.53,"location":2,"content":"And, uh, we don't infuse- we don't infuse"},{"from":1193.53,"to":1196.15,"location":2,"content":"or we don't inject position information at every layer."},{"from":1196.15,"to":1202.18,"location":2,"content":"So when, uh, we severed these residual connections and we loo- stared at these,"},{"from":1202.18,"to":1204.81,"location":2,"content":"uh, stared at these attention distributions- this is the center, or,"},{"from":1204.81,"to":1207.55,"location":2,"content":"sort of, the middle map is this attention distribution."},{"from":1207.55,"to":1210.75,"location":2,"content":"You actually- basically, it- it's been unable to pick up this diagonal."},{"from":1210.75,"to":1213.37,"location":2,"content":"It should have a very strong diagonal focus."},{"from":1213.37,"to":1215.33,"location":2,"content":"And so what has happened was these residuals"},{"from":1215.33,"to":1218.15,"location":2,"content":"were carrying this position information to every layer."},{"from":1218.15,"to":1220.82,"location":2,"content":"And because these subsequent layers had no notion of position,"},{"from":1220.82,"to":1222.9,"location":2,"content":"they were fi- finding it hard to actually attend."},{"from":1222.9,"to":1225.88,"location":2,"content":"This is the encoder-decoder attention which typically ends up being diagonal."},{"from":1225.88,"to":1227.38,"location":2,"content":"Now, so then we, uh, we said okay."},{"from":1227.38,"to":1230.7,"location":2,"content":"So then we actually continued with- continued to sever the residuals,"},{"from":1230.7,"to":1232.95,"location":2,"content":"but we added position information back in at every layer."},{"from":1232.95,"to":1234.84,"location":2,"content":"We injected position information back in."},{"from":1234.84,"to":1236.39,"location":2,"content":"And we didn't recover the accuracy,"},{"from":1236.39,"to":1237.79,"location":2,"content":"but we did get some of this,"},{"from":1237.79,"to":1239.3,"location":2,"content":"sort of, diagonal focus back in."},{"from":1239.3,"to":1241.4,"location":2,"content":"So the residuals are doing more, but they're certainly,"},{"from":1241.4,"to":1244.16,"location":2,"content":"definitely moving this position information through the model there."},{"from":1244.16,"to":1246.81,"location":2,"content":"They're pumping this position information through the model."},{"from":1246.81,"to":1249.07,"location":2,"content":"Um, okay."},{"from":1249.07,"to":1251.37,"location":2,"content":"So, so that was- that was- so, so now we saw that,"},{"from":1251.37,"to":1252.44,"location":2,"content":"you know, being able to, sort of,"},{"from":1252.44,"to":1253.89,"location":2,"content":"model both long- and short-,"},{"from":1253.89,"to":1256.51,"location":2,"content":"short-term relationships, uh, sh- uh, long and,"},{"from":1256.51,"to":1258.43,"location":2,"content":"long- and short-distance relationships with,"},{"from":1258.43,"to":1261.74,"location":2,"content":"with attention is beneficial for, for text generation."},{"from":1261.74,"to":1263.53,"location":2,"content":"Um, what kind of inductive,"},{"from":1263.53,"to":1266.78,"location":2,"content":"inductive biases actually, uh, appear- or what,"},{"from":1266.78,"to":1270.86,"location":2,"content":"what kind of phenomena appear in images? And something that we constantly see- constantly"},{"from":1270.86,"to":1272.74,"location":2,"content":"see in images and music is this notion 
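The wiring under discussion, position information added once at the input and then carried upward by skip connections that just add activations, can be sketched schematically; the sublayers here are toy stand-ins, not a real attention or feed-forward implementation:

```python
import numpy as np

def encoder(h, layers):
    for self_attention, feed_forward in layers:
        h = h + self_attention(h)   # skip connection: just adds the activations
        h = h + feed_forward(h)     # second residual, around the FFN
    return h

rng = np.random.default_rng(0)
x = rng.normal(size=(5, 8))            # word embeddings (toy)
positions = rng.normal(size=(5, 8))    # position representations (toy)
layers = [(lambda h: 0.1 * h, lambda h: 0.1 * h)] * 3   # stand-in sublayers
out = encoder(x + positions, layers)   # positions injected once, at the input
print(out.shape)
```

Because every layer's output is input-plus-something, the position signal added at the bottom survives all the way up, which is why severing the residuals hurt the diagonal attention pattern in the experiment described.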
of"},{"from":1272.74,"to":1275.18,"location":2,"content":"repeating structure that's very similar to each other?"},{"from":1275.18,"to":1278.06,"location":2,"content":"You have these motifs that repeat in, in different scales."},{"from":1278.06,"to":1281.42,"location":2,"content":"So, for example, there's a b- it's another artificial but beautiful example of"},{"from":1281.42,"to":1284.9,"location":2,"content":"self-similarity where you have this Van Gogh painting where this texture or these,"},{"from":1284.9,"to":1286.41,"location":2,"content":"these little objects just repeat."},{"from":1286.41,"to":1290.28,"location":2,"content":"These images are- these different pieces of the image are very sa- similar to each other,"},{"from":1290.28,"to":1291.6,"location":2,"content":"but they might have different scales."},{"from":1291.6,"to":1292.95,"location":2,"content":"Uh, again in music,"},{"from":1292.95,"to":1294.62,"location":2,"content":"here's a motif that repeats, uh,"},{"from":1294.62,"to":1296.61,"location":2,"content":"that could have- it could have, like,"},{"from":1296.61,"to":1300.12,"location":2,"content":"di- various, like, spans of time between in, in, between it."},{"from":1300.12,"to":1303.25,"location":2,"content":"So, um, so, so this,"},{"from":1303.25,"to":1304.44,"location":2,"content":"so we, we, we,"},{"from":1304.44,"to":1305.78,"location":2,"content":"we attempted after this to see, well,"},{"from":1305.78,"to":1309.72,"location":2,"content":"to ask this question: can self-attention help us in modeling other objects like images?"},{"from":1309.72,"to":1311.71,"location":2,"content":"So the, the path we took was, sort of,"},{"from":1311.71,"to":1317.45,"location":2,"content":"standard auto-regressive image modeling the- or probabilistic image modeling, not GANs."},{"from":1317.45,"to":1318.91,"location":2,"content":"Because it was- well, one, it was very easy."},{"from":1318.91,"to":1320.09,"location":2,"content":"We had a language model almost."},{"from":1320.09,"to":1322.24,"location":2,"content":"So this is just like language modeling on images."},{"from":1322.24,"to":1323.91,"location":2,"content":"Uh, and also training at maximum,"},{"from":1323.91,"to":1324.93,"location":2,"content":"likely, it allows you to, sort of,"},{"from":1324.93,"to":1326.72,"location":2,"content":"measure, measure how well you're doing on,"},{"from":1326.72,"to":1328.78,"location":2,"content":"uh, on, on your held-out set."},{"from":1328.78,"to":1330.84,"location":2,"content":"Uh, and it also gives you diversity,"},{"from":1330.84,"to":1332.79,"location":2,"content":"so you hopefully are covering all possible, uh,"},{"from":1332.79,"to":1335.9,"location":2,"content":"different kinds of images you- So, um,"},{"from":1335.9,"to":1337.29,"location":2,"content":"and to this point there's al- we had"},{"from":1337.29,"to":1338.88,"location":2,"content":"an advantage that's also been- there are- there've been"},{"from":1338.88,"to":1342.42,"location":2,"content":"good work on using recurrent models like PixelRNN and PixelCNN,"},{"from":1342.42,"to":1346.33,"location":2,"content":"that, that we're actually getting some very good compression rates. 
Um-"},{"from":1346.33,"to":1351.81,"location":2,"content":"And, um, again here,"},{"from":1351.81,"to":1355.61,"location":2,"content":"originally the argument was that, well, you know,"},{"from":1355.61,"to":1357.92,"location":2,"content":"in images because there- because you want symmetry,"},{"from":1357.92,"to":1359.3,"location":2,"content":"because you want like if you have a face,"},{"from":1359.3,"to":1361.58,"location":2,"content":"you want, you want one ear to sort of match with the other."},{"from":1361.58,"to":1363.58,"location":2,"content":"If you had a large receptive field,"},{"from":1363.58,"to":1367.13,"location":2,"content":"which you could potentially get with attention at a lower computational cost,"},{"from":1367.13,"to":1370.82,"location":2,"content":"then it should benefit- then it should be quite beneficial for, for images,"},{"from":1370.82,"to":1373.64,"location":2,"content":"for images and you wouldn't need many layers like you do in"},{"from":1373.64,"to":1377.95,"location":2,"content":"convolutions to actually get dependencies between these far away pixels."},{"from":1377.95,"to":1380.67,"location":2,"content":"So it seem like self-attention would have been a- what, what,"},{"from":1380.67,"to":1383.74,"location":2,"content":"what was already a good computational mechanism, right?"},{"from":1383.74,"to":1386.58,"location":2,"content":"But this sort of- but it was actually interesting to see"},{"from":1386.58,"to":1389.7,"location":2,"content":"how it even modeled- naturally modeled self-similarity,"},{"from":1389.7,"to":1392.46,"location":2,"content":"and people have used self-similarity in image generation like, you know, uh,"},{"from":1392.46,"to":1395.76,"location":2,"content":"there's this really cool work by Efros where they actually see, okay,"},{"from":1395.76,"to":1398.67,"location":2,"content":"in the training set, what are those patches that are really,"},{"from":1398.67,"to":1399.81,"location":2,"content":"that are really similar to me?"},{"from":1399.81,"to":1401.64,"location":2,"content":"And based on the patches that are really similar to me,"},{"from":1401.64,"to":1403.04,"location":2,"content":"I'm going to fill up the information."},{"from":1403.04,"to":1405.49,"location":2,"content":"So it's like actually doing image generation."},{"from":1405.49,"to":1407.43,"location":2,"content":"Uh, there is this really classic work called"},{"from":1407.43,"to":1409.98,"location":2,"content":"non-local means where they do image denoising,"},{"from":1409.98,"to":1411.91,"location":2,"content":"where they want to denoise this sort of,"},{"from":1411.91,"to":1414.45,"location":2,"content":"this patch P. 
And they say,"},{"from":1414.45,"to":1418.29,"location":2,"content":"I'm going to- based on my similarity between all other patches in my image,"},{"from":1418.29,"to":1421.13,"location":2,"content":"I'm going to compute some function of content-based similarity,"},{"from":1421.13,"to":1423.53,"location":2,"content":"and based on the similarity I'm going to pull information."},{"from":1423.53,"to":1426.64,"location":2,"content":"So as- and exploiting this fact that images are very self-similar."},{"from":1426.64,"to":1430.44,"location":2,"content":"And, uh, uh, this has also been sort of,"},{"from":1430.44,"to":1432.39,"location":2,"content":"uh, applied in some recent work."},{"from":1432.39,"to":1435.03,"location":2,"content":"Now if you just took this encoder self-attention mechanism"},{"from":1435.03,"to":1437.17,"location":2,"content":"and just replaced these word embeddings with patches,"},{"from":1437.17,"to":1438.77,"location":2,"content":"that's kind of exactly what it's doing."},{"from":1438.77,"to":1441.33,"location":2,"content":"It's, it's computing this notion of content-based similarity"},{"from":1441.33,"to":1444.12,"location":2,"content":"between these elements and then based on this content-based similarity,"},{"from":1444.12,"to":1447.51,"location":2,"content":"it constructs a convex combination that essentially brings these things together."},{"from":1447.51,"to":1449.38,"location":2,"content":"So it's, it's a very ni- it was,"},{"from":1449.38,"to":1451.55,"location":2,"content":"it was quite- it was very pleasant to see that,"},{"from":1451.55,"to":1453.97,"location":2,"content":"oh, this is a differentiable way of doing non-local means."},{"from":1453.97,"to":1462.03,"location":2,"content":"And, uh, and we took the transformer architecture and replaced words with pixels."},{"from":1462.03,"to":1466.01,"location":2,"content":"Uh, there was some- there were some architecture adjustments to do."},{"from":1466.01,"to":1468.3,"location":2,"content":"And, uh, so this was but- this was"},{"from":1468.3,"to":1471.09,"location":2,"content":"basically the kind of- it was very similar to the original work,"},{"from":1471.09,"to":1474.36,"location":2,"content":"and here the position representations instead of being, you know,"},{"from":1474.36,"to":1476.76,"location":2,"content":"one-dimensional, they were- because we are not dealing with sequences,"},{"from":1476.76,"to":1478.35,"location":2,"content":"we have two-dimensional position representations."},{"from":1478.35,"to":1480.37,"location":2,"content":"Um, okay."},{"from":1480.37,"to":1482.07,"location":2,"content":"So I pointed out before,"},{"from":1482.07,"to":1485.78,"location":2,"content":"attention has a very com- very favorable computational profile"},{"from":1485.78,"to":1489.27,"location":2,"content":"if your length- if your dimension dominates length,"},{"from":1489.27,"to":1491.25,"location":2,"content":"which if- which is absolutely untrue for,"},{"from":1491.25,"to":1492.54,"location":2,"content":"absolutely untrue for images."},{"from":1492.54,"to":1496.17,"location":2,"content":"Uh, because even for like 32 by- even for 32 by 32 images,"},{"from":1496.17,"to":1499.26,"location":2,"content":"when you flatten them and you- and you flatten them, you have 30- you get 3,"},{"from":1499.26,"to":1502.96,"location":2,"content":"072 positions, uh, so it's your standard CIFAR image."},{"from":1502.96,"to":1506.4,"location":2,"content":"Um, so simple solution, 
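One plausible way to build the two-dimensional position representations mentioned here, assuming a common scheme where half the channels encode the row and half the column, each with the sinusoidal formula from earlier; the exact scheme in the paper may differ:

```python
import numpy as np

def sinusoids(n, d):
    pos = np.arange(n)[:, None]
    i = np.arange(0, d, 2)[None, :]
    ang = pos / np.power(10000.0, i / d)
    pe = np.zeros((n, d))
    pe[:, 0::2], pe[:, 1::2] = np.sin(ang), np.cos(ang)
    return pe

def positions_2d(height, width, d_model):
    # Half the channels describe the row index, half the column index.
    row = sinusoids(height, d_model // 2)[:, None, :].repeat(width, axis=1)
    col = sinusoids(width, d_model // 2)[None, :, :].repeat(height, axis=0)
    return np.concatenate([row, col], axis=-1)   # (height, width, d_model)

print(positions_2d(32, 32, 64).shape)  # (32, 32, 64)
```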
uh,"},{"from":1506.4,"to":1509.22,"location":2,"content":"because like convolutions of- I mean,"},{"from":1509.22,"to":1511.14,"location":2,"content":"you get- convolutions are basically looked"},{"from":1511.14,"to":1513.35,"location":2,"content":"at local windows and you get translational equivariance."},{"from":1513.35,"to":1516.66,"location":2,"content":"We said, \"Okay. Let's adopt the same strategy.\""},{"from":1516.66,"to":1519.22,"location":2,"content":"And also there's a lot of spatial locality and images."},{"from":1519.22,"to":1524.27,"location":2,"content":"Uh, but now, we will still have a better computational profile."},{"from":1524.27,"to":1527.34,"location":2,"content":"If your- if your receptive field is still smaller than your dimension,"},{"from":1527.34,"to":1529.48,"location":2,"content":"you can afford- you can actually still do"},{"from":1529.48,"to":1534.6,"location":2,"content":"much more long distance computation than a standard convolution because you're,"},{"from":1534.6,"to":1537.62,"location":2,"content":"uh, because you're quadratic in length."},{"from":1537.62,"to":1540.39,"location":2,"content":"So as long as we didn't increase our length beyond the dimension,"},{"from":1540.39,"to":1542.41,"location":2,"content":"we still had a favorable computational profile."},{"from":1542.41,"to":1544.44,"location":2,"content":"And so the way we did it was, uh,"},{"from":1544.44,"to":1546.19,"location":2,"content":"we essentially had, uh,"},{"from":1546.19,"to":1547.95,"location":2,"content":"two kinds of rasterizations."},{"from":1547.95,"to":1552.21,"location":2,"content":"So we had a one-dimensional rasterization where you had a sort of single query block,"},{"from":1552.21,"to":1554.79,"location":2,"content":"uh, which was, uh,"},{"from":1554.79,"to":1558.66,"location":2,"content":"which was then attending or to the- into a larger memory block,"},{"from":1558.66,"to":1562.68,"location":2,"content":"uh, in this rasterized fashion along the- along, along the rows."},{"from":1562.68,"to":1565.32,"location":2,"content":"Um, then we tried another form of rasterization,"},{"from":1565.32,"to":1567.78,"location":2,"content":"falling standard two-dimensional locality,"},{"from":1567.78,"to":1570.36,"location":2,"content":"where you had- where we actually produced the image in,"},{"from":1570.36,"to":1573.3,"location":2,"content":"uh, in blocks and within each block we had a rasterization scheme."},{"from":1573.3,"to":1578.64,"location":2,"content":"Um, again, these- the image transformer layer was very similar."},{"from":1578.64,"to":1581.28,"location":2,"content":"We had two-dimensional position representations along"},{"from":1581.28,"to":1584.67,"location":2,"content":"with query- with the same- with a very similar attention mechanism."},{"from":1584.67,"to":1587.22,"location":2,"content":"Um, and we tried"},{"from":1587.22,"to":1590.55,"location":2,"content":"both super-resolution and unconditional and conditional image generation."},{"from":1590.55,"to":1594.07,"location":2,"content":"Uh, this is- this is Ne- Niki Parmar,"},{"from":1594.07,"to":1597.26,"location":2,"content":"I and a co- and a few other authors from Brain,"},{"from":1597.26,"to":1599.58,"location":2,"content":"um, and we presented it at ICML."},{"from":1599.58,"to":1604.82,"location":2,"content":"And, uh, we were able to achieve better perplexity than existing models."},{"from":1604.82,"to":1607.65,"location":2,"content":"So PixelSNAIL is actually another model that used- 
mixed"},{"from":1607.65,"to":1610.69,"location":2,"content":"both convolutions and self-attention and they- they outperformed us on,"},{"from":1610.69,"to":1612.67,"location":2,"content":"on, on, on, on, bits per dimension."},{"from":1612.67,"to":1614.31,"location":2,"content":"So we were measuring perplexity because these are"},{"from":1614.31,"to":1616.56,"location":2,"content":"probabilistic- these are probabilistic models."},{"from":1616.56,"to":1618.67,"location":2,"content":"It's like basically a language model of images and,"},{"from":1618.67,"to":1621.12,"location":2,"content":"and it just- and your- and the factorization"},{"from":1621.12,"to":1623.58,"location":2,"content":"of your language model just depends on how you rasterize."},{"from":1623.58,"to":1625.62,"location":2,"content":"In the- in this- in the one-D rasterization,"},{"from":1625.62,"to":1627,"location":2,"content":"we went first rows and then columns."},{"from":1627,"to":1628.18,"location":2,"content":"In the two-D rasterization,"},{"from":1628.18,"to":1631.15,"location":2,"content":"we went blockwise and inside each block we rasterized."},{"from":1631.15,"to":1634.65,"location":2,"content":"On ImageNet, we achieved better perplexities, and,"},{"from":1634.65,"to":1638.78,"location":2,"content":"uh, so yeah, I mean we're at a GAN level, right?"},{"from":1638.78,"to":1643.69,"location":2,"content":"I mean this weird- this is- I think probabilist auto-regressive Image generation,"},{"from":1643.69,"to":1646.72,"location":2,"content":"uh, by this point had not reached GANs."},{"from":1646.72,"to":1651.09,"location":2,"content":"At ICLR 2019, there's a paper by Nal that actually uses self-attention and gets very,"},{"from":1651.09,"to":1652.58,"location":2,"content":"very good quality images."},{"from":1652.58,"to":1654.7,"location":2,"content":"But what we, what we observed was,"},{"from":1654.7,"to":1656.67,"location":2,"content":"we were getting structured objects fairly well."},{"from":1656.67,"to":1659.93,"location":2,"content":"Like can people recognize what the second row is?"},{"from":1659.93,"to":1663.77,"location":2,"content":"Cars. [OVERLAPPING]"},{"from":1663.77,"to":1666.05,"location":2,"content":"I heard- I said- most- almost everyone said cars."},{"from":1666.05,"to":1668.72,"location":2,"content":"I'm not going to ask who said something else, but yes, they're cars."},{"from":1668.72,"to":1673.35,"location":2,"content":"yeah. 
And, uh, so the- and the last row is other vehicles, like,"},{"from":1673.35,"to":1678.33,"location":2,"content":"uh, so essentially, structured jo- structured objects were easy to capture."},{"from":1678.33,"to":1681.16,"location":2,"content":"Um, like frogs and sort of,"},{"from":1681.16,"to":1684.15,"location":2,"content":"you know, objects that were camouflaged just turned into this mush."},{"from":1684.15,"to":1687.09,"location":2,"content":"Um, and- but on super resolution,"},{"from":1687.09,"to":1688.65,"location":2,"content":"now super-resolution is interesting because"},{"from":1688.65,"to":1690.38,"location":2,"content":"there's a lot of conditioning information, right?"},{"from":1690.38,"to":1693.53,"location":2,"content":"And, uh, when you have a lot of conditioning information, the,"},{"from":1693.53,"to":1695.52,"location":2,"content":"the sort of possible- you break- you,"},{"from":1695.52,"to":1697.9,"location":2,"content":"you actually lock quite a few of the modes."},{"from":1697.9,"to":1700.02,"location":2,"content":"So there's only a few options you can have at the output."},{"from":1700.02,"to":1702.39,"location":2,"content":"And super- our super resolution results are much better."},{"from":1702.39,"to":1706.77,"location":2,"content":"We were able to get better facial orientation and structure than previous work."},{"from":1706.77,"to":1711.39,"location":2,"content":"And these are samples at different temperatures and, uh, and, uh,"},{"from":1711.39,"to":1714.69,"location":2,"content":"and we wou- when we quantify this with actual human evaluators,"},{"from":1714.69,"to":1716.16,"location":2,"content":"we- like we flashed an image and said,"},{"from":1716.16,"to":1717.35,"location":2,"content":"is this real, is this false?"},{"from":1717.35,"to":1718.63,"location":2,"content":"And we were able to, uh,"},{"from":1718.63,"to":1720.75,"location":2,"content":"we were able to fool humans like four"},{"from":1720.75,"to":1723.29,"location":2,"content":"times better than previous results in super resolution."},{"from":1723.29,"to":1726.99,"location":2,"content":"Again, these are not- these results like I, I guess the,"},{"from":1726.99,"to":1730.47,"location":2,"content":"the latest GAN result from Nvidia makes us look like a joke."},{"from":1730.47,"to":1731.71,"location":2,"content":"But, I mean this is,"},{"from":1731.71,"to":1733.05,"location":2,"content":"I mean, we're starting later than GANs."},{"from":1733.05,"to":1734.13,"location":2,"content":"So hopefully we'll catch up."},{"from":1734.13,"to":1737.25,"location":2,"content":"But, but the point here is that this is an interesting inductive bias for images,"},{"from":1737.25,"to":1739.5,"location":2,"content":"a very natural inductive bias for images."},{"from":1739.5,"to":1741.38,"location":2,"content":"Um, and, uh, and,"},{"from":1741.38,"to":1745.65,"location":2,"content":"and there is hope to apply it- for applying it in classification and other such tasks also."},{"from":1745.65,"to":1747.45,"location":2,"content":"Um, so one interesting thing,"},{"from":1747.45,"to":1749.64,"location":2,"content":"just to sort of both out of curiosity and"},{"from":1749.64,"to":1752.74,"location":2,"content":"asking how good is maximum- or, like, what does maximum likelihood do."},{"from":1752.74,"to":1756.18,"location":2,"content":"Well, one, does the model actually capture some interesting structure in the world?"},{"from":1756.18,"to":1757.65,"location":2,"content":"Second, do you get 
diversity?"},{"from":1757.65,"to":1759.54,"location":2,"content":"Well, maximum likelihood should get diversity,"},{"from":1759.54,"to":1761.95,"location":2,"content":"by, by virtue, by virtue of what it does."},{"from":1761.95,"to":1763.86,"location":2,"content":"Uh, so then we just- we did image completion."},{"from":1763.86,"to":1765.87,"location":2,"content":"And why is- why image completion because as soon as you"},{"from":1765.87,"to":1768,"location":2,"content":"lock down half the image to the ground truth,"},{"from":1768,"to":1770.61,"location":2,"content":"you're actually shaving off a lot of the possible modes."},{"from":1770.61,"to":1772.23,"location":2,"content":"So you have a much easier time sampling."},{"from":1772.23,"to":1774.09,"location":2,"content":"So, uh, so the first is,"},{"from":1774.09,"to":1775.93,"location":2,"content":"uh, first is what we supply to the model."},{"from":1775.93,"to":1778.79,"location":2,"content":"The, the, the right row- the rightmost column is,"},{"from":1778.79,"to":1781.15,"location":2,"content":"is gold, and we were able to generate different samples."},{"from":1781.15,"to":1783.27,"location":2,"content":"But what was really interesting is the third row."},{"from":1783.27,"to":1786.18,"location":2,"content":"Uh, so the rightmost column is- the rightmost column is gold."},{"from":1786.18,"to":1788.65,"location":2,"content":"Uh, now if you look at the third row, this horse."},{"from":1788.65,"to":1792.13,"location":2,"content":"So actually there's this sort of glimpse or a suggestion of a pull,"},{"from":1792.13,"to":1795.05,"location":2,"content":"but the model hallucinated a human in some of these,"},{"from":1795.05,"to":1796.17,"location":2,"content":"in some of these images,"},{"from":1796.17,"to":1798.45,"location":2,"content":"which is interesting like in- it does capture- at least"},{"from":1798.45,"to":1802.23,"location":2,"content":"the data teaches it to capture some structure about the world."},{"from":1802.23,"to":1806.19,"location":2,"content":"Um, the dog is just cute and I guess it also shows that, you know,"},{"from":1806.19,"to":1807.48,"location":2,"content":"there was this entire object,"},{"from":1807.48,"to":1810.66,"location":2,"content":"this chair, that the model just completely refused to imagine."},{"from":1810.66,"to":1812.84,"location":2,"content":"So there's a lot of difficulty."},{"from":1812.84,"to":1815.07,"location":2,"content":"And I guess Anna is gonna talk about"},{"from":1815.07,"to":1819.46,"location":2,"content":"[NOISE] another way to exploit self- self-similarity."},{"from":1819.46,"to":1820.08,"location":2,"content":"Thank you."},{"from":1820.08,"to":1831.6,"location":2,"content":"[APPLAUSE]"},{"from":1831.6,"to":1834.06,"location":2,"content":"So thank you Ashish for the introduction."},{"from":1834.06,"to":1837.11,"location":2,"content":"Uh, so there's a lot of self-similarity in images."},{"from":1837.11,"to":1839.46,"location":2,"content":"There's also a lot of self-similarity in, in music."},{"from":1839.46,"to":1843.18,"location":2,"content":"So we can imagine transformer being a, a good model for it."},{"from":1843.18,"to":1846.06,"location":2,"content":"Uh, we- we're going to show how,"},{"from":1846.06,"to":1848.1,"location":2,"content":"uh, we can add more to,"},{"from":1848.1,"to":1850.35,"location":2,"content":"to the self attention, to think more about kind of"},{"from":1850.35,"to":1854.22,"location":2,"content":"relational information and how that could help, uh, music 
generation."},{"from":1854.22,"to":1857.16,"location":2,"content":"[NOISE] So, uh, first I want to"},{"from":1857.16,"to":1861.22,"location":2,"content":"clarify what is the raw representation that we're working with right now."},{"from":1861.22,"to":1863.28,"location":2,"content":"So analogous to language,"},{"from":1863.28,"to":1867.06,"location":2,"content":"you can think about there's text and somebody is reading out a text,"},{"from":1867.06,"to":1869.43,"location":2,"content":"so they add their kind of own intonations to it,"},{"from":1869.43,"to":1872.38,"location":2,"content":"and then you have sound waves coming out of that speech."},{"from":1872.38,"to":1876.1,"location":2,"content":"So for music there's a va- very similar kind of, uh,"},{"from":1876.1,"to":1881.27,"location":2,"content":"line of a generation where you say the composer has an idea,"},{"from":1881.27,"to":1883.36,"location":2,"content":"uh, writes down the score and then,"},{"from":1883.36,"to":1885.58,"location":2,"content":"a performer performs it and then you get sound."},{"from":1885.58,"to":1889.54,"location":2,"content":"So what we're going to focus on today is mostly, uh,"},{"from":1889.54,"to":1891.41,"location":2,"content":"you can think of the score but it's actually,"},{"from":1891.41,"to":1894.08,"location":2,"content":"er, a performance, um,"},{"from":1894.08,"to":1901.55,"location":2,"content":"in that it's a symbolic representation where MIDI pianos were used and,"},{"from":1901.55,"to":1904.07,"location":2,"content":"uh, um, professional amateur, uh,"},{"from":1904.07,"to":1906.64,"location":2,"content":"musicians were performing on the pianos."},{"from":1906.64,"to":1907.89,"location":2,"content":"So we have the recorded,"},{"from":1907.89,"to":1909.66,"location":2,"content":"uh, information of their playing."},{"from":1909.66,"to":1911.3,"location":2,"content":"So in particular, um,"},{"from":1911.3,"to":1915.81,"location":2,"content":"at each time se- step modeling music as this sequential, uh,"},{"from":1915.81,"to":1918.72,"location":2,"content":"process, what is being output are, okay,"},{"from":1918.72,"to":1920.14,"location":2,"content":"turn this note on, ah,"},{"from":1920.14,"to":1921.96,"location":2,"content":"advance the clock by this much,"},{"from":1921.96,"to":1923.22,"location":2,"content":"and then turn this note off."},{"from":1923.22,"to":1925.96,"location":2,"content":"And also there is, uh, dynamics information,"},{"from":1925.96,"to":1927.66,"location":2,"content":"so when you turn the note on, you first say like,"},{"from":1927.66,"to":1929.98,"location":2,"content":"how loud it's going to be."},{"from":1929.98,"to":1933.09,"location":2,"content":"Uh, so traditionally, uh, modeling, uh,"},{"from":1933.09,"to":1935.09,"location":2,"content":"music as kind of a language,"},{"from":1935.09,"to":1938.13,"location":2,"content":"we've been using, uh, recurrent neural networks."},{"from":1938.13,"to":1943.35,"location":2,"content":"And, um, because as Ashish introduced and, and talked about,"},{"from":1943.35,"to":1945.51,"location":2,"content":"there is a lot of compression that needs to happen,"},{"from":1945.51,"to":1949.83,"location":2,"content":"like a long sequence has to be embedded into like a fixed length vector."},{"from":1949.83,"to":1952.2,"location":2,"content":"And that becomes hard when, uh,"},{"from":1952.2,"to":1955.2,"location":2,"content":"in music you have- you have repetition coming,"},{"from":1955.2,"to":1957.15,"location":2,"content":"um, at a 
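The performance encoding described above can be pictured as a small event vocabulary that the model predicts one token at a time. The token names below are illustrative stand-ins in the spirit of that turn-note-on / advance-clock / turn-note-off description, not the exact vocabulary used:

```python
# A toy middle-C note (MIDI pitch 60), played at moderate volume and
# held for half a second, as a sequence of performance events:
performance = [
    "SET_VELOCITY<64>",   # how loud the next note-on will be
    "NOTE_ON<60>",        # turn middle C on
    "TIME_SHIFT<500ms>",  # advance the clock
    "NOTE_OFF<60>",       # turn middle C off
]
for event in performance:
    print(event)
```

Treating a performance as such a token stream is what lets the same sequence models used for text be applied to music directly.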
distance."},{"from":1957.15,"to":1959.49,"location":2,"content":"So, uh, I'm first going to show you,"},{"from":1959.49,"to":1963.27,"location":2,"content":"um, samples from, from the RNNs,"},{"from":1963.27,"to":1966.45,"location":2,"content":"from a transformer and then from a music transformer that has"},{"from":1966.45,"to":1968.58,"location":2,"content":"the relative attention and kind of let you hear"},{"from":1968.58,"to":1971.97,"location":2,"content":"the differences and then I'll go into how we,"},{"from":1971.97,"to":1974.58,"location":2,"content":"uh, what are, what are the, uh,"},{"from":1974.58,"to":1978.66,"location":2,"content":"modifications we needed to do on top of the, uh, transformer model."},{"from":1978.66,"to":1980.74,"location":2,"content":"Uh, so here, uh,"},{"from":1980.74,"to":1983.3,"location":2,"content":"this task is kind of the image completion task."},{"from":1983.3,"to":1988.34,"location":2,"content":"So we give it an initial motif and then we ask the model to do continuations."},{"from":1988.34,"to":1990.66,"location":2,"content":"So this is the motif that we fed."},{"from":1990.66,"to":1996.22,"location":2,"content":"[MUSIC] How many people recognize that?"},{"from":1996.22,"to":1999.09,"location":2,"content":"Awesome. Okay. [LAUGHTER] Yeah,"},{"from":1999.09,"to":2000.38,"location":2,"content":"so this is a, uh,"},{"from":2000.38,"to":2002.9,"location":2,"content":"kind of a fragment from a Chopin Etude piece."},{"from":2002.9,"to":2004.91,"location":2,"content":"And we're going to ask, uh,"},{"from":2004.91,"to":2006.68,"location":2,"content":"the RNN to do a continuation."},{"from":2006.68,"to":2014.99,"location":2,"content":"[NOISE]"},{"from":2014.99,"to":2028.33,"location":2,"content":"[MUSIC]"},{"from":2028.33,"to":2030.95,"location":2,"content":"So in here, like in the beginning, it was trying to repeat it."},{"from":2030.95,"to":2032.33,"location":2,"content":"But very fast, it, er,"},{"from":2032.33,"to":2035.87,"location":2,"content":"wandered off into, its other different ideas."},{"from":2035.87,"to":2038.12,"location":2,"content":"So that's one challenge because it's, uh,"},{"from":2038.12,"to":2041.7,"location":2,"content":"not able to directly look back to what happened in the past, uh, and,"},{"from":2041.7,"to":2044.06,"location":2,"content":"and can just look at kind of a blu- blurry version,"},{"from":2044.06,"to":2046.4,"location":2,"content":"and that blurry version becomes more and more blurry."},{"from":2046.4,"to":2048.45,"location":2,"content":"Uh, so this is what the transformer does."},{"from":2048.45,"to":2050.99,"location":2,"content":"Uh, so so, uh, a detail is, uh,"},{"from":2050.99,"to":2054.45,"location":2,"content":"these models are trained on half the length that you're hearing."},{"from":2054.45,"to":2058.76,"location":2,"content":"So we're kinda asking the model to generalize beyond the length that it's trained on."},{"from":2058.76,"to":2060.17,"location":2,"content":"And you can see for this transformer,"},{"from":2060.17,"to":2062.28,"location":2,"content":"it, it deteriorates beyond that."},{"from":2062.28,"to":2065.15,"location":2,"content":"But it can hold the motif pretty consistent."},{"from":2065.15,"to":2074.69,"location":2,"content":"[MUSIC] Okay. 
You, you,"},{"from":2074.69,"to":2075.77,"location":2,"content":"you ge- you get the idea."},{"from":2075.77,"to":2080.69,"location":2,"content":"[LAUGHTER] So initially, it was able to do this repetition really well."},{"from":2080.69,"to":2082.4,"location":2,"content":"Uh, so it was able to copy it very well."},{"from":2082.4,"to":2084.17,"location":2,"content":"But beyond the length that was trained on,"},{"from":2084.17,"to":2087.44,"location":2,"content":"it kinda didn't know how to cope with, like longer contexts."},{"from":2087.44,"to":2088.88,"location":2,"content":"And, uh, what you see,"},{"from":2088.88,"to":2091.32,"location":2,"content":"uh, the, the last one is from the music transformer."},{"from":2091.32,"to":2093.35,"location":2,"content":"I think so that kind of [NOISE] the relational information."},{"from":2093.35,"to":2096.47,"location":2,"content":"And you can just see visually how it's very consistent and kinda"},{"from":2096.47,"to":2099.94,"location":2,"content":"repeating these [NOISE] these larger, uh, arcs."},{"from":2099.94,"to":2121.19,"location":2,"content":"[MUSIC]"},{"from":2121.19,"to":2123.82,"location":2,"content":"Yeah. So that was, uh, music transformer."},{"from":2123.82,"to":2127.07,"location":2,"content":"And so in music,"},{"from":2127.07,"to":2130.41,"location":2,"content":"the, the self similarity that we talked about, uh,"},{"from":2130.41,"to":2131.76,"location":2,"content":"so we see, uh,"},{"from":2131.76,"to":2132.95,"location":2,"content":"the motif here, and so,"},{"from":2132.95,"to":2135.01,"location":2,"content":"so there we primed the model with a motif,"},{"from":2135.01,"to":2136.45,"location":2,"content":"and this is actually a sample,"},{"from":2136.45,"to":2137.87,"location":2,"content":"unconditioned sample from the model."},{"from":2137.87,"to":2140.69,"location":2,"content":"So nothing, er, there was no priming that the, uh,"},{"from":2140.69,"to":2142.88,"location":2,"content":"model kinda had to create its own motif and then,"},{"from":2142.88,"to":2145.11,"location":2,"content":"uh, do, uh, continuations from there."},{"from":2145.11,"to":2149.21,"location":2,"content":"And here, uh, if we kinda look at it and analyze it a bit, you see,"},{"from":2149.21,"to":2151.8,"location":2,"content":"uh, a lot of repetition,"},{"from":2151.8,"to":2154.04,"location":2,"content":"uh, with gaps in between."},{"from":2154.04,"to":2156.64,"location":2,"content":"And if you look at the self attention structure,"},{"from":2156.64,"to":2158.87,"location":2,"content":"we actually do see the model,"},{"from":2158.87,"to":2160.63,"location":2,"content":"uh, looking at the relevant parts."},{"from":2160.63,"to":2164.07,"location":2,"content":"Even if, if it was not immediately, uh, preceding it."},{"from":2164.07,"to":2165.5,"location":2,"content":"So, so here, uh,"},{"from":2165.5,"to":2169.97,"location":2,"content":"what I colored shaded out is where the motif, um, occurs."},{"from":2169.97,"to":2171.83,"location":2,"content":"Uh, and you can, uh, see the different colors,"},{"from":2171.83,"to":2174.71,"location":2,"content":"there's a different attention heads and they're kinda focusing,"},{"from":2174.71,"to":2176.81,"location":2,"content":"uh, among those, uh, grayed out sections."},{"from":2176.81,"to":2179.75,"location":2,"content":"[NOISE] So I'll play the sample and we also have"},{"from":2179.75,"to":2183.7,"location":2,"content":"a visualization that kind of shows you as the music is pa- 
uh,"},{"from":2183.7,"to":2188.93,"location":2,"content":"is being played or what notes it was attending to as it was predicting that note."},{"from":2188.93,"to":2191.15,"location":2,"content":"And, uh, this was generated from scratch."},{"from":2191.15,"to":2193.88,"location":2,"content":"And, uh, so the self attention is, um,"},{"from":2193.88,"to":2197.27,"location":2,"content":"from, from kind of note to note level or event to event level."},{"from":2197.27,"to":2199.32,"location":2,"content":"So it's, it's quite low level."},{"from":2199.32,"to":2200.97,"location":2,"content":"Uh, so when you look at it, it's,"},{"from":2200.97,"to":2202.66,"location":2,"content":"it's ki- a little bit overwhelming."},{"from":2202.66,"to":2204.35,"location":2,"content":"It has like multiple heads and,"},{"from":2204.35,"to":2205.93,"location":2,"content":"er, a lot of things moving."},{"from":2205.93,"to":2207.95,"location":2,"content":"Uh, but there's kind of these structural moments"},{"from":2207.95,"to":2210.28,"location":2,"content":"where you would kind of see more of this, uh,"},{"from":2210.28,"to":2212.8,"location":2,"content":"clean, uh, kind of,"},{"from":2212.8,"to":2215.27,"location":2,"content":"uh, sections where it's attending to."},{"from":2215.27,"to":2272.39,"location":2,"content":"[MUSIC]"},{"from":2272.39,"to":2273.71,"location":2,"content":"VOkay. So, um,"},{"from":2273.71,"to":2275.69,"location":2,"content":"how, how did we do that?"},{"from":2275.69,"to":2279.44,"location":2,"content":"And so starting from kind of the the regular attention mechanism,"},{"from":2279.44,"to":2282.7,"location":2,"content":"we know it's, uh, a weighted average of the past history."},{"from":2282.7,"to":2284.69,"location":2,"content":"Uh, and the nice thing is, uh,"},{"from":2284.69,"to":2287.16,"location":2,"content":"however far it is, we have direct access to it."},{"from":2287.16,"to":2288.84,"location":2,"content":"So if we know, uh,"},{"from":2288.84,"to":2290.87,"location":2,"content":"there are kind of motifs that occurred,"},{"from":2290.87,"to":2293,"location":2,"content":"uh, in in early on in the piece,"},{"from":2293,"to":2295.39,"location":2,"content":"we're still able to based on, uh,"},{"from":2295.39,"to":2297.08,"location":2,"content":"the fact that things that are similar,"},{"from":2297.08,"to":2299.24,"location":2,"content":"uh, to be able to retrieve those."},{"from":2299.24,"to":2302.91,"location":2,"content":"Um, but, uh, it also becomes,"},{"from":2302.91,"to":2305.03,"location":2,"content":"all the past becomes kind of a bag of words,"},{"from":2305.03,"to":2307.31,"location":2,"content":"like there is no structure of which came,"},{"from":2307.31,"to":2308.57,"location":2,"content":"uh, before or after."},{"from":2308.57,"to":2311.2,"location":2,"content":"So there's the positional sinusoids that Ashish talked about."},{"from":2311.2,"to":2313.59,"location":2,"content":"That, uh, basically in this, uh,"},{"from":2313.59,"to":2318.39,"location":2,"content":"indices indexes into a sinusoids that are moving at different speeds."},{"from":2318.39,"to":2320.64,"location":2,"content":"And so close-by positions would have, uh,"},{"from":2320.64,"to":2322.16,"location":2,"content":"a very similar kind of, uh,"},{"from":2322.16,"to":2326.32,"location":2,"content":"cross section into those multiple sinusoids."},{"from":2326.32,"to":2328.8,"location":2,"content":"Uh, in contrast for, er,"},{"from":2328.8,"to":2330.92,"location":2,"content":"for convolutions, you kinda have this, 
uh,"},{"from":2330.92,"to":2334.94,"location":2,"content":"fixed filter that's moving around that captures the relative distance."},{"from":2334.94,"to":2336.88,"location":2,"content":"Like 1B4, 2B4."},{"from":2336.88,"to":2339.18,"location":2,"content":"And these are kind of, uh,"},{"from":2339.18,"to":2342.93,"location":2,"content":"in some ways like a rigid structure that allows you to be, uh,"},{"from":2342.93,"to":2344.93,"location":2,"content":"a kind of, uh, bring in the,"},{"from":2344.93,"to":2347.44,"location":2,"content":"the distance information very explicitly."},{"from":2347.44,"to":2350.77,"location":2,"content":"Um, you can imagine relative attention, um,"},{"from":2350.77,"to":2353.08,"location":2,"content":"with the multiple heads, uh, at play,"},{"from":2353.08,"to":2355.39,"location":2,"content":"uh, to be some combination of these."},{"from":2355.39,"to":2357.17,"location":2,"content":"So, uh, on one hand,"},{"from":2357.17,"to":2358.58,"location":2,"content":"you can access, uh,"},{"from":2358.58,"to":2360.49,"location":2,"content":"the the history very directly."},{"from":2360.49,"to":2362.51,"location":2,"content":"On the other hand, you also know, er,"},{"from":2362.51,"to":2365.21,"location":2,"content":"how you rel- relate to this history."},{"from":2365.21,"to":2366.86,"location":2,"content":"Uh, capturing for example,"},{"from":2366.86,"to":2369.57,"location":2,"content":"like translational invariance and, er,"},{"from":2369.57,"to":2372.44,"location":2,"content":"and we, uh, and for example,"},{"from":2372.44,"to":2375.45,"location":2,"content":"we think one of the reasons why in the beginning, uh,"},{"from":2375.45,"to":2378.83,"location":2,"content":"priming samples that you heard that the, uh,"},{"from":2378.83,"to":2380.95,"location":2,"content":"music transformer was able to generate"},{"from":2380.95,"to":2383.74,"location":2,"content":"beyond the length that it was trained on at a very coherent way,"},{"from":2383.74,"to":2387.83,"location":2,"content":"is that it's able to kind of rely on this translational invariance to to carry,"},{"from":2387.83,"to":2390.78,"location":2,"content":"uh, the relational information forward."},{"from":2390.78,"to":2395,"location":2,"content":"So, if we take a closer look at how how how the,"},{"from":2395,"to":2396.55,"location":2,"content":"how this works is, uh,"},{"from":2396.55,"to":2398.54,"location":2,"content":"the regular transformer you have,"},{"from":2398.54,"to":2400.25,"location":2,"content":"you compare all the queries and keys,"},{"from":2400.25,"to":2402.26,"location":2,"content":"so you get kind of this, uh, square matrix."},{"from":2402.26,"to":2404.39,"location":2,"content":"You can think of it as like a self similarity,"},{"from":2404.39,"to":2406.01,"location":2,"content":"uh, matrix, so it's, uh, a square."},{"from":2406.01,"to":2408.89,"location":2,"content":"Uh, what relative attention does is,"},{"from":2408.89,"to":2412.36,"location":2,"content":"to add an additional term that thinks, uh,"},{"from":2412.36,"to":2414.53,"location":2,"content":"that thinks about whenever you're comparing two things,"},{"from":2414.53,"to":2416.21,"location":2,"content":"how far are you apart?"},{"from":2416.21,"to":2418.82,"location":2,"content":"And also based on the content, do I,"},{"from":2418.82,"to":2421.34,"location":2,"content":"do I care about things that are two steps away or"},{"from":2421.34,"to":2424.18,"location":2,"content":"three steps away or I maybe care about things that are 
recurring,"},{"from":2424.18,"to":2426.28,"location":2,"content":"at kind of a periodical distance."},{"from":2426.28,"to":2429.3,"location":2,"content":"And, uh, with that information gathered,"},{"from":2429.3,"to":2433.84,"location":2,"content":"that influences, uh, the the similarity between positions."},{"from":2433.84,"to":2435.82,"location":2,"content":"And in particular, uh,"},{"from":2435.82,"to":2439.46,"location":2,"content":"this extra term is based on, um, the distance."},{"from":2439.46,"to":2440.51,"location":2,"content":"So you wanna, uh,"},{"from":2440.51,"to":2441.95,"location":2,"content":"gather the embeddings, uh,"},{"from":2441.95,"to":2444.5,"location":2,"content":"that's irrelevant to the, uh,"},{"from":2444.5,"to":2446.18,"location":2,"content":"the query key distances,"},{"from":2446.18,"to":2449.28,"location":2,"content":"uh, on the [NOISE] on the logits."},{"from":2449.28,"to":2451.72,"location":2,"content":"So, in translation, this,"},{"from":2451.72,"to":2453.23,"location":2,"content":"uh, has shown, uh,"},{"from":2453.23,"to":2455.01,"location":2,"content":"a lot of improvement in,"},{"from":2455.01,"to":2457.73,"location":2,"content":"um, for example English to to German translation."},{"from":2457.73,"to":2460.01,"location":2,"content":"Uh, but in translation,"},{"from":2460.01,"to":2461.76,"location":2,"content":"the sequences are usually quite short."},{"from":2461.76,"to":2463.41,"location":2,"content":"It's only a sentence to sentence."},{"from":2463.41,"to":2465.11,"location":2,"content":"Uh, a translation for example,"},{"from":2465.11,"to":2467.21,"location":2,"content":"maybe 50 words or 100 words."},{"from":2467.21,"to":2472.01,"location":2,"content":"But the music, er, samples that you've heard are in the range of 2,000 time-steps."},{"from":2472.01,"to":2476.01,"location":2,"content":"So it's like 2,000 tokens need to be able to fit in memory."},{"from":2476.01,"to":2477.5,"location":2,"content":"So this was a problem, uh,"},{"from":2477.5,"to":2483.35,"location":2,"content":"because the original formulation relied on building this 3D tensor that's,"},{"from":2483.35,"to":2485.8,"location":2,"content":"uh, that's very large in memory."},{"from":2485.8,"to":2487.72,"location":2,"content":"Um, and and why this is the case?"},{"from":2487.72,"to":2490.05,"location":2,"content":"It's because for every pair,"},{"from":2490.05,"to":2492.71,"location":2,"content":"uh, you look up what the,"},{"from":2492.71,"to":2495.2,"location":2,"content":"what the re- so you can compute what the relative distance is,"},{"from":2495.2,"to":2498.32,"location":2,"content":"and then you look up an embedding that corresponds to that distance."},{"from":2498.32,"to":2503.54,"location":2,"content":"So, um, for like this there's a length by length, like L by L, uh, matrix."},{"from":2503.54,"to":2504.82,"location":2,"content":"You need like, uh,"},{"from":2504.82,"to":2507.64,"location":2,"content":"to collect embeddings for each of the positions and that's, uh,"},{"from":2507.64,"to":2511.07,"location":2,"content":"depth D. 
So that gives us the 3D."},{"from":2511.07,"to":2512.9,"location":2,"content":"What we realized is,"},{"from":2512.9,"to":2518.48,"location":2,"content":"you can actually just directly multiply the queries and the embedding distances."},{"from":2518.48,"to":2520.67,"location":2,"content":"[NOISE] And they, uh,"},{"from":2520.67,"to":2522.08,"location":2,"content":"come out kind of in a different order,"},{"from":2522.08,"to":2524.63,"location":2,"content":"because now you have the queries ordered by a relative distance,"},{"from":2524.63,"to":2527.93,"location":2,"content":"but you need the queries ordered by keys, uh,"},{"from":2527.93,"to":2531.44,"location":2,"content":"which is kind of a absolute by absolute, uh, configuration."},{"from":2531.44,"to":2533.36,"location":2,"content":"So what we could do is just, uh,"},{"from":2533.36,"to":2536.7,"location":2,"content":"do a series of skewing, uh,"},{"from":2536.7,"to":2540.51,"location":2,"content":"to to put it into the right, uh, configuration."},{"from":2540.51,"to":2543.88,"location":2,"content":"And this is, uh, yeah."},{"from":2543.88,"to":2545.57,"location":2,"content":"Just a, just a quick contrast to,"},{"from":2545.57,"to":2548.48,"location":2,"content":"to show, um, the difference in memory requirements."},{"from":2548.48,"to":2551.7,"location":2,"content":"So, er, a lot of the times the challenge is in, uh,"},{"from":2551.7,"to":2553.82,"location":2,"content":"being able to scale, uh, you know,"},{"from":2553.82,"to":2557.66,"location":2,"content":"being able to be more memory efficient so that [NOISE] you can model longer sequences."},{"from":2557.66,"to":2560.42,"location":2,"content":"So with that, uh, this is,"},{"from":2560.42,"to":2562.85,"location":2,"content":"um, I can play you one more example if we have time."},{"from":2562.85,"to":2565.13,"location":2,"content":"But if we don't have time, we can, go ahead."},{"from":2565.13,"to":2566.18,"location":2,"content":"We'll see more of that."},{"from":2566.18,"to":2567.98,"location":2,"content":"Okay. [LAUGHTER] So this is,"},{"from":2567.98,"to":2569.93,"location":2,"content":"this is, uh, maybe a one, uh,"},{"from":2569.93,"to":2574.48,"location":2,"content":"about a one-minute sample and I- I hope you like it."},{"from":2574.48,"to":2646.39,"location":2,"content":"Thanks. [MUSIC]"},{"from":2646.39,"to":2647.72,"location":2,"content":"Thank you for listening."},{"from":2647.72,"to":2658.61,"location":2,"content":"[APPLAUSE]."},{"from":2658.61,"to":2663.83,"location":2,"content":"[LAUGHTER] Thanks, Anna. 
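Before moving on, here is a minimal NumPy sketch of the memory-efficient relative attention just described, assuming single-head causal attention with one learned embedding per relative distance. The "skewing" step reorders the query-by-relative-distance products into the absolute-by-absolute layout the logits need, so the L x L x D gather is never materialized. Shapes and names here are illustrative, not any particular codebase's API.

```python
import numpy as np

def relative_logits(Q, E):
    """Q: (L, D) queries. E: (L, D) relative-position embeddings, where row r
    encodes relative distance r - (L - 1), so the last row is distance 0.
    Returns S_rel with S_rel[i, j] = q_i . e_(j - i), valid for j <= i
    (the causal mask hides the out-of-range upper triangle)."""
    L, _ = Q.shape
    QE = Q @ E.T                           # (L, L): queries x relative distances
    padded = np.pad(QE, ((0, 0), (1, 0)))  # pad one dummy column on the left
    skewed = padded.reshape(L + 1, L)      # the "skew": realign so col j means distance j - i
    return skewed[1:]                      # drop the dummy first row -> (L, L)

# Attention logits with the extra relative term added to the usual QK^T term.
L_seq, D = 6, 8
rng = np.random.default_rng(0)
Q = rng.normal(size=(L_seq, D))
K = rng.normal(size=(L_seq, D))
E = rng.normal(size=(L_seq, D))            # one embedding per relative distance
logits = (Q @ K.T + relative_logits(Q, E)) / np.sqrt(D)
```

This keeps the extra memory at O(L x D) for the embeddings plus the O(L^2) logits attention needs anyway, instead of an O(L^2 x D) tensor of per-pair embedding lookups.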
Um, um, great."},{"from":2663.83,"to":2666.97,"location":2,"content":"Um, so to sort to, um,"},{"from":2666.97,"to":2671.61,"location":2,"content":"so relative attention has been a powerful mechanism for,"},{"from":2671.61,"to":2675.18,"location":2,"content":"um, a very powerful mechanism for music."},{"from":2675.18,"to":2677.28,"location":2,"content":"It's also helped in machine translation."},{"from":2677.28,"to":2679.36,"location":2,"content":"Um, one really interesting, uh,"},{"from":2679.36,"to":2681.55,"location":2,"content":"consequences of, uh, of, um,"},{"from":2681.55,"to":2684.39,"location":2,"content":"one really interesting consequence of relative attention in,"},{"from":2684.39,"to":2686.03,"location":2,"content":"uh, images, is that,"},{"from":2686.03,"to":2688.37,"location":2,"content":"um, like convolutions achieve,"},{"from":2688.37,"to":2690.74,"location":2,"content":"uh, convolutions achieve translational equivariance."},{"from":2690.74,"to":2691.97,"location":2,"content":"So if you have,"},{"from":2691.97,"to":2694.64,"location":2,"content":"let's say, you wa- uh, you have this,"},{"from":2694.64,"to":2698.34,"location":2,"content":"this red dot or this feature that you're computing at this red dot,"},{"from":2698.34,"to":2701.47,"location":2,"content":"it doesn't depend on where the image of the dog is in the image,"},{"from":2701.47,"to":2704.72,"location":2,"content":"is in the the larger image. It just doesn't depend on its absolute location."},{"from":2704.72,"to":2707,"location":2,"content":"It's going to, it's going to produce the same activation."},{"from":2707,"to":2710.91,"location":2,"content":"So you have- convolutions have this nice, uh, translation equivariance."},{"from":2710.91,"to":2713.14,"location":2,"content":"Now, with, with relative,"},{"from":2713.14,"to":2715.22,"location":2,"content":"uh, positions or relative attention,"},{"from":2715.22,"to":2718.55,"location":2,"content":"you get exactly the same effect because you don't have any- once you just"},{"from":2718.55,"to":2722.49,"location":2,"content":"remove this notion of absolute position that you are injecting [NOISE] into the model,"},{"from":2722.49,"to":2724.28,"location":2,"content":"uh, once you've, once you've removed that,"},{"from":2724.28,"to":2726.47,"location":2,"content":"then your attention computation,"},{"from":2726.47,"to":2728.84,"location":2,"content":"because it actually includes I mean, we've,"},{"from":2728.84,"to":2732.22,"location":2,"content":"we've- Niki and I couple of others have actually,"},{"from":2732.22,"to":2734.87,"location":2,"content":"and Anna were actually working on images and seems-"},{"from":2734.87,"to":2737.48,"location":2,"content":"and it seems to actually show, uh, better results."},{"from":2737.48,"to":2742.04,"location":2,"content":"Um, this actio- this now satisfies this,"},{"from":2742.04,"to":2744.44,"location":2,"content":"uh, uh, the- I mean, it,"},{"from":2744.44,"to":2747.47,"location":2,"content":"it can achieve translation equivariance which is a great property for images."},{"from":2747.47,"to":2749.3,"location":2,"content":"So there's a lot of- it seems like this might be"},{"from":2749.3,"to":2751.25,"location":2,"content":"an interesting direction to pursue if you want to push,"},{"from":2751.25,"to":2755.09,"location":2,"content":"uh, Self-Attention in images for a self-supervised learning."},{"from":2755.09,"to":2759.78,"location":2,"content":"Um, I guess on, on self-supervised learning so the geni- generative modeling work 
that,"},{"from":2759.78,"to":2761.45,"location":2,"content":"that I talked about before in,"},{"from":2761.45,"to":2765.32,"location":2,"content":"in itself just having probabilistic models of images is, I mean,"},{"from":2765.32,"to":2766.89,"location":2,"content":"I guess the best model of an image is I,"},{"from":2766.89,"to":2769.58,"location":2,"content":"I go to Google search and I pick up an image and I just give it to you,"},{"from":2769.58,"to":2772.32,"location":2,"content":"but I guess generative models of images are useful because,"},{"from":2772.32,"to":2774.47,"location":2,"content":"if you want to do something like semis-, uh, uh,"},{"from":2774.47,"to":2776.81,"location":2,"content":"self supervised learning where you just pre-train a model on"},{"from":2776.81,"to":2779.38,"location":2,"content":"a lot of- on a lot of unlabeled data then you transfer it."},{"from":2779.38,"to":2782.76,"location":2,"content":"So hopefully, this is gonna help and this is gonna be a part of that machinery."},{"from":2782.76,"to":2786.89,"location":2,"content":"Um, another interesting, uh,"},{"from":2786.89,"to":2790.52,"location":2,"content":"another indus-interesting structure that relative attention allows you to model,"},{"from":2790.52,"to":2791.96,"location":2,"content":"is, uh, is, is kind of a graph."},{"from":2791.96,"to":2793.52,"location":2,"content":"So imagine you have this, uh,"},{"from":2793.52,"to":2796.26,"location":2,"content":"you have this similarity graph where these red edges are,"},{"from":2796.26,"to":2797.6,"location":2,"content":"are this notion of companies,"},{"from":2797.6,"to":2800.18,"location":2,"content":"and the blue edge is a notion of a fruit, uh,"},{"from":2800.18,"to":2804.5,"location":2,"content":"and um, an apple takes these two forms."},{"from":2804.5,"to":2807.14,"location":2,"content":"And, uh, and you could just imagine"},{"from":2807.14,"to":2810.65,"location":2,"content":"relative attention just modeling this- just being able to model,"},{"from":2810.65,"to":2812.28,"location":2,"content":"or being able to- you, you,"},{"from":2812.28,"to":2816.17,"location":2,"content":"yourself being able to impose these different notions of similarity uh,"},{"from":2816.17,"to":2818.38,"location":2,"content":"between, uh, between, uh, different elements."},{"from":2818.38,"to":2820.72,"location":2,"content":"Uh, so if you have like, if you have graph problems, um,"},{"from":2820.72,"to":2823.93,"location":2,"content":"then relative self-attention might be a good fit for you."},{"from":2823.93,"to":2828.53,"location":2,"content":"Um, there's also, there's also a simi- quite a position paper by Battaglia et al from"},{"from":2828.53,"to":2833.93,"location":2,"content":"Deep Mind that talks about relative attention and how it can be used, um, within graphs."},{"from":2833.93,"to":2835.58,"location":2,"content":"So while we're on graphs,"},{"from":2835.58,"to":2838.68,"location":2,"content":"I just wanted to- perhaps might be interesting to connect,"},{"from":2838.68,"to":2841.49,"location":2,"content":"um, uh, of- some, uh,"},{"from":2841.49,"to":2842.81,"location":2,"content":"excellent work that was done on, uh,"},{"from":2842.81,"to":2845.03,"location":2,"content":"on graphs called Message Passing Neural Networks."},{"from":2845.03,"to":2847.26,"location":2,"content":"And it's quite funny, so if you look at,"},{"from":2847.26,"to":2850.73,"location":2,"content":"if you look at the message passing function, 
um,"},{"from":2850.73,"to":2854.48,"location":2,"content":"what it's saying is you're actually just passing messages between pairs of nodes."},{"from":2854.48,"to":2857.09,"location":2,"content":"So you can just think of self attention as imposing a fully connect- it's"},{"from":2857.09,"to":2859.97,"location":2,"content":"like a bipe- a full, a complete bipartite graph,"},{"from":2859.97,"to":2862.25,"location":2,"content":"and, uh, you're, you're passing messages between,"},{"from":2862.25,"to":2863.75,"location":2,"content":"you're passing messages between nodes."},{"from":2863.75,"to":2866.54,"location":2,"content":"Now message passing, message passing neural networks did exactly that."},{"from":2866.54,"to":2869.42,"location":2,"content":"They were passing messages between nodes as well. And how are they different?"},{"from":2869.42,"to":2871.58,"location":2,"content":"Well, the only way that when- well, mathematically,"},{"from":2871.58,"to":2873.97,"location":2,"content":"they were only different in that message passing was,"},{"from":2873.97,"to":2877.37,"location":2,"content":"was, uh, forcing the messages to be between pairs of nodes,"},{"from":2877.37,"to":2880.79,"location":2,"content":"but just because of the Softmax function where you get interaction between all the nodes,"},{"from":2880.79,"to":2883.18,"location":2,"content":"self attention is like a message passing mechanism,"},{"from":2883.18,"to":2885.47,"location":2,"content":"where the interactions are between all, all nodes."},{"from":2885.47,"to":2887.32,"location":2,"content":"So, uh, they're, they're like,"},{"from":2887.32,"to":2888.8,"location":2,"content":"they're not too far mathematically,"},{"from":2888.8,"to":2891.32,"location":2,"content":"and also the me- the Message Passing Paper introduces"},{"from":2891.32,"to":2894.62,"location":2,"content":"an interesting concept called Multiple Towers that are similar to multi-head attention,"},{"from":2894.62,"to":2896.59,"location":2,"content":"uh, that, that Norman invented."},{"from":2896.59,"to":2901.16,"location":2,"content":"And, uh, it's like you run k copies of these message passing neural networks in parallel."},{"from":2901.16,"to":2903.59,"location":2,"content":"So there's a lot of similarity between existing, you know,"},{"from":2903.59,"to":2907.8,"location":2,"content":"this connects to work that existed before but these connections sort of came in later."},{"from":2907.8,"to":2911.93,"location":2,"content":"Um, we have a graph library where we kind of connected these both,"},{"from":2911.93,"to":2914.15,"location":2,"content":"both these strands message passing and, uh, we,"},{"from":2914.15,"to":2917.49,"location":2,"content":"uh, we put it out in tensor2tensor."},{"from":2917.49,"to":2920.7,"location":2,"content":"Um, so to sort of summarize, um,"},{"from":2920.7,"to":2923.51,"location":2,"content":"the properties that Self-Attention has been able to help"},{"from":2923.51,"to":2926.24,"location":2,"content":"us model is this constant path length between any two,"},{"from":2926.24,"to":2927.87,"location":2,"content":"any two positions, and it's been,"},{"from":2927.87,"to":2929.6,"location":2,"content":"it's been shown to be quite useful in,"},{"from":2929.6,"to":2932.16,"location":2,"content":"in, in, uh, in sequence modeling."},{"from":2932.16,"to":2936.2,"location":2,"content":"This advantage of having unbounded memory not having to pack information in finite,"},{"from":2936.2,"to":2938.36,"location":2,"content":"in, in sort of a finite amount of- 
in a,"},{"from":2938.36,"to":2939.57,"location":2,"content":"in a fixed amount of space,"},{"from":2939.57,"to":2943.63,"location":2,"content":"uh, where in, in our case our memory essentially grows with the sequences is,"},{"from":2943.63,"to":2947.18,"location":2,"content":"is helps you computationally, uh, it's trivial to parallelize."},{"from":2947.18,"to":2949.11,"location":2,"content":"You can, you can crunch a lot of data, it's uh,"},{"from":2949.11,"to":2952.04,"location":2,"content":"which is useful if you wanna have your large data sets."},{"from":2952.04,"to":2954.28,"location":2,"content":"We found that it can model Self-Similarity."},{"from":2954.28,"to":2956.33,"location":2,"content":"Uh, It seems to be a very natural thing, uh,"},{"from":2956.33,"to":2960.35,"location":2,"content":"a very, a very natural phenomenon if you're dealing with images or music."},{"from":2960.35,"to":2963.2,"location":2,"content":"Also, relative attention allows you to sort of, gives you this added dimension"},{"from":2963.2,"to":2966.08,"location":2,"content":"of being able to model expressive timing and music,"},{"from":2966.08,"to":2967.93,"location":2,"content":"well, this translational equivariance,"},{"from":2967.93,"to":2970.47,"location":2,"content":"uh, it extends naturally to graphs."},{"from":2970.47,"to":2977.03,"location":2,"content":"Um, so this part or everything that I talked so far was about sort of parallel training."},{"from":2977.03,"to":2981.91,"location":2,"content":"Um, so there's a very active area of research now using the Self-Attention models for,"},{"from":2981.91,"to":2983.97,"location":2,"content":"for, for less auto-regressive generation."},{"from":2983.97,"to":2985.79,"location":2,"content":"So notice a- at generation time,"},{"from":2985.79,"to":2987.57,"location":2,"content":"notice that the decoder mask was causal,"},{"from":2987.57,"to":2988.67,"location":2,"content":"we couldn't look into the future."},{"from":2988.67,"to":2991.19,"location":2,"content":"So when we're, when we're generating we're still"},{"from":2991.19,"to":2994.25,"location":2,"content":"generating sequentially left to right on the target side."},{"from":2994.25,"to":2996.84,"location":2,"content":"Um, so, um, and, and,"},{"from":2996.84,"to":2999.17,"location":2,"content":"and, and why, why is generation hard?"},{"from":2999.17,"to":3000.67,"location":2,"content":"Well, because your outputs are multi-modal."},{"from":3000.67,"to":3002.84,"location":2,"content":"I f you had- if you want to translate English to German,"},{"from":3002.84,"to":3004.28,"location":2,"content":"there's multiple ways and,"},{"from":3004.28,"to":3008.41,"location":2,"content":"and, and your, your second word that you're translating will depend on the first word."},{"from":3008.41,"to":3011.61,"location":2,"content":"For example, if you, if you first- the first word that you predict was danke,"},{"from":3011.61,"to":3013.68,"location":2,"content":"then that's going to change the second word that you predict."},{"from":3013.68,"to":3015.67,"location":2,"content":"And if you just predicted them independently,"},{"from":3015.67,"to":3017.62,"location":2,"content":"then you can imagine you can just have all sorts of"},{"from":3017.62,"to":3020.18,"location":2,"content":"permutations of these which will be incorrect."},{"from":3020.18,"to":3022.69,"location":2,"content":"Uh, and the way we actually break modes is"},{"from":3022.69,"to":3024.94,"location":2,"content":"just- or we make decisions is just sequential 
generation."},{"from":3024.94,"to":3027.7,"location":2,"content":"Once we commit to a word that makes a decision,"},{"from":3027.7,"to":3030.49,"location":2,"content":"and then that nails down what's the next word that you're going to predict."},{"from":3030.49,"to":3034.21,"location":2,"content":"So there's been some, there's been some work on, it's an active research area, uh,"},{"from":3034.21,"to":3036.7,"location":2,"content":"and you can kind of categorize some of these papers like"},{"from":3036.7,"to":3041.74,"location":2,"content":"the non-autogressive transformer of the fast- the third paper, fast decoding."},{"from":3041.74,"to":3043.87,"location":2,"content":"Um, the fourth paper towards a better understanding"},{"from":3043.87,"to":3046,"location":2,"content":"of all Vector Quantized Auto-encoders into this group,"},{"from":3046,"to":3049.26,"location":2,"content":"where they're actually make- doing the decision making in a latent space,"},{"from":3049.26,"to":3053.47,"location":2,"content":"that's being, uh, it's e- either being learned using word alignments,"},{"from":3053.47,"to":3056.86,"location":2,"content":"uh, fertilities, or that's being learned using Auto-encoders."},{"from":3056.86,"to":3059.68,"location":2,"content":"So you make- you do the decision making in latent space,"},{"from":3059.68,"to":3062.28,"location":2,"content":"and then you- once you've made the decisions in latent space,"},{"from":3062.28,"to":3064.03,"location":2,"content":"you assume that all your outputs,"},{"from":3064.03,"to":3065.72,"location":2,"content":"are actually conditionally independent,"},{"from":3065.72,"to":3067.18,"location":2,"content":"given that you've made these decisions."},{"from":3067.18,"to":3068.49,"location":2,"content":"So that's how they actually speed up."},{"from":3068.49,"to":3070.6,"location":2,"content":"There's also- there's ano- there's another paper."},{"from":3070.6,"to":3071.86,"location":2,"content":"The second one is a"},{"from":3071.86,"to":3074.02,"location":2,"content":"paper that does Iterative Refinement."},{"from":3074.02,"to":3077.66,"location":2,"content":"There is also a Blockwise Parallel Decoding paper by Mitchell Stern,"},{"from":3077.66,"to":3080.22,"location":2,"content":"uh, Noam Shazeer, and Jakob Uszkoreit, uh,"},{"from":3080.22,"to":3083.44,"location":2,"content":"where they essentially just run multiple models like, uh,"},{"from":3083.44,"to":3089.44,"location":2,"content":"and rescore using a more- a decode using a faster model and score,"},{"from":3089.44,"to":3091.41,"location":2,"content":"using the more expensive model."},{"from":3091.41,"to":3093.58,"location":2,"content":"So that's how it sort of it speeds it up."},{"from":3093.58,"to":3098.35,"location":2,"content":"Um, [NOISE] transfer learning has had the- Self-Attention has been beneficial in transfer"},{"from":3098.35,"to":3102.82,"location":2,"content":"learning, GPT from OpenAI and BERT are two classic examples."},{"from":3102.82,"to":3104.99,"location":2,"content":"There's been some work on actually, scaling this up,"},{"from":3104.99,"to":3108.09,"location":2,"content":"like add a factor as, uh, efficient optimizer."},{"from":3108.09,"to":3112.12,"location":2,"content":"Um, there's a, there's a recent paper by Rohan Anil and Yoram Singer."},{"from":3112.12,"to":3114.45,"location":2,"content":"Um, there's also Mesh-Tensorflow,"},{"from":3114.45,"to":3117.85,"location":2,"content":"which actually they've been able to train 
models"},{"from":3117.85,"to":3122.53,"location":2,"content":"of just several orders of magnitude larger than the original models have been trained."},{"from":3122.53,"to":3125.71,"location":2,"content":"So there's, I mean, when you're working this large data regime you would probably want to"},{"from":3125.71,"to":3127.72,"location":2,"content":"memorize a lot of- you want to memorize"},{"from":3127.72,"to":3130.27,"location":2,"content":"a lot of things inside your parameters used to train a larger model."},{"from":3130.27,"to":3132.41,"location":2,"content":"Uh, Mesh-Tensorflow can uh, can let you do that."},{"from":3132.41,"to":3136.3,"location":2,"content":"Um, there has been a lot of interesting work, universal transformers,"},{"from":3136.3,"to":3139.24,"location":2,"content":"sort of recurrent neural networks can actually count very nicely."},{"from":3139.24,"to":3141.91,"location":2,"content":"There's these cute papers by Schmidhuber where he actually shows"},{"from":3141.91,"to":3145.48,"location":2,"content":"that recurring neural, the count- the cell mechanism just learns a nice counter,"},{"from":3145.48,"to":3147.34,"location":2,"content":"like if you're- you can learn kind of a to the n,"},{"from":3147.34,"to":3149.23,"location":2,"content":"b to the n, uh, with LSTM."},{"from":3149.23,"to":3151.74,"location":2,"content":"So then, uh, universals transformers"},{"from":3151.74,"to":3154.66,"location":2,"content":"brings back recurrence in depth inside the transformer."},{"from":3154.66,"to":3157.2,"location":2,"content":"Uh, there is a really cool Wikipedia paper,"},{"from":3157.2,"to":3160.9,"location":2,"content":"um, simultaneously with the image transformer paper that also uses local attention."},{"from":3160.9,"to":3166.06,"location":2,"content":"Transformer-XL paper that sort of combines recurrence with Self-Attention,"},{"from":3166.06,"to":3167.29,"location":2,"content":"so they do Self-Attention in chunks,"},{"from":3167.29,"to":3170.28,"location":2,"content":"but they sort of summarize history by using recurrence, it's kinda cute."},{"from":3170.28,"to":3172.14,"location":2,"content":"It's been used in speech but I don't know if there's been"},{"from":3172.14,"to":3175.32,"location":2,"content":"some fairly big success stories of Self-Attention in speech."},{"from":3175.32,"to":3178.35,"location":2,"content":"Uh, again, similar issues where you have very large, uh,"},{"from":3178.35,"to":3180.9,"location":2,"content":"um as positions to,"},{"from":3180.9,"to":3183.16,"location":2,"content":"uh, to do Self-Attention over."},{"from":3183.16,"to":3188.05,"location":2,"content":"So yeah, um, self supervision is a- if it works it would be,"},{"from":3188.05,"to":3189.64,"location":2,"content":"it would be, it would be very beneficial."},{"from":3189.64,"to":3192.91,"location":2,"content":"We wouldn't need large label datasets, understanding transfer,"},{"from":3192.91,"to":3195.49,"location":2,"content":"transfers is becoming very succe- becoming- is becoming"},{"from":3195.49,"to":3198.96,"location":2,"content":"a reality in NLP with BERT and some of these other models."},{"from":3198.96,"to":3201.63,"location":2,"content":"So understanding how these, what's actually happening is a-"},{"from":3201.63,"to":3204.55,"location":2,"content":"is an interesting area of ongoing research for me and a couple."},{"from":3204.55,"to":3209.51,"location":2,"content":"And a few of my collaborators and uh, multitask learning and surmounting 
this,"},{"from":3209.51,"to":3212.86,"location":2,"content":"this quadratic problem with Self-Attention is"},{"from":3212.86,"to":3218.15,"location":2,"content":"an interesting area of research that I- that I'd like to pursue. Thank you."}]} \ No newline at end of file diff --git a/bcc-en/15.bcc b/bcc-en/15.bcc new file mode 100644 index 0000000000000000000000000000000000000000..7a0592c030a0f5b117df40db4c3959fd8ef75301 --- /dev/null +++ b/bcc-en/15.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.76,"to":9.99,"location":2,"content":"So today we're gonna be learning about Natural Language Generation."},{"from":9.99,"to":12.46,"location":2,"content":"And uh, this is probably going to be a little different to"},{"from":12.46,"to":16.38,"location":2,"content":"my previous lectures because this is going to be much more of a kind of survey,"},{"from":16.38,"to":17.75,"location":2,"content":"of lots of cutting edge, uh,"},{"from":17.75,"to":21.06,"location":2,"content":"research topics that are happening in NLG right now."},{"from":21.06,"to":22.8,"location":2,"content":"So before we get to that, uh,"},{"from":22.8,"to":24.32,"location":2,"content":"we've got a few announcements."},{"from":24.32,"to":26.37,"location":2,"content":"Uh, so I guess the main announcement is just,"},{"from":26.37,"to":28.29,"location":2,"content":"thank you all so much for your hard work."},{"from":28.29,"to":31.02,"location":2,"content":"I know, um, the last week or two have been pretty tough."},{"from":31.02,"to":33.45,"location":2,"content":"Uh, assignment five was really quite difficult,"},{"from":33.45,"to":35.3,"location":2,"content":"I think, and it was a challenge to do it in eight days."},{"from":35.3,"to":38,"location":2,"content":"So we just really appreciate all the hard work you've put in."},{"from":38,"to":40.91,"location":2,"content":"Um, we also understand the project proposal was,"},{"from":40.91,"to":45.59,"location":2,"content":"uh, sometimes a bit difficult to understand the expectations for some people."},{"from":45.59,"to":47.99,"location":2,"content":"Um, so, yeah, these are both new components of"},{"from":47.99,"to":50.47,"location":2,"content":"the class this year that were not present last year."},{"from":50.47,"to":52.12,"location":2,"content":"Um, so you know,"},{"from":52.12,"to":55.27,"location":2,"content":"we have to go through some learning curves as well as the teaching staff."},{"from":55.27,"to":57.71,"location":2,"content":"So just we really want to say thank you so much, uh,"},{"from":57.71,"to":60.37,"location":2,"content":"for putting everything into this class."},{"from":60.37,"to":63.23,"location":2,"content":"And please do continue to give us your feedback both right"},{"from":63.23,"to":67.08,"location":2,"content":"now and in the end of quarter feedback survey."},{"from":67.08,"to":70.69,"location":2,"content":"Okay, so here's the overview for what we're going to be doing today."},{"from":70.69,"to":73.95,"location":2,"content":"So today we're going to learn about what's happening in the world"},{"from":73.95,"to":76.96,"location":2,"content":"of neural approaches for Natural Language Generation."},{"from":76.96,"to":78.69,"location":2,"content":"Uh, that is a super,"},{"from":78.69,"to":82.06,"location":2,"content":"super broad title, Natural Language Generation."},{"from":82.06,"to":85.69,"location":2,"content":"Um, NLG encompasses a huge variety of research 
areas"},{"from":85.69,"to":87.8,"location":2,"content":"and pretty much each of those could have had"},{"from":87.8,"to":89.83,"location":2,"content":"their own lectures and we could have taught a whole,"},{"from":89.83,"to":94.03,"location":2,"content":"a whole quarter- quarter's worth of classes on, ah, NLG."},{"from":94.03,"to":97.47,"location":2,"content":"Uh, but we're going to try to cover a selection of things today."},{"from":97.47,"to":101.52,"location":2,"content":"And, um, uh, it's mostly going to be, uh,"},{"from":101.52,"to":102.89,"location":2,"content":"guided by the things which, uh,"},{"from":102.89,"to":106.04,"location":2,"content":"I've seen that I think are cool or interesting or exciting."},{"from":106.04,"to":108.14,"location":2,"content":"So it's by no means going to be comprehensive but"},{"from":108.14,"to":112.07,"location":2,"content":"I hope you're going to enjoy some of the stuff we're going to learn about."},{"from":112.07,"to":116.66,"location":2,"content":"Okay, so in particular we're going to start off by having a recap of what we"},{"from":116.66,"to":121.07,"location":2,"content":"already know about Natural Language Generation to make sure we're on the same page."},{"from":121.07,"to":124.23,"location":2,"content":"And we're also going to learn a little bit extra about decoding algorithms."},{"from":124.23,"to":125.87,"location":2,"content":"So we learned a bit before about, uh,"},{"from":125.87,"to":128.33,"location":2,"content":"greedy decoding and beam search decoding,"},{"from":128.33,"to":130.28,"location":2,"content":"but today we're going to learn some extra information about"},{"from":130.28,"to":133.09,"location":2,"content":"that and some other types of decoding algorithms."},{"from":133.09,"to":135.71,"location":2,"content":"After that we're going to go through, um,"},{"from":135.71,"to":137.57,"location":2,"content":"a pretty quick tour of lots of"},{"from":137.57,"to":141.86,"location":2,"content":"different NLG tasks and a selection of neural approaches to them."},{"from":141.86,"to":145.58,"location":2,"content":"And then after that we're gonna talk about probably the biggest problem in NLG research,"},{"from":145.58,"to":149.94,"location":2,"content":"which is NLG evaluation and why it is such a tricky situation."},{"from":149.94,"to":153.66,"location":2,"content":"And then lastly, we're going to have some concluding thoughts on NLG research."},{"from":153.66,"to":158.26,"location":2,"content":"What are the current trends and where are we going in the future?"},{"from":158.26,"to":167.88,"location":2,"content":"Okay. 
So, uh, section one, let's do a recap."},{"from":167.88,"to":171.44,"location":2,"content":"Okay, so Natural Language Generation to define it just"},{"from":171.44,"to":175.09,"location":2,"content":"refers to any setting in which we are generating some kind of text."},{"from":175.09,"to":178.38,"location":2,"content":"So for example, NLG is an important sub-component"},{"from":178.38,"to":181.4,"location":2,"content":"of lots of different tasks such as, uh, machine translation,"},{"from":181.4,"to":184.37,"location":2,"content":"we've already met, uh, abstractive summarization,"},{"from":184.37,"to":186.31,"location":2,"content":"we'll learn a bit more about that later, um,"},{"from":186.31,"to":189.76,"location":2,"content":"dialogue both chit-chat and task-based."},{"from":189.76,"to":195.16,"location":2,"content":"Uh, also creative writing tasks such as writing stories and writing poems even."},{"from":195.16,"to":197.42,"location":2,"content":"NLG is also a sub-component of,"},{"from":197.42,"to":199.61,"location":2,"content":"uh, free-form question answering."},{"from":199.61,"to":202.28,"location":2,"content":"So I know a lot of you are doing the SQuAD project right now, uh,"},{"from":202.28,"to":206.03,"location":2,"content":"that is not an NLG task because you're just extracting the answer from the,"},{"from":206.03,"to":207.25,"location":2,"content":"uh, the source document."},{"from":207.25,"to":209.51,"location":2,"content":"But there are other question answering tasks"},{"from":209.51,"to":211.78,"location":2,"content":"that do have a Natural Language Generation component."},{"from":211.78,"to":215.87,"location":2,"content":"Uh, image captioning is another example of,"},{"from":215.87,"to":218.81,"location":2,"content":"uh, a task that has an NLG sub-component."},{"from":218.81,"to":223.63,"location":2,"content":"So NLG is a pretty cool component of a lot of different NLP tasks."},{"from":223.63,"to":225.44,"location":2,"content":"All right, let's go into our recap."},{"from":225.44,"to":227.45,"location":2,"content":"So the first thing I want to recap is,"},{"from":227.45,"to":228.72,"location":2,"content":"uh, what is language modeling?"},{"from":228.72,"to":233.3,"location":2,"content":"Um, I've noticed that some people are a little bit confused about this, I think it, uh,"},{"from":233.3,"to":236.3,"location":2,"content":"might be because the name language modeling sounds like it might mean"},{"from":236.3,"to":240.63,"location":2,"content":"just simply encoding language like representing language using embeddings or something."},{"from":240.63,"to":242.93,"location":2,"content":"So as a reminder language modeling,"},{"from":242.93,"to":244.13,"location":2,"content":"uh, has a more precise meaning."},{"from":244.13,"to":249.25,"location":2,"content":"Language modeling is the task of predicting the next word given the words so far."},{"from":249.25,"to":251.3,"location":2,"content":"So any system which produces"},{"from":251.3,"to":256.42,"location":2,"content":"this conditional probability distribution that does this task is called a Language Model."},{"from":256.42,"to":258.64,"location":2,"content":"And if that language model,"},{"from":258.64,"to":260.02,"location":2,"content":"uh, system is an RNN,"},{"from":260.02,"to":264.35,"location":2,"content":"then we often abbreviate it as RNN-Language Model."},{"from":264.35,"to":267.39,"location":2,"content":"Okay, so I hope, uh, you'll remember that."},{"from":267.39,"to":269.06,"location":2,"content":"Uh, the next thing we're 
going to recap is do you"},{"from":269.06,"to":271.06,"location":2,"content":"remember what a Conditional Language Model is?"},{"from":271.06,"to":275.04,"location":2,"content":"Uh, the task of Conditional Language Modeling is when you're predicting, uh,"},{"from":275.04,"to":277.91,"location":2,"content":"what word's going to come next but you're also conditioning on"},{"from":277.91,"to":281.58,"location":2,"content":"some other input x as well as all of your words so far."},{"from":281.58,"to":285.64,"location":2,"content":"So to recap some examples of conditional language modeling include, uh,"},{"from":285.64,"to":289.14,"location":2,"content":"machine translation where you're conditioning on the source sentence x,"},{"from":289.14,"to":292.94,"location":2,"content":"uh, summarization you're conditioning on your input text that you're trying to summarize."},{"from":292.94,"to":297.68,"location":2,"content":"Dialogue, you're conditioning on your dialogue history and so on."},{"from":297.68,"to":303.45,"location":2,"content":"Okay, uh, next we're going to quickly recap how do you train an RNN-Language model?"},{"from":303.45,"to":307.76,"location":2,"content":"I guess, it could also be a transformer-based language model or a CNN-based language model,"},{"from":307.76,"to":311.09,"location":2,"content":"now that you know about those, uh, and it could be conditional or it could be not."},{"from":311.09,"to":315.62,"location":2,"content":"So the main thing I want to remind you about is that when you are training the system,"},{"from":315.62,"to":318.41,"location":2,"content":"then you feed in the target sequence that you're trying to"},{"from":318.41,"to":321.62,"location":2,"content":"generate so where it says target sentence from corpus, uh,"},{"from":321.62,"to":324.29,"location":2,"content":"that's saying that you have some sequence that you're trying to"},{"from":324.29,"to":327.79,"location":2,"content":"generate and you feed that into the decoder, the RNN-Language model."},{"from":327.79,"to":331.38,"location":2,"content":"And then it predicts what words are going to come next."},{"from":331.38,"to":335.01,"location":2,"content":"So the super important thing is that during training,"},{"from":335.01,"to":339.43,"location":2,"content":"we're feeding the gold, that is the reference target sentence into the decoder,"},{"from":339.43,"to":341.98,"location":2,"content":"regardless of what the decoder is predicting."},{"from":341.98,"to":346.75,"location":2,"content":"So even if let's say this is a very bad decoder that isn't predicting the correct words,"},{"from":346.75,"to":348.55,"location":2,"content":"uh, it's not, you know, predicting them high at all,"},{"from":348.55,"to":351.79,"location":2,"content":"um, that doesn't matter we still just, um,"},{"from":351.79,"to":355.6,"location":2,"content":"input the targets- the gold target sequence into the decoder."},{"from":355.6,"to":358.66,"location":2,"content":"And, um, I'm emphasizing this because it's going to come up later,"},{"from":358.66,"to":360.89,"location":2,"content":"uh, this training method is called Teacher Forcing."},{"from":360.89,"to":363.17,"location":2,"content":"Which might be a phrase that you have come across elsewhere."},{"from":363.17,"to":365.98,"location":2,"content":"So, yeah, it refers to the fact that the teacher,"},{"from":365.98,"to":369.16,"location":2,"content":"that is kind of like the gold input is- is forcing, uh,"},{"from":369.16,"to":371.26,"location":2,"content":"the language model to use that 
on every step"},{"from":371.26,"to":374.5,"location":2,"content":"instead of using its own predictions on each step."},{"from":374.5,"to":378.63,"location":2,"content":"So that's how you train a RNN-Language model which might be conditional."},{"from":378.63,"to":380.61,"location":2,"content":"Uh, okay."},{"from":380.61,"to":382.81,"location":2,"content":"So now a recap on decoding algorithms."},{"from":382.81,"to":386.94,"location":2,"content":"So, uh, you've got your trained language model which might be conditional."},{"from":386.94,"to":389.35,"location":2,"content":"The question is how do you use it to generate a text?"},{"from":389.35,"to":392,"location":2,"content":"So the answer is you need a decoding algorithm."},{"from":392,"to":394.64,"location":2,"content":"A decoding algorithm is an algorithm you use to"},{"from":394.64,"to":397.58,"location":2,"content":"generate the text from your trained language model."},{"from":397.58,"to":399.68,"location":2,"content":"So, uh, in the NMT lecture"},{"from":399.68,"to":402.47,"location":2,"content":"a few weeks ago we learned about two different decoding algorithms."},{"from":402.47,"to":405.62,"location":2,"content":"We learned about greedy decoding and beam search."},{"from":405.62,"to":407.8,"location":2,"content":"So let's quickly recap those."},{"from":407.8,"to":411.21,"location":2,"content":"Uh, greedy decoding is a pretty simple algorithm."},{"from":411.21,"to":413.03,"location":2,"content":"On each step you just take what's"},{"from":413.03,"to":415.89,"location":2,"content":"the most probable words according to the language model."},{"from":415.89,"to":419.25,"location":2,"content":"You could deal with the argmax and then use that as the next word,"},{"from":419.25,"to":421.34,"location":2,"content":"you feed it in as the input on the next step."},{"from":421.34,"to":423.86,"location":2,"content":"And you just keep going until you produce some kind of END"},{"from":423.86,"to":426.62,"location":2,"content":"token or maybe when you reach some maximum length."},{"from":426.62,"to":429.81,"location":2,"content":"And I think you're all quite familiar with this because you did it in assignment five."},{"from":429.81,"to":435.22,"location":2,"content":"So uh, yes this diagram shows how greedy decoding would work to generate the sentence."},{"from":435.22,"to":437.42,"location":2,"content":"So as we learned before,"},{"from":437.42,"to":439.61,"location":2,"content":"due to a kind of lack of backtracking and"},{"from":439.61,"to":442.13,"location":2,"content":"inability to go back if you made a wrong choice, uh,"},{"from":442.13,"to":445.05,"location":2,"content":"the output from greedy decoding is generally, uh,"},{"from":445.05,"to":450.28,"location":2,"content":"pretty poor like it can be ungrammatical, or it can be unnatural, kind of nonsensical."},{"from":450.28,"to":453.27,"location":2,"content":"Okay, let's recap beam search decoding."},{"from":453.27,"to":458.66,"location":2,"content":"So beam search is a search algorithm which aims to find a high probability sequence."},{"from":458.66,"to":463.61,"location":2,"content":"So if we're doing translation that sequence is the sequence of translation words,"},{"from":463.61,"to":468.2,"location":2,"content":"um, by tracking multiple possible sequences at once."},{"from":468.2,"to":471.98,"location":2,"content":"So the core idea is that on each step of the decoder,"},{"from":471.98,"to":473.06,"location":2,"content":"you're going to be keeping track 
of"},{"from":473.06,"to":477.51,"location":2,"content":"the K most probable partial sequences which we call hypotheses."},{"from":477.51,"to":481,"location":2,"content":"And here K is some hyper- hyper parameter called the beam size."},{"from":481,"to":482.57,"location":2,"content":"So the idea is by um,"},{"from":482.57,"to":485.87,"location":2,"content":"considering lots of different hypotheses we're going to try to search effectively for"},{"from":485.87,"to":487.79,"location":2,"content":"a high probability sequence but there is"},{"from":487.79,"to":490.29,"location":2,"content":"no guarantee that this is going to be the optimal,"},{"from":490.29,"to":492.87,"location":2,"content":"most high probability sequence."},{"from":492.87,"to":495.93,"location":2,"content":"So, uh, at the end of beam search, uh,"},{"from":495.93,"to":497.93,"location":2,"content":"you reach some kind of stopping criterion which we talked"},{"from":497.93,"to":500.3,"location":2,"content":"about before but I won't cover in detail again."},{"from":500.3,"to":502.61,"location":2,"content":"Uh, and once you've reached your stopping criterion,"},{"from":502.61,"to":505.18,"location":2,"content":"you choose the sequence with the highest probability,"},{"from":505.18,"to":509.5,"location":2,"content":"um, factoring in some adjustments for length and then that's your output."},{"from":509.5,"to":511.4,"location":2,"content":"So just to do this one more time."},{"from":511.4,"to":515.43,"location":2,"content":"Here's the diagram that we saw in the NMT lecture of beam search decoding um,"},{"from":515.43,"to":520.02,"location":2,"content":"once it's completed and in this scenario we have a beam size of two."},{"from":520.02,"to":523.18,"location":2,"content":"So this is what it looks like after we've done this exploration problem,"},{"from":523.18,"to":525.42,"location":2,"content":"this shows the full tree that we explored,"},{"from":525.42,"to":529.14,"location":2,"content":"and then we've come to some kind of stopping criterion and we identify the top,"},{"from":529.14,"to":530.48,"location":2,"content":"uh, hypothesis and, uh,"},{"from":530.48,"to":532.51,"location":2,"content":"that's highlighted in green."},{"from":532.51,"to":535.82,"location":2,"content":"So on the subject of beam search decoding,"},{"from":535.82,"to":537.81,"location":2,"content":"I was watching TV the other day,"},{"from":537.81,"to":540.71,"location":2,"content":"and I notice something in Westworld."},{"from":540.71,"to":546.02,"location":2,"content":"I think the hosts- [LAUGHTER] the AI hosts in Westworld maybe used beam search."},{"from":546.02,"to":548.84,"location":2,"content":"Which is something I wasn't expecting to see on TV."},{"from":548.84,"to":550.95,"location":2,"content":"[LAUGHTER] So there's this scene,"},{"from":550.95,"to":552.39,"location":2,"content":"uh, Westworld is, by the way,"},{"from":552.39,"to":554.24,"location":2,"content":"a sci-fi series that has these, um,"},{"from":554.24,"to":556.58,"location":2,"content":"very convincing humanoid AI systems."},{"from":556.58,"to":559.04,"location":2,"content":"Um, and there's a scene where one of"},{"from":559.04,"to":562.05,"location":2,"content":"the AI systems is confronted with the reality of the fact that,"},{"from":562.05,"to":563.98,"location":2,"content":"um, she, I suppose is,"},{"from":563.98,"to":569.85,"location":2,"content":"um, not human because she sees the generation system of words as she says them,"},{"from":569.85,"to":571.68,"location":2,"content":"and I 
was looking at the TV and I thought,"},{"from":571.68,"to":572.77,"location":2,"content":"is that beam search?"},{"from":572.77,"to":575.84,"location":2,"content":"Because that diagram looks a lot like this diagram here,"},{"from":575.84,"to":578.58,"location":2,"content":"um, but maybe with a bigger beam size."},{"from":578.58,"to":580.61,"location":2,"content":"So, I thought that was pretty cool because, you know,"},{"from":580.61,"to":583.49,"location":2,"content":"AI has hit the mainstream when you see beam search on TV."},{"from":583.49,"to":585.2,"location":2,"content":"And then if you zoom in really hard you can see"},{"from":585.2,"to":589.47,"location":2,"content":"some other exciting words in this screenshot like knowledge base,"},{"from":589.47,"to":591.17,"location":2,"content":"forward chaining and backward chaining,"},{"from":591.17,"to":594.2,"location":2,"content":"which is not the same thing as forward prop and backward prop,"},{"from":594.2,"to":597.18,"location":2,"content":"um, and also fuzzy logic algorithms and neural net."},{"from":597.18,"to":599.39,"location":2,"content":"Um, so yeah, beam search,"},{"from":599.39,"to":600.88,"location":2,"content":"I think, has hit the mainstream now,"},{"from":600.88,"to":604.1,"location":2,"content":"um, so it's good enough for Westworld,"},{"from":604.1,"to":605.06,"location":2,"content":"maybe it's good enough for us."},{"from":605.06,"to":608.5,"location":2,"content":"Uh, so with beam search, right?"},{"from":608.5,"to":612.05,"location":2,"content":"We've talked about how you have this hyperparameter k or the beam size."},{"from":612.05,"to":614.11,"location":2,"content":"And one thing we didn't talk about in the last lecture,"},{"from":614.11,"to":616.66,"location":2,"content":"so now we're leaving the recap portion, um,"},{"from":616.66,"to":620.98,"location":2,"content":"is what's the effect of changing that beam size k. So, uh,"},{"from":620.98,"to":622.48,"location":2,"content":"if you have a really small k,"},{"from":622.48,"to":626.07,"location":2,"content":"then you're gonna have similar problems to greedy decoding."},{"from":626.07,"to":627.37,"location":2,"content":"And in fact, if k equals one,"},{"from":627.37,"to":629.89,"location":2,"content":"then you are actually just doing greedy decoding."},{"from":629.89,"to":632.41,"location":2,"content":"So those same problems are, you know, ungrammatical,"},{"from":632.41,"to":636.8,"location":2,"content":"maybe unnatural, nonsensical, just kind of plain incorrect output."},{"from":636.8,"to":639.41,"location":2,"content":"So then, if we get a larger k,"},{"from":639.41,"to":641.3,"location":2,"content":"if you have a larger beam size,"},{"from":641.3,"to":646.75,"location":2,"content":"then you're doing your search algorithm but considering more hypotheses, right?"},{"from":646.75,"to":648.61,"location":2,"content":"You're, you're having a larger search space and"},{"from":648.61,"to":651.1,"location":2,"content":"you're considering more different possibilities."},{"from":651.1,"to":655.5,"location":2,"content":"So if you do that, then we often find that this reduces some of the problems above."},{"from":655.5,"to":658.54,"location":2,"content":"So you're much less likely to have this ungrammatical,"},{"from":658.54,"to":661.01,"location":2,"content":"uh, you know, disjointed output."},{"from":661.01,"to":664.93,"location":2,"content":"But there are some downsides to raising k. 
So of course,"},{"from":664.93,"to":666.97,"location":2,"content":"larger k is more computationally expensive"},{"from":666.97,"to":669.25,"location":2,"content":"and that can get pretty bad if you're trying to, um,"},{"from":669.25,"to":671.53,"location":2,"content":"for example, generate your, uh,"},{"from":671.53,"to":672.85,"location":2,"content":"outputs for a large, you know,"},{"from":672.85,"to":675.48,"location":2,"content":"test set of NMT examples."},{"from":675.48,"to":677.68,"location":2,"content":"Um, but more seriously than that,"},{"from":677.68,"to":679.87,"location":2,"content":"increasing k can introduce some other problems."},{"from":679.87,"to":683.25,"location":2,"content":"So for example, it's been shown that in NMT,"},{"from":683.25,"to":688.03,"location":2,"content":"increasing the beam size too much actually decreases the BLEU score."},{"from":688.03,"to":690.63,"location":2,"content":"And this is kind of counter-intuitive, right?"},{"from":690.63,"to":692.88,"location":2,"content":"Because we were thinking of beam search"},{"from":692.88,"to":695.41,"location":2,"content":"as this algorithm that tries to find the optimal solution."},{"from":695.41,"to":697.01,"location":2,"content":"So surely, if you increase k,"},{"from":697.01,"to":699.97,"location":2,"content":"then you're only going to find a better solution, right?"},{"from":699.97,"to":704.44,"location":2,"content":"Um, so I think maybe the key here is the difference between optimality"},{"from":704.44,"to":706.3,"location":2,"content":"in terms of the search problem that is finding"},{"from":706.3,"to":708.89,"location":2,"content":"a high probability sequence and BLEU score,"},{"from":708.89,"to":710.08,"location":2,"content":"which are two separate things,"},{"from":710.08,"to":714.31,"location":2,"content":"and there's no guarantee that they actually, um, correspond, right?"},{"from":714.31,"to":717.85,"location":2,"content":"And I mean, there's a difference, again, between BLEU score and actual translation,"},{"from":717.85,"to":719.44,"location":2,"content":"uh, quality as we know."},{"from":719.44,"to":721.72,"location":2,"content":"So if you look at the two papers which I've linked to"},{"from":721.72,"to":724.39,"location":2,"content":"here which are the ones that show that,"},{"from":724.39,"to":727.33,"location":2,"content":"uh, increasing beam size too much decreases the BLEU score."},{"from":727.33,"to":730.69,"location":2,"content":"They explain it by saying that the main reason why this"},{"from":730.69,"to":734.37,"location":2,"content":"happens is because when you increase the beam size too much,"},{"from":734.37,"to":738.37,"location":2,"content":"then you end up producing translations that are too short."},{"from":738.37,"to":742.72,"location":2,"content":"So I mean, that kind of explains it to a degree that translations are too short,"},{"from":742.72,"to":744.13,"location":2,"content":"therefore they have low BLEU because they're"},{"from":744.13,"to":746.23,"location":2,"content":"probably missing words that they should contain."},{"from":746.23,"to":749.86,"location":2,"content":"But the question is, why does large beam size gives you short translations?"},{"from":749.86,"to":751.21,"location":2,"content":"I think that's harder to answer."},{"from":751.21,"to":754.98,"location":2,"content":"Wherever, in these two papers, I didn't see an explicit explanation of why."},{"from":754.98,"to":757.92,"location":2,"content":"Um, I think it's possible larger kind of 
passing,"},{"from":757.92,"to":761.57,"location":2,"content":"we see sometimes with beam search which is when you really increase your, uh,"},{"from":761.57,"to":763.44,"location":2,"content":"search space and make the search much more"},{"from":763.44,"to":766.62,"location":2,"content":"powerful so that it can consider lots of different alternatives."},{"from":766.62,"to":769.62,"location":2,"content":"It can end up finding these high probability,"},{"from":769.62,"to":773.21,"location":2,"content":"um, sequences which aren't actually the thing that you want."},{"from":773.21,"to":775.26,"location":2,"content":"Sure, they're high probabili- probability"},{"from":775.26,"to":777.35,"location":2,"content":"but they're not actually the thing that you wanted."},{"from":777.35,"to":780.55,"location":2,"content":"Um, so another example of that is"},{"from":780.55,"to":783.63,"location":2,"content":"that in open-ended tasks like for example chit-chat dialogue"},{"from":783.63,"to":784.83,"location":2,"content":"where you're trying to just, um,"},{"from":784.83,"to":787.33,"location":2,"content":"say something interesting back to your conversational partner,"},{"from":787.33,"to":790.3,"location":2,"content":"if we use a beam search with a large beam size,"},{"from":790.3,"to":793.5,"location":2,"content":"we find that that can give you some output that is really generic."},{"from":793.5,"to":796.4,"location":2,"content":"Um, and I'll give you an example here to show you what I mean."},{"from":796.4,"to":800.54,"location":2,"content":"So these are examples from a chit-chat,"},{"from":800.54,"to":802.83,"location":2,"content":"uh, dialogue project that I was doing."},{"from":802.83,"to":804.19,"location":2,"content":"So here you've got, uh,"},{"from":804.19,"to":808.33,"location":2,"content":"your human chit-chat partner said something like I mostly eat a fresh and raw diet,"},{"from":808.33,"to":809.78,"location":2,"content":"so I save on groceries."},{"from":809.78,"to":814.03,"location":2,"content":"And then here's what the chat bot said back depending on the beam size."},{"from":814.03,"to":823.59,"location":2,"content":"I will let you read that."},{"from":823.59,"to":827.35,"location":2,"content":"So I would say that this is fairly characteristic of what you see"},{"from":827.35,"to":830.5,"location":2,"content":"happening when you raise and lower the beam size [NOISE]."},{"from":830.5,"to":831.96,"location":2,"content":"When you have a low beam size,"},{"from":831.96,"to":834.7,"location":2,"content":"um, it might be more kind of on topic."},{"from":834.7,"to":837.58,"location":2,"content":"Like here, we can see that eat healthy, eat healthy,"},{"from":837.58,"to":839.71,"location":2,"content":"I am a nurse so I do not eat raw food and so on,"},{"from":839.71,"to":842.34,"location":2,"content":"that kind of relates to what the user said,"},{"from":842.34,"to":844.15,"location":2,"content":"uh, but it's kind of bad English, right?"},{"from":844.15,"to":846.1,"location":2,"content":"There's some repetition and,"},{"from":846.1,"to":848.02,"location":2,"content":"uh, it doesn't always make that much sense, right?"},{"from":848.02,"to":849.58,"location":2,"content":"Um, [NOISE] but then,"},{"from":849.58,"to":850.88,"location":2,"content":"when you raise the beam size,"},{"from":850.88,"to":852.25,"location":2,"content":"then it kind of converges to"},{"from":852.25,"to":857.18,"location":2,"content":"a safe so-called correct response but it's kind of generic and less relevant, 
right?"},{"from":857.18,"to":859.6,"location":2,"content":"And it's kind of applicable in all scenarios, what do you do for a living."},{"from":859.6,"to":861.97,"location":2,"content":"Um, so the, the,"},{"from":861.97,"to":864.16,"location":2,"content":"the particular dataset I was using here is, uh,"},{"from":864.16,"to":865.24,"location":2,"content":"one called Persona-Chat,"},{"from":865.24,"to":866.44,"location":2,"content":"that I'll tell you more about later."},{"from":866.44,"to":868.24,"location":2,"content":"Um, but it's a,"},{"from":868.24,"to":871.32,"location":2,"content":"it's a chit-chat dialog dataset where each,"},{"from":871.32,"to":875.58,"location":2,"content":"uh, conv- conversational partner has a persona which is a set of traits."},{"from":875.58,"to":877.6,"location":2,"content":"Um, so the reason it keeps talking about being a nurse,"},{"from":877.6,"to":879.16,"location":2,"content":"I think is because it was in the persona."},{"from":879.16,"to":882.34,"location":2,"content":"[NOISE] But the main point here is that, um,"},{"from":882.34,"to":885.68,"location":2,"content":"we kind of have an unfortunate trade off with no,"},{"from":885.68,"to":888.8,"location":2,"content":"with no Goldilocks zone that's very obvious."},{"from":888.8,"to":890.41,"location":2,"content":"I mean, there's, there's a, yeah,"},{"from":890.41,"to":893.29,"location":2,"content":"kind of an unfortunate trade-off between having kind of bad,"},{"from":893.29,"to":896.68,"location":2,"content":"bad output, bad English and just having something very boring."},{"from":896.68,"to":901.32,"location":2,"content":"So this is one of the problems that we get with beam, beam search."},{"from":901.32,"to":903.79,"location":2,"content":"Okay. So we've talked about, uh,"},{"from":903.79,"to":906.45,"location":2,"content":"greedy decoding and beam search. Yes."},{"from":906.45,"to":913,"location":2,"content":"So beam size depending on the [inaudible]"},{"from":913,"to":914.05,"location":2,"content":"The question is, can we have"},{"from":914.05,"to":917.75,"location":2,"content":"an adaptive beam size dependent on the position that you're in?"},{"from":917.75,"to":919.06,"location":2,"content":"You mean like in the sequence?"},{"from":919.06,"to":926.04,"location":2,"content":"Yeah. That is in [inaudible]."},{"from":926.04,"to":929.22,"location":2,"content":"Yeah. I mean, I think I- I might have heard of a research paper that does that?"},{"from":929.22,"to":934.88,"location":2,"content":"That adaptively like raises the capacity of the, the hypothesis space."},{"from":934.88,"to":937.13,"location":2,"content":"I mean, it sounds awkward to implement, uh,"},{"from":937.13,"to":940.99,"location":2,"content":"because, you know, things fitting into a fixed space in your GPU."},{"from":940.99,"to":942.58,"location":2,"content":"Um, but I think that might be possible,"},{"from":942.58,"to":946.23,"location":2,"content":"I suppose you'd would have to learn the criterion on which you increase beam,"},{"from":946.23,"to":949.3,"location":2,"content":"beam size, yeah. Seems possible."},{"from":949.3,"to":951.63,"location":2,"content":"Okay. 
So we've talked about, uh,"},{"from":951.63,"to":953.37,"location":2,"content":"beam search and greedy decoding."},{"from":953.37,"to":955.99,"location":2,"content":"So here's a new family of decoding"},{"from":955.99,"to":959.1,"location":2,"content":"algorithms which are pretty simple, uh, sampling-based decoding."},{"from":959.1,"to":963.24,"location":2,"content":"So something which I'm calling pure sampling because I didn't know what else to call it."},{"from":963.24,"to":964.86,"location":2,"content":"Um, this is just the,"},{"from":964.86,"to":967.36,"location":2,"content":"the simple sampling method that says that on each, uh,"},{"from":967.36,"to":968.89,"location":2,"content":"timestep of your decoder t,"},{"from":968.89,"to":972.04,"location":2,"content":"you just want to randomly sample from the probability distribution,"},{"from":972.04,"to":973.78,"location":2,"content":"uh, to obtain your next word."},{"from":973.78,"to":975.49,"location":2,"content":"So this is very simple."},{"from":975.49,"to":977.34,"location":2,"content":"It's just like greedy decoding."},{"from":977.34,"to":979.28,"location":2,"content":"But instead of taking the top words,"},{"from":979.28,"to":982.35,"location":2,"content":"instead just sample from that distribution."},{"from":982.35,"to":988.6,"location":2,"content":"So the reason I call this pure sampling was to differentiate it from top-n sampling."},{"from":988.6,"to":990.64,"location":2,"content":"And again, this is actually usually called top-k"},{"from":990.64,"to":993.4,"location":2,"content":"sampling but I already called k the beam size,"},{"from":993.4,"to":996.34,"location":2,"content":"and I didn't want to be confusing, so I'm gonna call it top-n sampling for now."},{"from":996.34,"to":998.93,"location":2,"content":"Um, so the idea here is also pretty simple."},{"from":998.93,"to":1000.59,"location":2,"content":"On each step t,"},{"from":1000.59,"to":1004.03,"location":2,"content":"you want to randomly sample from your probability distribution but"},{"from":1004.03,"to":1008.26,"location":2,"content":"you're gonna restrict to just the top n most probable words."},{"from":1008.26,"to":1010.18,"location":2,"content":"So this is saying that it's,"},{"from":1010.18,"to":1011.43,"location":2,"content":"it's like the simple, you know,"},{"from":1011.43,"to":1016.51,"location":2,"content":"pure sampling method but you want to truncate your probability distribution just to be,"},{"from":1016.51,"to":1019.02,"location":2,"content":"you know, the, the top most probable words."},{"from":1019.02,"to":1023.14,"location":2,"content":"So, uh, the idea here kind of like how beam search, um,"},{"from":1023.14,"to":1026.61,"location":2,"content":"gave you a hyperparameter is kind of go between greedy decoding and,"},{"from":1026.61,"to":1028.93,"location":2,"content":"you know, uh, a very exhaustive search."},{"from":1028.93,"to":1032.03,"location":2,"content":"In the same way here, you've got a hyperparameter n"},{"from":1032.03,"to":1035.34,"location":2,"content":"which can take you between greedy search and pure sampling."},{"from":1035.34,"to":1036.63,"location":2,"content":"If you think about this for a moment,"},{"from":1036.63,"to":1039.15,"location":2,"content":"if n is one, then you would truncate it the top one."},{"from":1039.15,"to":1041.09,"location":2,"content":"So you're just taking arg max which is greedy."},{"from":1041.09,"to":1042.66,"location":2,"content":"And if n is vocab 
size,"},{"from":1042.66,"to":1044.09,"location":2,"content":"then you don't truncate it at all."},{"from":1044.09,"to":1045.51,"location":2,"content":"You're sampling from everything,"},{"from":1045.51,"to":1047.79,"location":2,"content":"that's just the pure sampling method."},{"from":1047.79,"to":1051,"location":2,"content":"So here, um, it should be clear, I hope,"},{"from":1051,"to":1053.71,"location":2,"content":"if you think about that if you increase n,"},{"from":1053.71,"to":1056.91,"location":2,"content":"then you're gonna get more diverse and risky output, right?"},{"from":1056.91,"to":1059.23,"location":2,"content":"Because you're, uh, giving it more,"},{"from":1059.23,"to":1062.76,"location":2,"content":"more to choose from and you're going lower into the probability distribution,"},{"from":1062.76,"to":1064.77,"location":2,"content":"going lower into less likely things."},{"from":1064.77,"to":1066.27,"location":2,"content":"And then, if you decrease n,"},{"from":1066.27,"to":1068.58,"location":2,"content":"then you're gonna get more kind of generic safe output because you're"},{"from":1068.58,"to":1073.46,"location":2,"content":"restricting more to the most high probability options."},{"from":1073.46,"to":1076.44,"location":2,"content":"So both of these are more efficient than"},{"from":1076.44,"to":1078.63,"location":2,"content":"beam search which I think is something important to note,"},{"from":1078.63,"to":1082.42,"location":2,"content":"uh, because there are no multiple hypotheses to track, right?"},{"from":1082.42,"to":1084.73,"location":2,"content":"Because in beam search, on every step t of the decoder,"},{"from":1084.73,"to":1086.12,"location":2,"content":"you've got k different, you know,"},{"from":1086.12,"to":1088.77,"location":2,"content":"beam size, many hypotheses to track."},{"from":1088.77,"to":1091.56,"location":2,"content":"Uh, whereas here, at least if you're only generating one sample,"},{"from":1091.56,"to":1092.76,"location":2,"content":"there's only one thing to track."},{"from":1092.76,"to":1094.44,"location":2,"content":"So it, it's a very simple algorithm."},{"from":1094.44,"to":1101.2,"location":2,"content":"So that is one advantage of these sampling-based algorithms over beam search."},{"from":1101.2,"to":1105.56,"location":2,"content":"Okay. 
So, the last thing I want to tell you that's kind of related to decoding is,"},{"from":1105.56,"to":1107.16,"location":2,"content":"uh, softmax [NOISE] temperature."},{"from":1107.16,"to":1110.93,"location":2,"content":"So, if you recall on timestep t of your decoder,"},{"from":1110.93,"to":1114.59,"location":2,"content":"your language model computes some kind of probability distribution P_t, uh,"},{"from":1114.59,"to":1119.03,"location":2,"content":"by applying the softmax function to a vector of scores that you got from somewhere."},{"from":1119.03,"to":1122.73,"location":2,"content":"Like from your transformer or from your RNN or something."},{"from":1122.73,"to":1124.67,"location":2,"content":"So, there's the softmax function again."},{"from":1124.67,"to":1127.58,"location":2,"content":"It's saying that the probability of a word W is this softmax function,"},{"from":1127.58,"to":1130.12,"location":2,"content":"uh, given, given the scores."},{"from":1130.12,"to":1135.08,"location":2,"content":"So, the idea here of a temperature on the softmax is that you have some kind of"},{"from":1135.08,"to":1141.2,"location":2,"content":"temperature hyperparameter tau and you're going to apply that to this, uh, softmax."},{"from":1141.2,"to":1144.92,"location":2,"content":"So, all that we're doing is we're div- dividing all of the scores,"},{"from":1144.92,"to":1146.38,"location":2,"content":"or logits you might call them,"},{"from":1146.38,"to":1148.57,"location":2,"content":"by the temperature hyperparameter."},{"from":1148.57,"to":1150.55,"location":2,"content":"So again, if you just think about this a little bit,"},{"from":1150.55,"to":1152.57,"location":2,"content":"you'll see that raising the temperature,"},{"from":1152.57,"to":1153.8,"location":2,"content":"that is increasing, uh,"},{"from":1153.8,"to":1159.93,"location":2,"content":"the hyperparameter, this is going to make your probability distribution more uniform."},{"from":1159.93,"to":1163.41,"location":2,"content":"And this kind of comes down to the question about when you,"},{"from":1163.41,"to":1165.83,"location":2,"content":"when you multiply all of your scores by a constant,"},{"from":1165.83,"to":1168.98,"location":2,"content":"um, how does that affect the softmax, right?"},{"from":1168.98,"to":1173.88,"location":2,"content":"So, do things get more far apart or less far apart once you take the exponential?"},{"from":1173.88,"to":1176.69,"location":2,"content":"So, this is something you can just work out by yourself on paper,"},{"from":1176.69,"to":1178.52,"location":2,"content":"but as a, uh,"},{"from":1178.52,"to":1180.13,"location":2,"content":"a kind of a memory shortcut,"},{"from":1180.13,"to":1183.5,"location":2,"content":"a good way to think about it is that if you raise the temperature,"},{"from":1183.5,"to":1187.49,"location":2,"content":"then the distribution kind of melts and goes soft and mushy and uniform."},{"from":1187.49,"to":1188.81,"location":2,"content":"And if you, uh,"},{"from":1188.81,"to":1191.15,"location":2,"content":"lower the temperature, like make it cold then,"},{"from":1191.15,"to":1194.69,"location":2,"content":"the probability distribution becomes more spiky, right?"},{"from":1194.69,"to":1199.12,"location":2,"content":"So, like the things which are rated as high probability become like even more,"},{"from":1199.12,"to":1202.67,"location":2,"content":"uh, disproportionately high probability compared to the other things."},{"from":1202.67,"to":1205.54,"location":2,"content":"Um, I think that's an easy 
way to remember it."},{"from":1205.54,"to":1207.93,"location":2,"content":"Today I had to work it out on paper and then, uh,"},{"from":1207.93,"to":1209.13,"location":2,"content":"I realized that just the, the,"},{"from":1209.13,"to":1212.38,"location":2,"content":"the temperature visualization thing usually gets me there quicker."},{"from":1212.38,"to":1218.48,"location":2,"content":"So, um, one thing I want to note is that softmax temperature is not a decoding algorithm."},{"from":1218.48,"to":1221.12,"location":2,"content":"I know that I put it in the decoding algorithm section,"},{"from":1221.12,"to":1223.71,"location":2,"content":"uh, that was just because it's kind of a thing, a"},{"from":1223.71,"to":1229.88,"location":2,"content":"simple thing that you can do at test time to change how the decoding happens, right?"},{"from":1229.88,"to":1231.32,"location":2,"content":"You don't need to train, uh,"},{"from":1231.32,"to":1233.77,"location":2,"content":"with the, the softmax temperature."},{"from":1233.77,"to":1236.22,"location":2,"content":"So, it's not a decoding algorithm itself."},{"from":1236.22,"to":1238.41,"location":2,"content":"It's a technique that you can apply at test time"},{"from":1238.41,"to":1241.04,"location":2,"content":"in conjunction with a decoding algorithm."},{"from":1241.04,"to":1244.38,"location":2,"content":"So, for example, if you're doing beam search or you're doing some kind of sampling,"},{"from":1244.38,"to":1248.62,"location":2,"content":"then you can also apply a softmax temperature, um, to change,"},{"from":1248.62,"to":1255.22,"location":2,"content":"you know, this kind of risky versus safe, um, trade-off."},{"from":1255.22,"to":1263.06,"location":2,"content":"Any questions on this? Okay. So, here's"},{"from":1263.06,"to":1266.27,"location":2,"content":"a summary of what we just learned about decoding algorithms."},{"from":1266.27,"to":1269.38,"location":2,"content":"Um, Greedy decoding is a simple method."},{"from":1269.38,"to":1274.27,"location":2,"content":"It gives kind of low quality output in comparison to the others, at least beam search."},{"from":1274.27,"to":1277.16,"location":2,"content":"Beam search, especially when you've got a high beam size, uh,"},{"from":1277.16,"to":1280.95,"location":2,"content":"it searches through lots of different hypotheses for high-probability outputs."},{"from":1280.95,"to":1284.12,"location":2,"content":"And this generally is gonna deliver better quality than greedy search, uh,"},{"from":1284.12,"to":1286.73,"location":2,"content":"but if the beam size is too high, then you can have these,"},{"from":1286.73,"to":1289.38,"location":2,"content":"uh, kind of counter-intuitive problems we talked about before."},{"from":1289.38,"to":1292.87,"location":2,"content":"Where you've retrieved some kind of high-probability but unsuitable output."},{"from":1292.87,"to":1295.41,"location":2,"content":"Say, something is too generic or something is too short."},{"from":1295.41,"to":1297.29,"location":2,"content":"And we're gonna talk about that more later."},{"from":1297.29,"to":1301.22,"location":2,"content":"Uh, sampling methods are a way to get more diversity,"},{"from":1301.22,"to":1303.1,"location":2,"content":"uh, via, via randomness."},{"from":1303.1,"to":1306.38,"location":2,"content":"Uh, well, getting randomness might be your goal in itself."},{"from":1306.38,"to":1309.48,"location":2,"content":"Um, so, this is good if you want to have some kind of, for 
example,"},{"from":1309.48,"to":1311.93,"location":2,"content":"open-ended or creative generation setting like,"},{"from":1311.93,"to":1313.91,"location":2,"content":"uh, generating poetry or stories,"},{"from":1313.91,"to":1316.37,"location":2,"content":"then sampling is probably a better idea than"},{"from":1316.37,"to":1319.7,"location":2,"content":"beam search because you want to have a kind of source of randomness to,"},{"from":1319.7,"to":1322.16,"location":2,"content":"uh, write different things creatively."},{"from":1322.16,"to":1327.17,"location":2,"content":"And top-n sampling allows you to control the diversity by,"},{"from":1327.17,"to":1329.33,"location":2,"content":"uh, changing n. And then lastly,"},{"from":1329.33,"to":1331.61,"location":2,"content":"softmax temperature is another way to control diversity."},{"from":1331.61,"to":1334.52,"location":2,"content":"So there's quite a few different knobs you can turn here."},{"from":1334.52,"to":1336.26,"location":2,"content":"And it's not a decoding algorithm,"},{"from":1336.26,"to":1340.19,"location":2,"content":"it's just a technique that you can apply alongside any decoding algorithm."},{"from":1340.19,"to":1342.83,"location":2,"content":"Although it wouldn't make sense to apply it with"},{"from":1342.83,"to":1346.37,"location":2,"content":"greedy decoding because even if you make it more spiky or more flat,"},{"from":1346.37,"to":1351.4,"location":2,"content":"the argmax is still the argmax, so it doesn't make sense."},{"from":1351.4,"to":1354.35,"location":2,"content":"Okay. Cool. I'm going to move on to section two."},{"from":1354.35,"to":1359.13,"location":2,"content":"So, uh, section two is NLG tasks and neural approaches to them."},{"from":1359.13,"to":1362.33,"location":2,"content":"Uh, as mentioned before, this is not going to be an overview of all of NLG."},{"from":1362.33,"to":1363.62,"location":2,"content":"That will be quite impossible."},{"from":1363.62,"to":1365.19,"location":2,"content":"This is gonna be some selected highlights."},{"from":1365.19,"to":1367.49,"location":2,"content":"So, in particular, I'm gonna start off with"},{"from":1367.49,"to":1371.27,"location":2,"content":"a fairly deep dive into a particular NLG task that I'm a bit more familiar with,"},{"from":1371.27,"to":1373.25,"location":2,"content":"and that is, uh, summarization."},{"from":1373.25,"to":1377.66,"location":2,"content":"So, let's start off with a task definition for summarization."},{"from":1377.66,"to":1382.33,"location":2,"content":"Um, one sensible definition would be: Given some kind of input text x,"},{"from":1382.33,"to":1384.89,"location":2,"content":"you want to write a summary y which is shorter than"},{"from":1384.89,"to":1387.83,"location":2,"content":"x and contains the main information of x."},{"from":1387.83,"to":1391.36,"location":2,"content":"So, summarization can be single-document or multi-document."},{"from":1391.36,"to":1396.51,"location":2,"content":"Uh, single-document means that you just have a summary y of a single document x."},{"from":1396.51,"to":1400.04,"location":2,"content":"In multi-document summarization, you're saying that you want to write"},{"from":1400.04,"to":1404.39,"location":2,"content":"a single summary y of multiple documents x_1 up to x_n."},{"from":1404.39,"to":1408.98,"location":2,"content":"And here typically x_1 up to x_n will have some kind of overlapping content."},{"from":1408.98,"to":1412.04,"location":2,"content":"So, for example, they might all be different news 
articles"},{"from":1412.04,"to":1415.22,"location":2,"content":"from different newspapers about the same event, right?"},{"from":1415.22,"to":1419.03,"location":2,"content":"Because it kind of makes sense to write a single summary that draws from all of those."},{"from":1419.03,"to":1425.92,"location":2,"content":"Um, makes less sense to summarize things that are about different topics."},{"from":1425.92,"to":1428.02,"location":2,"content":"There is further, uh,"},{"from":1428.02,"to":1431.27,"location":2,"content":"subdivision of, uh, task definitions in, in summarization."},{"from":1431.27,"to":1433.84,"location":2,"content":"So, I'm gonna describe it via some datasets."},{"from":1433.84,"to":1438.45,"location":2,"content":"Uh, here are some different really common datasets especially in, uh,"},{"from":1438.45,"to":1441.8,"location":2,"content":"neural summarization, um, and they kind of correspond to different,"},{"from":1441.8,"to":1444.04,"location":2,"content":"like, lengths and different styles of text."},{"from":1444.04,"to":1445.43,"location":2,"content":"So, a common one is,"},{"from":1445.43,"to":1447.05,"location":2,"content":"uh, the Gigaword dataset."},{"from":1447.05,"to":1449.36,"location":2,"content":"And the task here is that you want to map from"},{"from":1449.36,"to":1453.71,"location":2,"content":"the first one or two sentences of a news article to write the headline."},{"from":1453.71,"to":1456.29,"location":2,"content":"[NOISE] And you could think of this as sentence compression,"},{"from":1456.29,"to":1459.14,"location":2,"content":"especially if it's kind of one sentence to headline because you're going from"},{"from":1459.14,"to":1462.71,"location":2,"content":"a longish sentence to a shortish headline style sentence."},{"from":1462.71,"to":1466.95,"location":2,"content":"Uh, next one that I, um,"},{"from":1466.95,"to":1469.13,"location":2,"content":"wanted to tell you about is this, uh,"},{"from":1469.13,"to":1471.32,"location":2,"content":"it's a Chinese summarization dataset but I,"},{"from":1471.32,"to":1473.69,"location":2,"content":"I see people using it a lot."},{"from":1473.69,"to":1476.48,"location":2,"content":"And it's, uh, from a micro-blogging,"},{"from":1476.48,"to":1479.94,"location":2,"content":"um, website where people write summaries of their posts."},{"from":1479.94,"to":1482.27,"location":2,"content":"So, the actual summarization task is"},{"from":1482.27,"to":1484.79,"location":2,"content":"you've got some paragraph of text and then you want to,"},{"from":1484.79,"to":1486.23,"location":2,"content":"uh, summarize that into,"},{"from":1486.23,"to":1488.18,"location":2,"content":"I think, a single sentence summary."},{"from":1488.18,"to":1491.12,"location":2,"content":"Uh, another one, uh, two actually,"},{"from":1491.12,"to":1495.65,"location":2,"content":"are the New York Times and CNN/Daily Mail, uh, datasets."},{"from":1495.65,"to":1497.18,"location":2,"content":"So, these ones are both of the form,"},{"from":1497.18,"to":1499.94,"location":2,"content":"you've got a whole news article which is actually pretty long like"},{"from":1499.94,"to":1503.69,"location":2,"content":"hun-hundreds of words and then you want to summarize that into,"},{"from":1503.69,"to":1506.84,"location":2,"content":"uh, like, maybe a single-sentence or multi-sentence summary."},{"from":1506.84,"to":1510.56,"location":2,"content":"Uh, The New York Times ones are written by, I think, uh,"},{"from":1510.56,"to":1513.13,"location":2,"content":"librarians or people who, 
who,"},{"from":1513.13,"to":1516.44,"location":2,"content":"um, write summaries for, for library purposes."},{"from":1516.44,"to":1518.88,"location":2,"content":"Uh, and then, uh,"},{"from":1518.88,"to":1522.37,"location":2,"content":"one I just spotted today when I was writing this list is there's a new,"},{"from":1522.37,"to":1525.85,"location":2,"content":"fairly new like last six months dataset from wikiHow."},{"from":1525.85,"to":1527.84,"location":2,"content":"So, from what I can tell this seems to be,"},{"from":1527.84,"to":1531.95,"location":2,"content":"you've got a full how-to-article from wikiHow and then you want to boil this down to"},{"from":1531.95,"to":1534.2,"location":2,"content":"the summary sentences which are kind of cleverly"},{"from":1534.2,"to":1537.18,"location":2,"content":"extracted from throughout the wikiHow article."},{"from":1537.18,"to":1538.79,"location":2,"content":"They are kind of like headings."},{"from":1538.79,"to":1542.39,"location":2,"content":"So, um, I looked at this paper and it seems that, um,"},{"from":1542.39,"to":1545.84,"location":2,"content":"this is kind of interesting because it's a different type of text."},{"from":1545.84,"to":1548.99,"location":2,"content":"As you might have noticed most of the other ones are news-based and this is,"},{"from":1548.99,"to":1551.83,"location":2,"content":"uh, not, so that kind of poses different challenges."},{"from":1551.83,"to":1557.36,"location":2,"content":"Uh, another kind of division of summarization is sentence simplification."},{"from":1557.36,"to":1560.69,"location":2,"content":"So, this is a related but actually different task."},{"from":1560.69,"to":1564.41,"location":2,"content":"In summarization, you want to write something which is shorter and contains"},{"from":1564.41,"to":1568.22,"location":2,"content":"main information but is still maybe written in just as complex language,"},{"from":1568.22,"to":1573.79,"location":2,"content":"whereas in sentence simplification you want to rewrite the source text using simpler,"},{"from":1573.79,"to":1575.42,"location":2,"content":"uh, simpler language, right?"},{"from":1575.42,"to":1578.77,"location":2,"content":"So, like simpler word choices and simpler sentence structure."},{"from":1578.77,"to":1581.24,"location":2,"content":"That might mean it's shorter but not necessarily."},{"from":1581.24,"to":1582.89,"location":2,"content":"So, for example, uh,"},{"from":1582.89,"to":1585.95,"location":2,"content":"simple Wiki- Wikipedia is a standard dataset for this."},{"from":1585.95,"to":1588.47,"location":2,"content":"And the idea is you've got, um, you know,"},{"from":1588.47,"to":1591.44,"location":2,"content":"standard Wikipedia and you've got a simple Wikipedia version."},{"from":1591.44,"to":1592.55,"location":2,"content":"And they mostly align up,"},{"from":1592.55,"to":1593.96,"location":2,"content":"so you want to map from"},{"from":1593.96,"to":1597.37,"location":2,"content":"some sentence in one to the equivalent sentence in the [NOISE] other."},{"from":1597.37,"to":1601.88,"location":2,"content":"Another source of data for this is Newsela which is a website that,"},{"from":1601.88,"to":1604.09,"location":2,"content":"uh, rewrites news for children."},{"from":1604.09,"to":1606.32,"location":2,"content":"Actually, at different learning levels I think."},{"from":1606.32,"to":1610.18,"location":2,"content":"So, you have multiple options for how much it's simplified."},{"from":1610.18,"to":1614.69,"location":2,"content":"Okay. 
So, um, so"},{"from":1614.69,"to":1619.09,"location":2,"content":"that's the definition or the many definitions of summarization as different tasks."},{"from":1619.09,"to":1620.81,"location":2,"content":"So, now I'm gonna give an overview of, like,"},{"from":1620.81,"to":1622.19,"location":2,"content":"what are the main, uh,"},{"from":1622.19,"to":1624.1,"location":2,"content":"techniques for doing summarization."},{"from":1624.1,"to":1626.39,"location":2,"content":"So, there's two main strategies for summarization."},{"from":1626.39,"to":1630.56,"location":2,"content":"Uh, you can call them extractive summarization and abstractive summarization."},{"from":1630.56,"to":1632.73,"location":2,"content":"And the main idea as I had hinted out earlier,"},{"from":1632.73,"to":1635.72,"location":2,"content":"is that in extractive summarization you're just selecting"},{"from":1635.72,"to":1639.05,"location":2,"content":"parts of the original texts to form a summary."},{"from":1639.05,"to":1642.77,"location":2,"content":"And often this will be whole sentences but maybe it'll be more granular than that;"},{"from":1642.77,"to":1644.83,"location":2,"content":"maybe, uh, phrases or words."},{"from":1644.83,"to":1647.36,"location":2,"content":"Whereas abstractive summarization, you're going to be"},{"from":1647.36,"to":1651.28,"location":2,"content":"generating some new text using NLG techniques."},{"from":1651.28,"to":1653.84,"location":2,"content":"So the idea is that it's, you know, generation from scratch."},{"from":1653.84,"to":1657.59,"location":2,"content":"And my visual metaphor for this is this kind of like the difference between highlighting"},{"from":1657.59,"to":1663.1,"location":2,"content":"the parts with a highlighter or writing the summary yourself with a pen."},{"from":1663.1,"to":1667.16,"location":2,"content":"I think the high level things to know about these two techniques are that"},{"from":1667.16,"to":1670.61,"location":2,"content":"extractive summarization is basically easier,"},{"from":1670.61,"to":1672.72,"location":2,"content":"at least to make a decent system to start,"},{"from":1672.72,"to":1677.12,"location":2,"content":"because selecting things is probably easier than writing text from scratch."},{"from":1677.12,"to":1680.94,"location":2,"content":"Um, but extractive summarization is pretty restrictive, right?"},{"from":1680.94,"to":1682.76,"location":2,"content":"Because you can't really paraphrase anything,"},{"from":1682.76,"to":1685.43,"location":2,"content":"you can't really do any powerful sentence compression"},{"from":1685.43,"to":1688.47,"location":2,"content":"if you can only just select sentences."},{"from":1688.47,"to":1692.19,"location":2,"content":"Um, and, of course, abstractive summarization as a paradigm"},{"from":1692.19,"to":1695.64,"location":2,"content":"is more flexible and it's more how humans might summarize,"},{"from":1695.64,"to":1698.15,"location":2,"content":"uh, but as noted it's pretty difficult."},{"from":1698.15,"to":1703.84,"location":2,"content":"So, I'm gonna give you a very quick view of what pre-neural summarization looks like."},{"from":1703.84,"to":1704.94,"location":2,"content":"And here we've got, uh,"},{"from":1704.94,"to":1706.7,"location":2,"content":"this is a diagram from the, uh,"},{"from":1706.7,"to":1708.8,"location":2,"content":"Speech and Language Processing book."},{"from":1708.8,"to":1713.12,"location":2,"content":"So, uh, pre-neural summarization systems were mostly 
extractive."},{"from":1713.12,"to":1715.37,"location":2,"content":"And like pre-neural NMT,"},{"from":1715.37,"to":1717.13,"location":2,"content":"which we learnt about in the NMT lecture,"},{"from":1717.13,"to":1720.39,"location":2,"content":"it typically had a pipeline which is what this picture is showing."},{"from":1720.39,"to":1723.07,"location":2,"content":"So, a typical pipeline might have three parts."},{"from":1723.07,"to":1726.17,"location":2,"content":"First, you have content selection which is, uh,"},{"from":1726.17,"to":1729.79,"location":2,"content":"essentially choosing some of the sentences from the source document to include."},{"from":1729.79,"to":1732.15,"location":2,"content":"And then secondly, you're going to do some kind of information"},{"from":1732.15,"to":1736.05,"location":2,"content":"ordering which means choosing what order should I put these sentences in."},{"from":1736.05,"to":1739.75,"location":2,"content":"And this is particularly a more nontrivial question if you were"},{"from":1739.75,"to":1741.58,"location":2,"content":"doing multiple document summarization"},{"from":1741.58,"to":1743.56,"location":2,"content":"because your sentences might come from different documents."},{"from":1743.56,"to":1745.06,"location":2,"content":"Uh, and then lastly,"},{"from":1745.06,"to":1748.26,"location":2,"content":"you're going to do a sentence realization that is actually, um,"},{"from":1748.26,"to":1752.13,"location":2,"content":"turning your selected sentences into your actual summary."},{"from":1752.13,"to":1753.68,"location":2,"content":"So, although we're not doing, kind of,"},{"from":1753.68,"to":1755.83,"location":2,"content":"free-form text generation, uh,"},{"from":1755.83,"to":1759.29,"location":2,"content":"there might be some kind of editing for example like, uh, simplifying, editing,"},{"from":1759.29,"to":1761.88,"location":2,"content":"or removing parts that are redundant,"},{"from":1761.88,"to":1763.87,"location":2,"content":"or fixing continuity issues."},{"from":1763.87,"to":1766.22,"location":2,"content":"So for example, you can't refer to"},{"from":1766.22,"to":1768.92,"location":2,"content":"a person as she if you never introduced them in the first place."},{"from":1768.92,"to":1773.18,"location":2,"content":"So maybe you need to change that she to the name of the person."},{"from":1773.18,"to":1775.89,"location":2,"content":"So in particular [NOISE] uh,"},{"from":1775.89,"to":1777.94,"location":2,"content":"these pre-neural summarization systems, uh,"},{"from":1777.94,"to":1781.23,"location":2,"content":"have some pretty sophisticated algorithms of content selection."},{"from":1781.23,"to":1783.45,"location":2,"content":"Um, so, for example,"},{"from":1783.45,"to":1786.24,"location":2,"content":"uh, you would have some sentence scoring functions."},{"from":1786.24,"to":1788.14,"location":2,"content":"This is the most simple, uh, way you might do it,"},{"from":1788.14,"to":1790.77,"location":2,"content":"is you might score all of the sentences individually"},{"from":1790.77,"to":1793.62,"location":2,"content":"and you could score them based on features such as,"},{"from":1793.62,"to":1796.65,"location":2,"content":"um, are there, you know, topic keywords in the sentence?"},{"from":1796.65,"to":1799.38,"location":2,"content":"If so, maybe it's an important sentence that we should include."},{"from":1799.38,"to":1802.72,"location":2,"content":"Um, and you could compute those, uh,"},{"from":1802.72,"to":1806.76,"location":2,"content":"keywords using, 
uh, statistics such as tf-idf for example."},{"from":1806.76,"to":1810.96,"location":2,"content":"[NOISE] You can also use pretty basic but powerful features such as,"},{"from":1810.96,"to":1812.92,"location":2,"content":"uh, where does the sentence appear in the document?"},{"from":1812.92,"to":1814.26,"location":2,"content":"If it's near the top of the document,"},{"from":1814.26,"to":1816.51,"location":2,"content":"then it's more likely to be important."},{"from":1816.51,"to":1818.1,"location":2,"content":"Uh, there are also"},{"from":1818.1,"to":1821.91,"location":2,"content":"some more complex content selection algorithms such as for example, uh,"},{"from":1821.91,"to":1825.42,"location":2,"content":"there are these graph-based algorithms which kind of view the document as"},{"from":1825.42,"to":1829.01,"location":2,"content":"a set of sentences and those sentences are the nodes of the graph,"},{"from":1829.01,"to":1830.76,"location":2,"content":"and you imagine that all sentences, er,"},{"from":1830.76,"to":1833.19,"location":2,"content":"sentence pairs have an edge between them,"},{"from":1833.19,"to":1836.76,"location":2,"content":"and the weight of the edge is kind of how similar the sentences are."},{"from":1836.76,"to":1839.92,"location":2,"content":"So, then, if you think about the graph in that sense,"},{"from":1839.92,"to":1843.6,"location":2,"content":"then now you can try to identify which sentences are"},{"from":1843.6,"to":1847.5,"location":2,"content":"important by finding which sentences are central in the graph."},{"from":1847.5,"to":1849.54,"location":2,"content":"So you can apply some kind of general purpose"},{"from":1849.54,"to":1852.93,"location":2,"content":"gla- graph algorithms to figure out which [NOISE] nodes are central,"},{"from":1852.93,"to":1856.34,"location":2,"content":"and this is a way to find central sentences."},{"from":1856.34,"to":1863.36,"location":2,"content":"Okay. 
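To make that graph-based content selection idea concrete, here is a toy sketch. It assumes plain bag-of-words cosine similarity for the edge weights and weighted degree for centrality; published pre-neural systems typically use tf-idf vectors and PageRank-style centrality instead, so treat this as illustrative only.

```python
from collections import Counter
from math import sqrt

def central_sentences(sentences, top=3):
    """Toy graph-based content selection as described above: sentences
    are nodes, edge weights are cosine similarity of bag-of-words
    vectors, and a sentence's centrality is its total edge weight."""
    bags = [Counter(s.lower().split()) for s in sentences]

    def cosine(a, b):
        num = sum(a[w] * b[w] for w in a if w in b)
        den = sqrt(sum(v * v for v in a.values())) * sqrt(sum(v * v for v in b.values()))
        return num / den if den else 0.0

    centrality = [
        sum(cosine(bags[i], bags[j]) for j in range(len(bags)) if j != i)
        for i in range(len(bags))
    ]
    ranked = sorted(range(len(sentences)), key=lambda i: -centrality[i])
    return [sentences[i] for i in sorted(ranked[:top])]  # keep document order
```

The `sorted(ranked[:top])` at the end keeps the selected sentences in their original document order, a crude stand-in for the information-ordering stage of the pipeline.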
So um, [NOISE] back to summarization as a task."},{"from":1863.36,"to":1866.94,"location":2,"content":"Um, we've, I can't remember if we've talked about ROUGE already."},{"from":1866.94,"to":1868.14,"location":2,"content":"We've certainly talked about BLEU."},{"from":1868.14,"to":1869.82,"location":2,"content":"But I'm gonna tell you about ROUGE now which is"},{"from":1869.82,"to":1872.4,"location":2,"content":"the main automatic metric for summarization."},{"from":1872.4,"to":1877.69,"location":2,"content":"So ROUGE stands for Recall-Oriented Understudy for Gisting Evaluation."},{"from":1877.69,"to":1879.48,"location":2,"content":"I'm not sure if that was the first thing they came up with"},{"from":1879.48,"to":1881.79,"location":2,"content":"or if they made it like that to match BLEU."},{"from":1881.79,"to":1884.61,"location":2,"content":"Um, and here's the,"},{"from":1884.61,"to":1886.05,"location":2,"content":"here's the equation, uh,"},{"from":1886.05,"to":1888.86,"location":2,"content":"for, well, I suppose one of the ROUGE metrics."},{"from":1888.86,"to":1891.21,"location":2,"content":"I'll tell you more about what that means later and you can"},{"from":1891.21,"to":1894.11,"location":2,"content":"read more in the original paper which is linked at the bottom."},{"from":1894.11,"to":1898.1,"location":2,"content":"So, uh, the overall idea is that ROUGE is actually pretty similar to BLEU."},{"from":1898.1,"to":1900.02,"location":2,"content":"It's based on n-gram overlap."},{"from":1900.02,"to":1905.65,"location":2,"content":"So, some main differences with BLEU are that ROUGE doesn't have a brevity penalty."},{"from":1905.65,"to":1907.23,"location":2,"content":"Um, I'll talk more about that in a minute."},{"from":1907.23,"to":1912.19,"location":2,"content":"Uh, the other big one is that ROUGE is based on recall while BLEU is based on precision."},{"from":1912.19,"to":1913.44,"location":2,"content":"So you can see it's there in the title."},{"from":1913.44,"to":1917.12,"location":2,"content":"[NOISE] Um, so, if you think about this a little bit,"},{"from":1917.12,"to":1922.24,"location":2,"content":"I think you can say arguably precision is more important for machine translation."},{"from":1922.24,"to":1929.13,"location":2,"content":"That is, you only want to generate text that appears in one of your reference, uh,"},{"from":1929.13,"to":1932.52,"location":2,"content":"translations. And then, to avoid it taking"},{"from":1932.52,"to":1934.77,"location":2,"content":"a really conservative strategy where you only generate"},{"from":1934.77,"to":1937.55,"location":2,"content":"really safe things in a really short translation,"},{"from":1937.55,"to":1940.04,"location":2,"content":"that's why you add the brevity penalty to make sure"},{"from":1940.04,"to":1943.04,"location":2,"content":"that [NOISE] it tries to write something long enough."},{"from":1943.04,"to":1944.64,"location":2,"content":"And then by contrast,"},{"from":1944.64,"to":1946.29,"location":2,"content":"recall is more important for"},{"from":1946.29,"to":1950.27,"location":2,"content":"summarization because you want to include all the information,"},{"from":1950.27,"to":1953.19,"location":2,"content":"the info- the important information in your summary, right?"},{"from":1953.19,"to":1956.49,"location":2,"content":"So the information that's in the reference summary is,"},{"from":1956.49,"to":1958.08,"location":2,"content":"uh, assumed to be the important information."},{"from":1958.08,"to":1960.24,"location":2,"content":"So recall means 
that you captured all of that."},{"from":1960.24,"to":1962.46,"location":2,"content":"Um, and I suppose i- if you assume that you have"},{"from":1962.46,"to":1965.04,"location":2,"content":"a maximum length constraint for your summarization system,"},{"from":1965.04,"to":1967.95,"location":2,"content":"then those two kind of give a trade-off, right?"},{"from":1967.95,"to":1972.72,"location":2,"content":"Where you want to include all the information but your summary can't be too long."},{"from":1972.72,"to":1975.43,"location":2,"content":"So I think that's the kind of justification why you have"},{"from":1975.43,"to":1978.15,"location":2,"content":"recall and precision for these two different tasks."},{"from":1978.15,"to":1981.48,"location":2,"content":"However, confusingly, often an F1,"},{"from":1981.48,"to":1983.91,"location":2,"content":"that is, a combined precision and recall version of"},{"from":1983.91,"to":1986.94,"location":2,"content":"ROUGE, is reported anyway in the summarization literature."},{"from":1986.94,"to":1989.49,"location":2,"content":"And to be honest, I'm not entirely sure why this is, uh,"},{"from":1989.49,"to":1991.14,"location":2,"content":"maybe it's because of the lack of,"},{"from":1991.14,"to":1993.49,"location":2,"content":"uh, an explicit max length constraint."},{"from":1993.49,"to":1997.82,"location":2,"content":"Um, anyway, I, I tried to search that but I couldn't find an answer."},{"from":1997.82,"to":2001.1,"location":2,"content":"So here's some more information on ROUGE."},{"from":2001.1,"to":2002.84,"location":2,"content":"Um, if you remember,"},{"from":2002.84,"to":2004.94,"location":2,"content":"BLEU is reported as a single number, right?"},{"from":2004.94,"to":2006.98,"location":2,"content":"BLEU is just a single number and it is"},{"from":2006.98,"to":2010.64,"location":2,"content":"a combination of the precisions for the different n-grams"},{"from":2010.64,"to":2012.95,"location":2,"content":"which is usually 1-4 whereas"},{"from":2012.95,"to":2016.91,"location":2,"content":"ROUGE scores are usually reported separately for each n-gram."},{"from":2016.91,"to":2022.25,"location":2,"content":"So, the most commonly reported ROUGE scores are ROUGE-1, ROUGE-2 and ROUGE-L."},{"from":2022.25,"to":2027.37,"location":2,"content":"So, ROUGE one, not to be confused with Rogue One: A Star Wars Story."},{"from":2027.37,"to":2029.06,"location":2,"content":"Um, I feel like since that film came out,"},{"from":2029.06,"to":2031.61,"location":2,"content":"I see so many people mistyping this, and I think it's related."},{"from":2031.61,"to":2034.73,"location":2,"content":"Um, so, ROUGE-1 is, uh,"},{"from":2034.73,"to":2037.3,"location":2,"content":"based on unigram overlap,"},{"from":2037.3,"to":2041.02,"location":2,"content":"um, [NOISE] and ROUGE-2 based on bigram overlap."},{"from":2041.02,"to":2043.31,"location":2,"content":"It's kind of an analogy to BLEU really except,"},{"from":2043.31,"to":2045.24,"location":2,"content":"uh, recall-based, not precision-based."},{"from":2045.24,"to":2050.45,"location":2,"content":"The more interesting one is ROUGE-L which is longest common subsequence overlap."},{"from":2050.45,"to":2054.59,"location":2,"content":"Um, so, the idea here is that you are interested not only in, uh,"},{"from":2054.59,"to":2056.86,"location":2,"content":"particular n-grams matching up but in,"},{"from":2056.86,"to":2058.31,"location":2,"content":"you know, how many, uh, how,"},{"from":2058.31,"to":2063.52,"location":2,"content":"how long a 
sequence of words can you find that appear in both."},{"from":2063.52,"to":2066.64,"location":2,"content":"So you can, uh, read more about these metrics"},{"from":2066.64,"to":2069.07,"location":2,"content":"in the paper that was linked on the previous page."},{"from":2069.07,"to":2071.49,"location":2,"content":"And another really important thing to note is there's [NOISE] now"},{"from":2071.49,"to":2075.2,"location":2,"content":"a convenient Python implementation of ROUGE, and um,"},{"from":2075.2,"to":2078.16,"location":2,"content":"maybe it is not apparent why that's exciting,"},{"from":2078.16,"to":2080.42,"location":2,"content":"but it's actually pretty exciting because for a long time,"},{"from":2080.42,"to":2082.48,"location":2,"content":"there was just this Perl script, um,"},{"from":2082.48,"to":2086.36,"location":2,"content":"that was quite hard to run and quite hard to set up and understand."},{"from":2086.36,"to":2089.44,"location":2,"content":"So um, someone out there has been a hero and has, uh,"},{"from":2089.44,"to":2092.29,"location":2,"content":"implemented a pure Python version of ROUGE and checked that it"},{"from":2092.29,"to":2095.32,"location":2,"content":"really does match up to the Perl script that people were using before."},{"from":2095.32,"to":2098.89,"location":2,"content":"So if any of you are using ROUGE or doing summarization for your projects, uh,"},{"from":2098.89,"to":2100.07,"location":2,"content":"make sure that you, uh,"},{"from":2100.07,"to":2102.53,"location":2,"content":"go use that because it will probably save you some time."},{"from":2102.53,"to":2106.09,"location":2,"content":"[NOISE] Okay."},{"from":2106.09,"to":2108.02,"location":2,"content":"So we're gonna re- return to ROUGE a little bit later."},{"from":2108.02,"to":2110.21,"location":2,"content":"Um, I know that in assignment 4 you thought about"},{"from":2110.21,"to":2112.79,"location":2,"content":"the shortcomings of BLEU as a metric and um,"},{"from":2112.79,"to":2116.55,"location":2,"content":"for sure ROUGE has some short- shortcomings as well as a metric for summarization."},{"from":2116.55,"to":2119.08,"location":2,"content":"Um, we're gonna come back to that later."},{"from":2119.08,"to":2123.23,"location":2,"content":"Okay. 
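For concreteness, here is a sketch of the recall-oriented n-gram overlap idea behind ROUGE-1 and ROUGE-2. It is a simplification, not the official metric; the linked paper and the Python package just mentioned define the real thing, including ROUGE-L's longest common subsequence.

```python
from collections import Counter

def rouge_n_recall(reference, candidate, n=1):
    """Recall-oriented n-gram overlap in the spirit of ROUGE-N: what
    fraction of the reference summary's n-grams also appear in the
    system summary? Counts are clipped so a repeated candidate n-gram
    is not rewarded more than it occurs in the reference."""
    def ngrams(text):
        toks = text.lower().split()
        return Counter(tuple(toks[i:i + n]) for i in range(len(toks) - n + 1))

    ref, cand = ngrams(reference), ngrams(candidate)
    overlap = sum(min(count, cand[gram]) for gram, count in ref.items())
    return overlap / max(sum(ref.values()), 1)

# e.g. rouge_n_recall("the cat sat on the mat", "the cat lay on the mat", n=2)
# gives 3/5 = 0.6: three of the five reference bigrams were recovered.
```

With n=1 this asks what fraction of the reference summary's unigrams the system summary managed to capture, which is the recall orientation discussed above.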
So, we're gonna move on to neural approaches for summarization."},{"from":2123.23,"to":2127.97,"location":2,"content":"[NOISE] So uh, going back to 2015,"},{"from":2127.97,"to":2130.31,"location":2,"content":"I don't have another dramatic reenactment, I'm afraid."},{"from":2130.31,"to":2132.71,"location":2,"content":"[NOISE] Um, Rush et al."},{"from":2132.71,"to":2135.59,"location":2,"content":"published the first seq2seq summarization paper."},{"from":2135.59,"to":2139.07,"location":2,"content":"[NOISE] So uh, they were viewing this as,"},{"from":2139.07,"to":2141.39,"location":2,"content":"you know, NMT has recently been super successful,"},{"from":2141.39,"to":2144.5,"location":2,"content":"why don't we view abstractive summarization as a translation task and"},{"from":2144.5,"to":2148.57,"location":2,"content":"therefore apply standard translation seq2seq methods to it."},{"from":2148.57,"to":2151.91,"location":2,"content":"So that's exactly what they did and they applied,"},{"from":2151.91,"to":2153.5,"location":2,"content":"uh, a standard attention model,"},{"from":2153.5,"to":2158,"location":2,"content":"and then they did a pretty good job at, uh, Gigaword summarization."},{"from":2158,"to":2159.62,"location":2,"content":"That's the one where you're, um,"},{"from":2159.62,"to":2163.13,"location":2,"content":"converting from the first sentence of the news article to the headline."},{"from":2163.13,"to":2165.57,"location":2,"content":"So it's kind of like, uh, sentence compression."},{"from":2165.57,"to":2170.57,"location":2,"content":"So crucially, this is kind of the same order of magnitude of length as NMT, right?"},{"from":2170.57,"to":2173.81,"location":2,"content":"Because NMT is sentence to sentence and this is kind of sentence to sentence,"},{"from":2173.81,"to":2175.8,"location":2,"content":"maybe at most two sentences to one sentence."},{"from":2175.8,"to":2178.31,"location":2,"content":"So this works pretty well and you can get pretty decent, um,"},{"from":2178.31,"to":2180.92,"location":2,"content":"headline generation or sentence compression using this kind of method."},{"from":2180.92,"to":2183.51,"location":2,"content":"[NOISE] Okay."},{"from":2183.51,"to":2185.51,"location":2,"content":"So after that, since 2015,"},{"from":2185.51,"to":2189.38,"location":2,"content":"there have been lots more developments in neural abstractive summarization."},{"from":2189.38,"to":2191.43,"location":2,"content":"And you can kind of um,"},{"from":2191.43,"to":2193.86,"location":2,"content":"group together these developments in,"},{"from":2193.86,"to":2195.44,"location":2,"content":"uh, a collection of themes."},{"from":2195.44,"to":2198.02,"location":2,"content":"So one theme is make it easier to copy."},{"from":2198.02,"to":2201.05,"location":2,"content":"Uh, this seems pretty obvious because in summarization, you know,"},{"from":2201.05,"to":2204.03,"location":2,"content":"you're gonna want to copy over quite a few words and even phrases,"},{"from":2204.03,"to":2206.61,"location":2,"content":"but don't copy too much."},{"from":2206.61,"to":2208.13,"location":2,"content":"Uh, that's the other thing is that if you make it"},{"from":2208.13,"to":2209.63,"location":2,"content":"too easy to copy, then you copy too much."},{"from":2209.63,"to":2212.6,"location":2,"content":"So, then there's other research showing how to prevent too much copying."},{"from":2212.6,"to":2218.14,"location":2,"content":"[NOISE] Uh, the next thing is some kind of hierarchical or multi-level 
attention."},{"from":2218.14,"to":2219.47,"location":2,"content":"So as I just showed,"},{"from":2219.47,"to":2221.69,"location":2,"content":"the attention has been pretty key to, um,"},{"from":2221.69,"to":2224,"location":2,"content":"abstractive neural summarization so far."},{"from":2224,"to":2225.61,"location":2,"content":"So there's been some work looking at, you know,"},{"from":2225.61,"to":2228.89,"location":2,"content":"can we kind of make this attention work at a more kind of high-level,"},{"from":2228.89,"to":2232.1,"location":2,"content":"low-level, coarse, fine version so"},{"from":2232.1,"to":2236.03,"location":2,"content":"that we can kind of maybe do our selection at the high-level and at low-level."},{"from":2236.03,"to":2238.99,"location":2,"content":"Another thing which is kind of related is having"},{"from":2238.99,"to":2241.7,"location":2,"content":"some more kind of global content selection."},{"from":2241.7,"to":2243.51,"location":2,"content":"So if you remember when we were talking about the,"},{"from":2243.51,"to":2246.02,"location":2,"content":"the pipelines pre-neural summarization,"},{"from":2246.02,"to":2248.43,"location":2,"content":"they had these different content selection algorithms."},{"from":2248.43,"to":2250.25,"location":2,"content":"And I think you can say that,"},{"from":2250.25,"to":2252.11,"location":2,"content":"um, kind of naive attention,"},{"from":2252.11,"to":2254.63,"location":2,"content":"attention seq2seq is not necessarily"},{"from":2254.63,"to":2257.49,"location":2,"content":"the best way to do content selection for summarization,"},{"from":2257.49,"to":2260.89,"location":2,"content":"maybe you want a more kind of global strategy where you choose what's important."},{"from":2260.89,"to":2264.05,"location":2,"content":"It's not so apparent here when you're doing this small-scale summarization,"},{"from":2264.05,"to":2265.43,"location":2,"content":"but if you imagine that you're summarizing"},{"from":2265.43,"to":2268.29,"location":2,"content":"a whole news article and you're choosing which information,"},{"from":2268.29,"to":2270.45,"location":2,"content":"kind of deciding on each decoder step,"},{"from":2270.45,"to":2273.17,"location":2,"content":"what to choose doesn't seem like the most global strategy."},{"from":2273.17,"to":2276.01,"location":2,"content":"Er, what else have we got?"},{"from":2276.01,"to":2279.41,"location":2,"content":"Uh, there's using, uh, Reinforcement Learning to directly maximize"},{"from":2279.41,"to":2281.3,"location":2,"content":"ROUGE or other discrete goals you might"},{"from":2281.3,"to":2283.82,"location":2,"content":"care about such as maybe the length of the summary."},{"from":2283.82,"to":2287.49,"location":2,"content":"Um, and I say discrete here because ROUGE is a non-differentiable,"},{"from":2287.49,"to":2289.64,"location":2,"content":"uh, function of your generated outputs."},{"from":2289.64,"to":2292.16,"location":2,"content":"There's no, you know, easy way to differentiably"},{"from":2292.16,"to":2294.2,"location":2,"content":"learn that during training in the usual way."},{"from":2294.2,"to":2300.17,"location":2,"content":"Uh, my last point on this list is the kind of theme of"},{"from":2300.17,"to":2304.04,"location":2,"content":"resurrecting pre-neural ideas such as those graph algorithms that I mentioned"},{"from":2304.04,"to":2305.96,"location":2,"content":"earlier and working them into"},{"from":2305.96,"to":2312.01,"location":2,"content":"these new seq2seq abstractive neural systems and I'm 
sure there is more as well."},{"from":2312.01,"to":2314.15,"location":2,"content":"So, I'm gonna show you a few of these, um,"},{"from":2314.15,"to":2317.66,"location":2,"content":"especially because even if you're not particularly interested in summarization,"},{"from":2317.66,"to":2320.93,"location":2,"content":"a lot of the ideas that we're gonna explore here are actually kind of applicable"},{"from":2320.93,"to":2325.3,"location":2,"content":"to other areas of NLG or just other areas of NLP deep learning."},{"from":2325.3,"to":2328.7,"location":2,"content":"So, the first thing on the list is making it easier to copy,"},{"from":2328.7,"to":2330.88,"location":2,"content":"which seems like probably the first thing you want to fix,"},{"from":2330.88,"to":2333.34,"location":2,"content":"if you've just got basic seq2seq with attention."},{"from":2333.34,"to":2335.8,"location":2,"content":"So, um, a copy mechanism,"},{"from":2335.8,"to":2338.9,"location":2,"content":"which can exist outside of summarization."},{"from":2338.9,"to":2343.16,"location":2,"content":"The reason, why you want this is that basic seq2seq with attention,"},{"from":2343.16,"to":2345.59,"location":2,"content":"they're good at writing fluent output, as we know,"},{"from":2345.59,"to":2349.84,"location":2,"content":"but they are pretty bad at copying over details like rare words correctly."},{"from":2349.84,"to":2353.21,"location":2,"content":"So a copy mechanism is just the kind of sensible idea of saying,"},{"from":2353.21,"to":2357.95,"location":2,"content":"um, let's have an explicit mechanism to just copy over words."},{"from":2357.95,"to":2360.14,"location":2,"content":"So for example, you could use"},{"from":2360.14,"to":2365.38,"location":2,"content":"the attention distribution to- to kind of select what you're going to copy."},{"from":2365.38,"to":2368.89,"location":2,"content":"Um, so, if you are allowing both copying"},{"from":2368.89,"to":2372.24,"location":2,"content":"over words and generating words in the usual way with your language model,"},{"from":2372.24,"to":2377.22,"location":2,"content":"then now you've got a kind of hybrid extractive/abstractive approach to summarization."},{"from":2377.22,"to":2380.36,"location":2,"content":"So, there are several papers, which are- which propose"},{"from":2380.36,"to":2383.33,"location":2,"content":"some kind of copy mechanism variants and I think,"},{"from":2383.33,"to":2385.04,"location":2,"content":"the reason why there is multiple is because there's"},{"from":2385.04,"to":2388.73,"location":2,"content":"kind of a few different choices you can make about how to implement this,"},{"from":2388.73,"to":2393.38,"location":2,"content":"and that means that there's a few different versions of how to implement copy mechanism."},{"from":2393.38,"to":2396.16,"location":2,"content":"So, uh, yeah, there are several papers here which you can look at."},{"from":2396.16,"to":2398.69,"location":2,"content":"I'm going to show you a diagram from a paper that um,"},{"from":2398.69,"to":2401.15,"location":2,"content":"I did a few years ago with Chris."},{"from":2401.15,"to":2404.91,"location":2,"content":"So, this is just one example of how you can do a copying mechanism."},{"from":2404.91,"to":2406.51,"location":2,"content":"So, the - the way we did it,"},{"from":2406.51,"to":2408.49,"location":2,"content":"is we said that on each decoder step,"},{"from":2408.49,"to":2411.59,"location":2,"content":"you're going to calculate this probability Pgen and 
that's"},{"from":2411.59,"to":2415.37,"location":2,"content":"the probability of generating the next word rather than copying it,"},{"from":2415.37,"to":2419.09,"location":2,"content":"and the idea is that this is computed based on your current kind of context,"},{"from":2419.09,"to":2420.93,"location":2,"content":"your current decoder hidden state."},{"from":2420.93,"to":2422.59,"location":2,"content":"So, then once you've done that,"},{"from":2422.59,"to":2424.79,"location":2,"content":"then the idea is you've got your attention distribution as"},{"from":2424.79,"to":2427.28,"location":2,"content":"normal and you've got your kind of output,"},{"from":2427.28,"to":2431.36,"location":2,"content":"you know, generation distribution as normal and you're going to use this Pgen,"},{"from":2431.36,"to":2432.55,"location":2,"content":"which is just a scalar."},{"from":2432.55,"to":2435.05,"location":2,"content":"You can use that to kind of, uh, combine,"},{"from":2435.05,"to":2438.01,"location":2,"content":"mix together these two probability distributions."},{"from":2438.01,"to":2440.12,"location":2,"content":"So, what this equation is telling you,"},{"from":2440.12,"to":2441.41,"location":2,"content":"is that saying that the uh,"},{"from":2441.41,"to":2444.23,"location":2,"content":"final output distribution for uh,"},{"from":2444.23,"to":2445.59,"location":2,"content":"what word is gonna come next,"},{"from":2445.59,"to":2447.08,"location":2,"content":"it's kind of saying, you know,"},{"from":2447.08,"to":2448.68,"location":2,"content":"it is the probability of generating"},{"from":2448.68,"to":2451.89,"location":2,"content":"times your probability distribution of what you would generate"},{"from":2451.89,"to":2454.28,"location":2,"content":"but then also the probability of copying"},{"from":2454.28,"to":2457.22,"location":2,"content":"and then also what you're attending to at that time."},{"from":2457.22,"to":2461.55,"location":2,"content":"So, the, the main thing is, you're using attention as your copying mechanism."},{"from":2461.55,"to":2463.61,"location":2,"content":"So, attention is kind of doing double-duty here."},{"from":2463.61,"to":2467.89,"location":2,"content":"It's both uh, being useful for the generator to,"},{"from":2467.89,"to":2470,"location":2,"content":"you know, uh, maybe choose to rephrase things"},{"from":2470,"to":2472.46,"location":2,"content":"but it is also being useful as a copying mechanism."},{"from":2472.46,"to":2475.43,"location":2,"content":"And I think that's one of the several things that these different papers do differently."},{"from":2475.43,"to":2478.94,"location":2,"content":"I think, I've seen a paper that maybe has like two separate uh,"},{"from":2478.94,"to":2481.68,"location":2,"content":"attention distributions, one for the copying and one for the attending."},{"from":2481.68,"to":2484.46,"location":2,"content":"Um, other choices you can make differently are for example,"},{"from":2484.46,"to":2487.43,"location":2,"content":"D1 Pgen to be this kind of soft thing that's between zero and"},{"from":2487.43,"to":2490.73,"location":2,"content":"one or do you want it to be a hard thing that has to be either zero or one."},{"from":2490.73,"to":2493.97,"location":2,"content":"Um, you can also make decisions about like"},{"from":2493.97,"to":2497,"location":2,"content":"do you want the Pgen to have supervision during training?"},{"from":2497,"to":2500.16,"location":2,"content":"Do you want to kind of annotate your data set saying these things are copied, 
things,"},{"from":2500.16,"to":2503.54,"location":2,"content":"these things are not, or do you want to just like learn it end-to-end?"},{"from":2503.54,"to":2506.07,"location":2,"content":"So there's multiple ways you can do this and um,"},{"from":2506.07,"to":2510.1,"location":2,"content":"this has now become pretty, pretty standard."},{"from":2510.1,"to":2512.99,"location":2,"content":"Okay, so copy mechanism seems like,"},{"from":2512.99,"to":2515.96,"location":2,"content":"seems like a sensible idea but there's a big problem with them,"},{"from":2515.96,"to":2518.33,"location":2,"content":"which is what I mentioned earlier and that problem is,"},{"from":2518.33,"to":2519.66,"location":2,"content":"that they copy too much."},{"from":2519.66,"to":2523.31,"location":2,"content":"Um, so, when you- when you run these kind of systems on summarization,"},{"from":2523.31,"to":2525.53,"location":2,"content":"you find that they end up copying a lot of"},{"from":2525.53,"to":2528.86,"location":2,"content":"long phrases and sometimes even whole sentences and uh,"},{"from":2528.86,"to":2531.86,"location":2,"content":"unfortunately your dream of having an abstractive summarization system,"},{"from":2531.86,"to":2533.8,"location":2,"content":"isn't going to work out because your, um,"},{"from":2533.8,"to":2536.51,"location":2,"content":"you know, copy augmented seq2seq system has just"},{"from":2536.51,"to":2540.03,"location":2,"content":"collapsed into a mostly extractive system, which is unfortunate."},{"from":2540.03,"to":2542.06,"location":2,"content":"Another problem with these uh,"},{"from":2542.06,"to":2545.16,"location":2,"content":"copy mechanism models is that they are bad at"},{"from":2545.16,"to":2548.6,"location":2,"content":"overall content selection especially if the input document is long,"},{"from":2548.6,"to":2550.25,"location":2,"content":"and this is what I was hinting at earlier."},{"from":2550.25,"to":2553.58,"location":2,"content":"Um, let's suppose, that you are summarizing something that's quite"},{"from":2553.58,"to":2557.09,"location":2,"content":"long like a news article that's hundreds of words long and you,"},{"from":2557.09,"to":2558.99,"location":2,"content":"you want to write a several sentence summary."},{"from":2558.99,"to":2561.57,"location":2,"content":"It doesn't seem like the kind of smartest choice to"},{"from":2561.57,"to":2564.35,"location":2,"content":"on every step of writing your several sentence summary,"},{"from":2564.35,"to":2566.39,"location":2,"content":"but you're choosing again what to attend to,"},{"from":2566.39,"to":2568.32,"location":2,"content":"what to select, what to summarize."},{"from":2568.32,"to":2572.8,"location":2,"content":"It seems better to kind of make a global decision at the beginning and then summarize."},{"from":2572.8,"to":2576.56,"location":2,"content":"So, yeah, the problem is, there's no overall strategy for selecting the contents."},{"from":2576.56,"to":2583.82,"location":2,"content":"So, uh, here's a paper that I found. Nope, not yet."},{"from":2583.82,"to":2588.45,"location":2,"content":"Okay. 
So, how might you do better content selection for neural summarization?"},{"from":2588.45,"to":2592.01,"location":2,"content":"So, if you remember in this pre-neural summarization we looked at,"},{"from":2592.01,"to":2594.89,"location":2,"content":"you had completely separate stages in the pipeline, right?"},{"from":2594.89,"to":2596.87,"location":2,"content":"You had the content selection stage and you had"},{"from":2596.87,"to":2600.26,"location":2,"content":"a surface realization that is the text generation stage."},{"from":2600.26,"to":2602.75,"location":2,"content":"But in our seq2seq attention systems,"},{"from":2602.75,"to":2605.24,"location":2,"content":"these two stages are just completely mixed together, right?"},{"from":2605.24,"to":2608.78,"location":2,"content":"You're doing your step-by-step surface realization that is text generation,"},{"from":2608.78,"to":2611.74,"location":2,"content":"and then on each of those, you're also doing content selection."},{"from":2611.74,"to":2615.3,"location":2,"content":"So, yeah, this doesn't make sense."},{"from":2615.3,"to":2617.51,"location":2,"content":"So, I found a paper, which is,"},{"from":2617.51,"to":2619.74,"location":2,"content":"uh, published I think last year,"},{"from":2619.74,"to":2622.16,"location":2,"content":"which gives a quite nice kind of"},{"from":2622.16,"to":2627.05,"location":2,"content":"simple solution to this problem and it's called bottom-up summarization."},{"from":2627.05,"to":2631.72,"location":2,"content":"So, in this paper if you look at the- if you look at the figure,"},{"from":2631.72,"to":2633.26,"location":2,"content":"uh, the main idea is pretty simple."},{"from":2633.26,"to":2637.37,"location":2,"content":"It says that, first you're going to have a content selection stage and this is"},{"from":2637.37,"to":2641.99,"location":2,"content":"just uh, thought of as a neural sequence tagging model problem, right?"},{"from":2641.99,"to":2644.66,"location":2,"content":"You run through your source documents and"},{"from":2644.66,"to":2647.61,"location":2,"content":"you kind of tag every word as include or don't include."},{"from":2647.61,"to":2649.79,"location":2,"content":"So, you're just kinda deciding like what seems important,"},{"from":2649.79,"to":2651.68,"location":2,"content":"what seems like it should make it into the summary and what"},{"from":2651.68,"to":2655.63,"location":2,"content":"doesn't and then the bottom-up attention stage says that,"},{"from":2655.63,"to":2658.01,"location":2,"content":"now you run seq2seq with an attention system,"},{"from":2658.01,"to":2659.95,"location":2,"content":"which is gonna generate the summary."},{"from":2659.95,"to":2661.61,"location":2,"content":"And you're gonna kind of apply a mask."},{"from":2661.61,"to":2663.13,"location":2,"content":"You know, apply a hard constraint that says,"},{"from":2663.13,"to":2666.91,"location":2,"content":"that you can't attend to words that were tagged don't-include."},{"from":2666.91,"to":2670.59,"location":2,"content":"So, this turns out to be pretty simple but effective um,"},{"from":2670.59,"to":2674.09,"location":2,"content":"because it's a better overall content selection strategy because by doing"},{"from":2674.09,"to":2678.8,"location":2,"content":"this first content selection stage by sequence-tagging you're kind of just,"},{"from":2678.8,"to":2682.73,"location":2,"content":"just doing the selection thing without also at the same time doing the generation 
thing,"},{"from":2682.73,"to":2684.8,"location":2,"content":"which I think turns out to be a better way to make"},{"from":2684.8,"to":2687.82,"location":2,"content":"better decisions about what to include and then separately,"},{"from":2687.82,"to":2689.9,"location":2,"content":"this also means as a great side effect,"},{"from":2689.9,"to":2693.5,"location":2,"content":"you have less copying of long sequences in the generation model."},{"from":2693.5,"to":2696.83,"location":2,"content":"Um, because if you are not allowed to attend to things,"},{"from":2696.83,"to":2698.22,"location":2,"content":"which you shouldn't be including,"},{"from":2698.22,"to":2701.96,"location":2,"content":"then it's kind of hard to copy a really long sequence, right?"},{"from":2701.96,"to":2705.32,"location":2,"content":"Like if you want to copy a whole sentence but the sentence has"},{"from":2705.32,"to":2708.98,"location":2,"content":"plenty of don't include words in it,"},{"from":2708.98,"to":2711.64,"location":2,"content":"then you can't really copy a long sequence, you have to break it up."},{"from":2711.64,"to":2712.97,"location":2,"content":"So, what the model ends up doing,"},{"from":2712.97,"to":2714.32,"location":2,"content":"is it kind of has to skip,"},{"from":2714.32,"to":2717.11,"location":2,"content":"skip around the parts that is meant to include and then it's forced to"},{"from":2717.11,"to":2720.65,"location":2,"content":"be more abstractive to put the parts together. Yep."},{"from":2720.65,"to":2725.51,"location":2,"content":"How did they backpropagate the masking decision because it seems like-"},{"from":2725.51,"to":2728.72,"location":2,"content":"Because during training [inaudible] masking decision."},{"from":2728.72,"to":2732.03,"location":2,"content":"Yeah, I think it might be trained separately."},{"from":2732.03,"to":2733.61,"location":2,"content":"I mean, you can go and check the paper."},{"from":2733.61,"to":2735.89,"location":2,"content":"I've, I've read a lot of papers in the last days, I can't quite remember."},{"from":2735.89,"to":2737.99,"location":2,"content":"I think, it might be trained separately but they might"},{"from":2737.99,"to":2740.66,"location":2,"content":"have tried training it together but it didn't work as well."},{"from":2740.66,"to":2743.2,"location":2,"content":"I am not sure. You can check it out."},{"from":2743.2,"to":2748.74,"location":2,"content":"Okay. 
So, another paper I want to tell you about is a paper which uh,"},{"from":2748.74,"to":2753.97,"location":2,"content":"used reinforcement learning to directly maximize ROUGE for neural summarization."},{"from":2753.97,"to":2756.28,"location":2,"content":"So this was a paper from two years ago."},{"from":2756.28,"to":2758.36,"location":2,"content":"And the main idea is that they can use RL to"},{"from":2758.36,"to":2761.87,"location":2,"content":"directly optimize in this case ROUGE-L, the metric."},{"from":2761.87,"to":2766.01,"location":2,"content":"So by contrast, the standard maximum likelihood of training that is"},{"from":2766.01,"to":2767.84,"location":2,"content":"the training objective we've been talking about for"},{"from":2767.84,"to":2770.39,"location":2,"content":"the whole class so far for language models uh,"},{"from":2770.39,"to":2773.84,"location":2,"content":"that can't directly optimize ROUGE-L because it's a non-differentiable function."},{"from":2773.84,"to":2776.87,"location":2,"content":"So they uh, they use this RL technique"},{"from":2776.87,"to":2781.82,"location":2,"content":"to compute the ROUGE score during training and then uh,"},{"from":2781.82,"to":2786.11,"location":2,"content":"use a reinforcement learning to backprop to the model."},{"from":2786.11,"to":2793.22,"location":2,"content":"So, the interesting finding from this paper is that if they just used the RL objective,"},{"from":2793.22,"to":2796.04,"location":2,"content":"then they do indeed get higher ROUGE scores."},{"from":2796.04,"to":2798.47,"location":2,"content":"So they can successfully optimize"},{"from":2798.47,"to":2800.24,"location":2,"content":"this ROUGE-L metric that they were aiming to"},{"from":2800.24,"to":2802.76,"location":2,"content":"optimize but the problem is that when you do that,"},{"from":2802.76,"to":2804.72,"location":2,"content":"you get lower human judgment scores."},{"from":2804.72,"to":2807.05,"location":2,"content":"So, on the right we're seeing that the RL only model has"},{"from":2807.05,"to":2811.78,"location":2,"content":"actually pretty pretty bad readability relevance human judgment scores."},{"from":2811.78,"to":2817.24,"location":2,"content":"It's worse than just the maximum likelihood supervised training system."},{"from":2817.24,"to":2820.68,"location":2,"content":"So, this is a quote from their blog post that says,"},{"from":2820.68,"to":2822.95,"location":2,"content":"\"We have observed that our models with the highest ROUGE scores"},{"from":2822.95,"to":2825.34,"location":2,"content":"also generated barely readable summaries.\""},{"from":2825.34,"to":2826.76,"location":2,"content":"So, this is- this is,"},{"from":2826.76,"to":2828.14,"location":2,"content":"um, I suppose a problem, right?"},{"from":2828.14,"to":2831.17,"location":2,"content":"If you try to directly optimize for the metric,"},{"from":2831.17,"to":2833.45,"location":2,"content":"then you might start finding that you're kind of gaming"},{"from":2833.45,"to":2836.68,"location":2,"content":"the metric and not optimizing for the true task, right,"},{"from":2836.68,"to":2840.55,"location":2,"content":"because we know, just as we know that BLEU was not really a perfect analogy to"},{"from":2840.55,"to":2842.53,"location":2,"content":"actual translation quality so is ROUGE"},{"from":2842.53,"to":2846.26,"location":2,"content":"not a perfect analogy to uh, summarization quality."},{"from":2846.26,"to":2848.66,"location":2,"content":"But they did do something cool, which is that they found that 
if"},{"from":2848.66,"to":2851.42,"location":2,"content":"you combine the two objectives,"},{"from":2851.42,"to":2853.03,"location":2,"content":"so they kind of, uh, you know,"},{"from":2853.03,"to":2856.91,"location":2,"content":"predict the language model sequence objective and then they also like produce"},{"from":2856.91,"to":2861.3,"location":2,"content":"an overall summary that gets a high ROUGE score objective and you combine them together,"},{"from":2861.3,"to":2865.37,"location":2,"content":"then you can get a better human uh, judgment score,"},{"from":2865.37,"to":2868.22,"location":2,"content":"which in the end is the closest thing we have to uh,"},{"from":2868.22,"to":2869.93,"location":2,"content":"a measure of actual summarization quality."},{"from":2869.93,"to":2874.34,"location":2,"content":"[NOISE] Okay."},{"from":2874.34,"to":2877.28,"location":2,"content":"So, I'm gonna move on to uh, dialogue,"},{"from":2877.28,"to":2881.75,"location":2,"content":"which is um, a different NLG, kind of family of tasks."},{"from":2881.75,"to":2885.59,"location":2,"content":"Uh, so, really dialogue encompasses a really large variety of settings."},{"from":2885.59,"to":2886.7,"location":2,"content":"And we are not going to cover them all,"},{"from":2886.7,"to":2888.8,"location":2,"content":"but here is a kind of overview of all the different kinds"},{"from":2888.8,"to":2891.18,"location":2,"content":"of tasks that people might mean, when they say dialogue."},{"from":2891.18,"to":2895.49,"location":2,"content":"Um, so, there's task-oriented dialogue and this kind of refers to any setting,"},{"from":2895.49,"to":2898.2,"location":2,"content":"where you're trying to kind of get something done in the conversation."},{"from":2898.2,"to":2899.69,"location":2,"content":"So, if for example, you've got kind of"},{"from":2899.69,"to":2903.59,"location":2,"content":"assistive tasks where it's assumed that you have, you know, maybe the uh,"},{"from":2903.59,"to":2907.04,"location":2,"content":"the dialogue agent is trying to help a human user to do"},{"from":2907.04,"to":2910.7,"location":2,"content":"something like maybe giving customer service or recommendations,"},{"from":2910.7,"to":2912.89,"location":2,"content":"answering questions, helping a user,"},{"from":2912.89,"to":2915.95,"location":2,"content":"you know, accomplish a task like buying or booking something."},{"from":2915.95,"to":2918.35,"location":2,"content":"Uh, these are the kinds of tasks, which the virtual systems on"},{"from":2918.35,"to":2921.74,"location":2,"content":"your phone can do or can kind of do."},{"from":2921.74,"to":2926.59,"location":2,"content":"Um, another family of task-oriented dialogue tasks are cooperative tasks."},{"from":2926.59,"to":2929.15,"location":2,"content":"So, this is kind of anything where you've got two agents who are"},{"from":2929.15,"to":2932.12,"location":2,"content":"trying to solve a task together via dialogue."},{"from":2932.12,"to":2934.72,"location":2,"content":"Um, and the opposite of that would be adversarial."},{"from":2934.72,"to":2938.6,"location":2,"content":"So anything where you have two agents who are trying to compete in a task and that uh,"},{"from":2938.6,"to":2942.34,"location":2,"content":"competition is conducted through dialogue."},{"from":2942.34,"to":2948.95,"location":2,"content":"[NOISE] So uh, the opposite to task-oriented dialogue is, uh, social dialogue."},{"from":2948.95,"to":2953.6,"location":2,"content":"So that's something where there is no explicit task other than 
to, I suppose socialize."},{"from":2953.6,"to":2956.11,"location":2,"content":"So chit-chat dialogue, um,"},{"from":2956.11,"to":2960.2,"location":2,"content":"is just dialogue where you're just doing it for social fun or for company."},{"from":2960.2,"to":2964.91,"location":2,"content":"Um, I've also seen some work on kind of like therapy or mental well-being dialogue,"},{"from":2964.91,"to":2966.74,"location":2,"content":"I'm not sure if this should go in task or social,"},{"from":2966.74,"to":2968.11,"location":2,"content":"it's kind of a mix, uh,"},{"from":2968.11,"to":2970.58,"location":2,"content":"but I suppose these are the ones where the goal is to"},{"from":2970.58,"to":2974.28,"location":2,"content":"maybe offer kind of emotional support to the human user."},{"from":2974.28,"to":2980.03,"location":2,"content":"Um, so as a very kind of brief overview of how,"},{"from":2980.03,"to":2982.07,"location":2,"content":"uh, the deep learning, uh,"},{"from":2982.07,"to":2985.07,"location":2,"content":"renaissance has kind of changed dialog research, um,"},{"from":2985.07,"to":2988.59,"location":2,"content":"I think you can say that in kind of pre-deep learning,"},{"from":2988.59,"to":2990.62,"location":2,"content":"uh, the difficulty of open-ended,"},{"from":2990.62,"to":2993.83,"location":2,"content":"free-form natural language generation, meant that, uh,"},{"from":2993.83,"to":2995.41,"location":2,"content":"dialogue systems were often,"},{"from":2995.41,"to":2998.36,"location":2,"content":"uh, not doing free-form NLG."},{"from":2998.36,"to":3000.73,"location":2,"content":"They might use predefined templates meaning that you have"},{"from":3000.73,"to":3003.78,"location":2,"content":"a template where you just fill in some slots with the content, uh,"},{"from":3003.78,"to":3006.7,"location":2,"content":"or maybe you retrieve an appropriate response from"},{"from":3006.7,"to":3009.63,"location":2,"content":"a corpus of responses that you have in order to find,"},{"from":3009.63,"to":3011.38,"location":2,"content":"you know, an appropriate response for the user."},{"from":3011.38,"to":3013.33,"location":2,"content":"And these are by no means simple systems,"},{"from":3013.33,"to":3016.18,"location":2,"content":"they had some very complex things going on like deciding, you know,"},{"from":3016.18,"to":3019.57,"location":2,"content":"what their dialogue state is and what template you should use and so on and the-"},{"from":3019.57,"to":3023.91,"location":2,"content":"all the natural language understanding components of understanding the context so far."},{"from":3023.91,"to":3026.45,"location":2,"content":"But, uh, one effect that,"},{"from":3026.45,"to":3028.82,"location":2,"content":"that deep learning had is that, uh,"},{"from":3028.82,"to":3031.91,"location":2,"content":"since again kind of 2015 which is when NMT, uh,"},{"from":3031.91,"to":3034.61,"location":2,"content":"became standard, there's been, uh,"},{"from":3034.61,"to":3038.44,"location":2,"content":"just like summarization, lots of papers applying seq2seq methods to dialogue."},{"from":3038.44,"to":3043.43,"location":2,"content":"And this has kind of led to a renewed interest in open-ended, free-form dialogue systems."},{"from":3043.43,"to":3045.76,"location":2,"content":"So uh, if you wanna have a look at what did"},{"from":3045.76,"to":3048.13,"location":2,"content":"those early seq2seq dialogue papers look like,"},{"from":3048.13,"to":3055.53,"location":2,"content":"um, here's two kind of early ones like maybe the first 
ones to apply seq2seq."},{"from":3055.53,"to":3060.4,"location":2,"content":"Okay. So uh, people quickly applied seq2seq, uh,"},{"from":3060.4,"to":3063.16,"location":2,"content":"NMT methods to dialogue but it quickly became"},{"from":3063.16,"to":3066.13,"location":2,"content":"very apparent that this kind of naive application of"},{"from":3066.13,"to":3068.56,"location":2,"content":"standard NMT methods has"},{"from":3068.56,"to":3073.91,"location":2,"content":"some serious pervasive deficiencies when applied to a task like chitchat dialogue."},{"from":3073.91,"to":3076.96,"location":2,"content":"And this is even more true than it was for summarization."},{"from":3076.96,"to":3081.14,"location":2,"content":"So what are some examples of these serious pervas- pervasive deficiencies?"},{"from":3081.14,"to":3084.43,"location":2,"content":"Uh, one would be genericness or boring responses,"},{"from":3084.43,"to":3086.71,"location":2,"content":"and I'll go into more detail about these in a moment."},{"from":3086.71,"to":3089.01,"location":2,"content":"Another one is irrelevant responses."},{"from":3089.01,"to":3090.18,"location":2,"content":"So that's when, uh,"},{"from":3090.18,"to":3092.2,"location":2,"content":"the dialogue agent kind of says something back"},{"from":3092.2,"to":3095.07,"location":2,"content":"that's just kind of unrelated to what the user says."},{"from":3095.07,"to":3096.7,"location":2,"content":"Um, another one is repetition,"},{"from":3096.7,"to":3098.08,"location":2,"content":"this is pretty basic but it,"},{"from":3098.08,"to":3099.64,"location":2,"content":"uh, it happens a lot."},{"from":3099.64,"to":3104.28,"location":2,"content":"Um, so that's also repetition within the utterance and maybe repetition across utterances."},{"from":3104.28,"to":3107.49,"location":2,"content":"Ah, another difficulty is,"},{"from":3107.49,"to":3108.91,"location":2,"content":"uh, kind of lack of context,"},{"from":3108.91,"to":3110.8,"location":2,"content":"like not remembering the conversation history."},{"from":3110.8,"to":3113.71,"location":2,"content":"Obviously, if you do not condition on the whole conversation history,"},{"from":3113.71,"to":3117.19,"location":2,"content":"there's no way your dialogue agent can use it but it is a challenge especially if you"},{"from":3117.19,"to":3121.32,"location":2,"content":"have a very long dialogue history to figure out how to condition on it effectively."},{"from":3121.32,"to":3124.06,"location":2,"content":"Another problem is the lack of consistent persona."},{"from":3124.06,"to":3125.38,"location":2,"content":"So if you kind of, uh,"},{"from":3125.38,"to":3129.97,"location":2,"content":"naively as in maybe those two papers that I referenced on the previous slide,"},{"from":3129.97,"to":3134.39,"location":2,"content":"if you naively train a kind of standard seq2seq model to maybe take the, uh,"},{"from":3134.39,"to":3136.48,"location":2,"content":"you know the user's last utterance and then say something back,"},{"from":3136.48,"to":3138.95,"location":2,"content":"or maybe even the whole dialogue history and say something back."},{"from":3138.95,"to":3142.68,"location":2,"content":"Often your dialogue agent will have this completely inconsistent persona,"},{"from":3142.68,"to":3146.8,"location":2,"content":"like one moment they will say that it lives in Europe and then it'll say it lives in,"},{"from":3146.8,"to":3149.77,"location":2,"content":"I don't know, China or something and it just doesn't make 
sense."},{"from":3149.77,"to":3151.91,"location":2,"content":"So I'm gonna go through, uh,"},{"from":3151.91,"to":3154.81,"location":2,"content":"some of these problems and give you a bit more detail on them."},{"from":3154.81,"to":3157.87,"location":2,"content":"So first, this irrelevant response problem."},{"from":3157.87,"to":3160.96,"location":2,"content":"So in a bit more detail, your problem is that seq2seq often"},{"from":3160.96,"to":3164.08,"location":2,"content":"generates some response that's kind of unrelated to the user's utterance."},{"from":3164.08,"to":3167.16,"location":2,"content":"So it can be unrelated because it's simply generic,"},{"from":3167.16,"to":3169.15,"location":2,"content":"which means that this is kind of like an overlap with"},{"from":3169.15,"to":3171.61,"location":2,"content":"a generic response problem or it can be"},{"from":3171.61,"to":3174.16,"location":2,"content":"kind of unrelated because the model's choosing to kind of change,"},{"from":3174.16,"to":3176.84,"location":2,"content":"to change the subject to something unrelated."},{"from":3176.84,"to":3179.02,"location":2,"content":"So one solution of many, there,"},{"from":3179.02,"to":3180.88,"location":2,"content":"there are a lot of different papers which, uh,"},{"from":3180.88,"to":3184.74,"location":2,"content":"kind of attack this irrelevant response problem, uh, but just one,"},{"from":3184.74,"to":3187.01,"location":2,"content":"one for example is, uh,"},{"from":3187.01,"to":3189.84,"location":2,"content":"that you should tr- change the training objective."},{"from":3189.84,"to":3192.76,"location":2,"content":"So instead of trying to optimize, um,"},{"from":3192.76,"to":3195.52,"location":2,"content":"mapping from input S to response T such that"},{"from":3195.52,"to":3198.63,"location":2,"content":"you're maximizing the conditional probability of T given S,"},{"from":3198.63,"to":3202.43,"location":2,"content":"instead you should maximize the maximum mutual information."},{"from":3202.43,"to":3204.24,"location":2,"content":"So that's why this is here."},{"from":3204.24,"to":3206.53,"location":2,"content":"So maximum mutual information, uh,"},{"from":3206.53,"to":3209.14,"location":2,"content":"you can kind of rewrite the objective like this,"},{"from":3209.14,"to":3211.91,"location":2,"content":"and if you want to see some more detail you can go look at this paper here."},{"from":3211.91,"to":3215.83,"location":2,"content":"But the idea is that you're trying to find your response T that kind of, uh,"},{"from":3215.83,"to":3218.68,"location":2,"content":"maximizes this thing which is kind of like saying,"},{"from":3218.68,"to":3221.68,"location":2,"content":"it needs to be probable given the inputs but"},{"from":3221.68,"to":3224.92,"location":2,"content":"kind of like as a ratio of its probability in itself."},{"from":3224.92,"to":3229.51,"location":2,"content":"So if T is very high likelihood,"},{"from":3229.51,"to":3232.6,"location":2,"content":"then it gets penalized and it's kind of like about the ratio"},{"from":3232.6,"to":3236.44,"location":2,"content":"of the probability given the input and it's just the stand-alone probability."},{"from":3236.44,"to":3239.65,"location":2,"content":"So the idea is that this is meant to discourage, um,"},{"from":3239.65,"to":3244.24,"location":2,"content":"just saying generic things that just have a high PT by themselves."},{"from":3244.24,"to":3248.99,"location":2,"content":"Um, so that's the irrelevant response 
problem."},{"from":3248.99,"to":3250.78,"location":2,"content":"And as I just hinted at, there's, uh,"},{"from":3250.78,"to":3252.04,"location":2,"content":"definitely a strong link between"},{"from":3252.04,"to":3256.87,"location":2,"content":"the irrelevant response problem and the kind of generic or boring response problem."},{"from":3256.87,"to":3261.49,"location":2,"content":"So to look at the genericness or boring response problem."},{"from":3261.49,"to":3267.73,"location":2,"content":"[NOISE] So I think"},{"from":3267.73,"to":3272.23,"location":2,"content":"there are some pretty easy fixes that you can make to,"},{"from":3272.23,"to":3275.47,"location":2,"content":"to a degree ameliorate the boring response problem."},{"from":3275.47,"to":3278.41,"location":2,"content":"Whether you're really getting to the heart of the issue is a different question."},{"from":3278.41,"to":3282.31,"location":2,"content":"But some kind of easy test-time fixes that you can certainly do are for example,"},{"from":3282.31,"to":3286.89,"location":2,"content":"you can just directly up-rate, up-weight rare words during beam search."},{"from":3286.89,"to":3289.68,"location":2,"content":"So you can say, all rare words kind of get a boost to their, uh,"},{"from":3289.68,"to":3291.88,"location":2,"content":"log probabilities and then now we're"},{"from":3291.88,"to":3294.22,"location":2,"content":"more likely to produce them during beam search."},{"from":3294.22,"to":3296.41,"location":2,"content":"Another thing you could do is you could use for example,"},{"from":3296.41,"to":3300.53,"location":2,"content":"a sampling decoding algorithm rather than beam search and we talked about that earlier,"},{"from":3300.53,"to":3302.35,"location":2,"content":"um, or you could use, oh yeah,"},{"from":3302.35,"to":3304.2,"location":2,"content":"you could use softmax temperature as well."},{"from":3304.2,"to":3307.11,"location":2,"content":"That's another thing. 
So those are"},{"from":3307.11,"to":3312.04,"location":2,"content":"kind of test-time fixes and you could regard those as a kind of late intervention, right?"},{"from":3312.04,"to":3316,"location":2,"content":"So an earlier intervention would be maybe training your model differently."},{"from":3316,"to":3320.01,"location":2,"content":"So I'm calling these kind of conditioning fixes because these fixes kind of relate to,"},{"from":3320.01,"to":3323.97,"location":2,"content":"uh, conditioning your model on something that's gonna help it be less boring."},{"from":3323.97,"to":3326.32,"location":2,"content":"So one example is maybe you should condition"},{"from":3326.32,"to":3328.93,"location":2,"content":"the decoder on some kind of additional context."},{"from":3328.93,"to":3331.15,"location":2,"content":"Uh, so for example, there's some work showing that, you know,"},{"from":3331.15,"to":3333.63,"location":2,"content":"if you're doing chitchat dialogue, then maybe you should, uh,"},{"from":3333.63,"to":3336.28,"location":2,"content":"go and sample some related words that are related to"},{"from":3336.28,"to":3338.71,"location":2,"content":"what the user said and then just kind of attend to them when you"},{"from":3338.71,"to":3341.35,"location":2,"content":"generate and then you're more likely to say something that's kind of content"},{"from":3341.35,"to":3344.08,"location":2,"content":"full and interesting compared to the boring things you were saying before."},{"from":3344.08,"to":3346.87,"location":2,"content":"Ah, another option is you could train"},{"from":3346.87,"to":3350.77,"location":2,"content":"a retrieve-and-refine model rather than a generate-from-scratch model."},{"from":3350.77,"to":3353.44,"location":2,"content":"So by retrieve-and-refine, I mean, uh,"},{"from":3353.44,"to":3355.82,"location":2,"content":"you've- supposing you have some kind of corpus of,"},{"from":3355.82,"to":3357.4,"location":2,"content":"of just general kind of utterances,"},{"from":3357.4,"to":3360.46,"location":2,"content":"things that you could say and then maybe you sample one, uh,"},{"from":3360.46,"to":3361.86,"location":2,"content":"from that test set,"},{"from":3361.86,"to":3363.78,"location":2,"content":"th- the training set,"},{"from":3363.78,"to":3366.89,"location":2,"content":"and then you edit it to fit the current situation."},{"from":3366.89,"to":3370.63,"location":2,"content":"So it turns out that this is a pretty strong method to produce"},{"from":3370.63,"to":3374.8,"location":2,"content":"much more kind of diverse and human-like and interesting utterances, um,"},{"from":3374.8,"to":3379.55,"location":2,"content":"because you can get all of that kind of fine grain detail from the sampled,"},{"from":3379.55,"to":3383.66,"location":2,"content":"ah, utterance and then you edit it as necessary to fit your current situation."},{"from":3383.66,"to":3386.74,"location":2,"content":"So I mean, there are downsides to these kinds of methods like maybe it can be"},{"from":3386.74,"to":3390.14,"location":2,"content":"hard to edit it to actually appropriately fit the situation,"},{"from":3390.14,"to":3392.41,"location":2,"content":"um, but it's certainly a way to effectively get like"},{"from":3392.41,"to":3397.17,"location":2,"content":"some more diversity and, um, interest in that."},{"from":3397.17,"to":3400.81,"location":2,"content":"So on the subject of the repetition problem,"},{"from":3400.81,"to":3403.11,"location":2,"content":"that was another kind of major problem we noticed 
for,"},{"from":3403.11,"to":3405.79,"location":2,"content":"um, applying seq2seq to, uh, chitchat."},{"from":3405.79,"to":3408.97,"location":2,"content":"Um, again, there are kind of simple solutions and more complex solutions."},{"from":3408.97,"to":3412.15,"location":2,"content":"Um, so a simple solution is you could just block repeating"},{"from":3412.15,"to":3415.91,"location":2,"content":"n-grams during beam search and this is usually really quite effective."},{"from":3415.91,"to":3417.59,"location":2,"content":"And what I mean by that is, uh,"},{"from":3417.59,"to":3419.82,"location":2,"content":"during beam search when you're kind of considering,"},{"from":3419.82,"to":3421.51,"location":2,"content":"you know, what are my K hypotheses?"},{"from":3421.51,"to":3425.11,"location":2,"content":"Which is just kind of the top K in the probability distribution, you say,"},{"from":3425.11,"to":3429.53,"location":2,"content":"well, anything that would constitute a repeating n-gram just gets thrown out."},{"from":3429.53,"to":3431.59,"location":2,"content":"So when I say constitutes a repeating n-gram,"},{"from":3431.59,"to":3434.47,"location":2,"content":"I mean if you did take that word,"},{"from":3434.47,"to":3439.63,"location":2,"content":"would you now be creating a repeating let's say two-gram, bigram and, um,"},{"from":3439.63,"to":3443.5,"location":2,"content":"if we're deciding that we're banning all repeating bigrams or trigrams or whatever,"},{"from":3443.5,"to":3446.62,"location":2,"content":"then you essentially just have to check for every possible word that you might"},{"from":3446.62,"to":3450.7,"location":2,"content":"be looking at in beam search and whether that would create a repeating n-gram."},{"from":3450.7,"to":3452.44,"location":2,"content":"So this works pretty well, I mean,"},{"from":3452.44,"to":3454.78,"location":2,"content":"it's by no means a kind of principled solution, right?"},{"from":3454.78,"to":3457.49,"location":2,"content":"If feels like we should kind of have a better way to learn not to repeat, um,"},{"from":3457.49,"to":3459.79,"location":2,"content":"but as a kind of, uh,"},{"from":3459.79,"to":3462.53,"location":2,"content":"effective hack, I think that works, that works pretty well."},{"from":3462.53,"to":3464.83,"location":2,"content":"So the more complex solutions are,"},{"from":3464.83,"to":3467.92,"location":2,"content":"for example, you can train something called coverage mechanism."},{"from":3467.92,"to":3470.53,"location":2,"content":"Um, so in seq2seq, and this is mostly, uh,"},{"from":3470.53,"to":3473.8,"location":2,"content":"inspired by the machine translation setting, uh,"},{"from":3473.8,"to":3476.44,"location":2,"content":"a coverage mechanism is a kind of objective that prevents"},{"from":3476.44,"to":3478.63,"location":2,"content":"the attention mechanism from attending to"},{"from":3478.63,"to":3481.81,"location":2,"content":"the same words multiple times or too many times."},{"from":3481.81,"to":3483.66,"location":2,"content":"And the intuition here is that, uh,"},{"from":3483.66,"to":3486.59,"location":2,"content":"maybe repetition is caused by repeated attention."},{"from":3486.59,"to":3488.62,"location":2,"content":"So if you attend to the same things many times,"},{"from":3488.62,"to":3489.97,"location":2,"content":"then maybe you're gonna repeat,"},{"from":3489.97,"to":3491.61,"location":2,"content":"you know, the same output many times."},{"from":3491.61,"to":3493.69,"location":2,"content":"So if you prevent the repeated 
attention,"},{"from":3493.69,"to":3495.28,"location":2,"content":"you prevent the repeated output."},{"from":3495.28,"to":3498.19,"location":2,"content":"So this does work pretty well but it's definitely,"},{"from":3498.19,"to":3501.49,"location":2,"content":"um, more of a complex thing to implement,"},{"from":3501.49,"to":3503.64,"location":2,"content":"it's less convenient and,"},{"from":3503.64,"to":3505.12,"location":2,"content":"um, I don't know,"},{"from":3505.12,"to":3508.07,"location":2,"content":"in some settings, it does seem like the simple solution is,"},{"from":3508.07,"to":3509.53,"location":2,"content":"uh, easier and works just as well."},{"from":3509.53,"to":3512.74,"location":2,"content":"Uh, so other complex solutions"},{"from":3512.74,"to":3515.8,"location":2,"content":"might be you could define a training objective to discourage repetition."},{"from":3515.8,"to":3518.32,"location":2,"content":"Uh, this cou- you could try to, um,"},{"from":3518.32,"to":3521.13,"location":2,"content":"define something differentiable but one of the,"},{"from":3521.13,"to":3525.6,"location":2,"content":"the difficulties there is that because you're training with a teacher forcing, right?"},{"from":3525.6,"to":3527.05,"location":2,"content":"Where you're always like looking at the,"},{"from":3527.05,"to":3528.43,"location":2,"content":"the gold inputs so far,"},{"from":3528.43,"to":3530.7,"location":2,"content":"then you never really do the thing where"},{"from":3530.7,"to":3533.01,"location":2,"content":"you generate your own output and start repeating yourself."},{"from":3533.01,"to":3535.84,"location":2,"content":"So it's kind of hard to define the penalty in that situation."},{"from":3535.84,"to":3538.35,"location":2,"content":"So maybe this needs to be a kind of non-differentiable function."},{"from":3538.35,"to":3540.26,"location":2,"content":"So kind of like how,"},{"from":3540.26,"to":3543.74,"location":2,"content":"um, the Paul et al paper was, uh,"},{"from":3543.74,"to":3546.25,"location":2,"content":"optimizing for ROUGE, maybe we kind of, uh,"},{"from":3546.25,"to":3551.45,"location":2,"content":"optimize for not repeating which is a discrete function of the input."},{"from":3551.45,"to":3554.43,"location":2,"content":"Uh, I'm going to skip ahead to storytelling."},{"from":3554.43,"to":3556.2,"location":2,"content":"So in storytelling, uh,"},{"from":3556.2,"to":3559.01,"location":2,"content":"there's a lot of interesting neural storytelling work going on right now."},{"from":3559.01,"to":3562.28,"location":2,"content":"And most of it uses some kind of prompt to write a story."},{"from":3562.28,"to":3564.61,"location":2,"content":"So for example, uh,"},{"from":3564.61,"to":3568.11,"location":2,"content":"writing a story given an image or given a writing prompt"},{"from":3568.11,"to":3572.72,"location":2,"content":"or writing the next sentence of the story given the story so far."},{"from":3572.72,"to":3577.64,"location":2,"content":"So, uh, here's an example of generating a story from an image."},{"from":3577.64,"to":3580.36,"location":2,"content":"And what's interesting here is that we have this image which"},{"from":3580.36,"to":3582.94,"location":2,"content":"is a picture of what appears to be an explosion and"},{"from":3582.94,"to":3584.74,"location":2,"content":"then here you have"},{"from":3584.74,"to":3588.47,"location":2,"content":"a story about the image but written in the style of Taylor Swift lyrics."},{"from":3588.47,"to":3592.01,"location":2,"content":"So it says, 
you have to be the only light bulb in the night sky I thought,"},{"from":3592.01,"to":3595.22,"location":2,"content":"oh god, it's so dark out of me that I missed you, I promise."},{"from":3595.22,"to":3598.69,"location":2,"content":"And what's interesting here is that there wasn't any straightforward, supervised,"},{"from":3598.69,"to":3602.62,"location":2,"content":"you know, image-captioning data set of explosions and Taylor Swift lyrics."},{"from":3602.62,"to":3605.89,"location":2,"content":"Um, they kind of learned this, uh, separately."},{"from":3605.89,"to":3612.22,"location":2,"content":"So how they did this is that they used a kind of common sentence encoding space."},{"from":3612.22,"to":3615.16,"location":2,"content":"So they used this particular kind of sentence encoding called"},{"from":3615.16,"to":3618.2,"location":2,"content":"skip-thought vectors and then they trained,"},{"from":3618.2,"to":3621.88,"location":2,"content":"um, this COCO image-captioning, uh,"},{"from":3621.88,"to":3624.79,"location":2,"content":"system to go from the image to the encoding of"},{"from":3624.79,"to":3628.11,"location":2,"content":"the sentence and then separately they also trained,"},{"from":3628.11,"to":3630.37,"location":2,"content":"uh, a language model, a conditional language model to go from"},{"from":3630.37,"to":3633.01,"location":2,"content":"the sentence-encoding to the Taylor Swift lyrics."},{"from":3633.01,"to":3635.23,"location":2,"content":"And then because you had this shared encoding space,"},{"from":3635.23,"to":3638.3,"location":2,"content":"you can now put the two together and then go from the picture,"},{"from":3638.3,"to":3641.05,"location":2,"content":"to the embedding, to the Taylor Swift style output,"},{"from":3641.05,"to":3644.22,"location":2,"content":"which I think is pretty, pretty amazing."},{"from":3644.22,"to":3646.6,"location":2,"content":"Wow, I've really lost, lost track of the time."},{"from":3646.6,"to":3648.25,"location":2,"content":"So I, I think I have to hurry up quite a lot."},{"from":3648.25,"to":3655.14,"location":2,"content":"So, um, we've got some really impressive story,"},{"from":3655.14,"to":3657.9,"location":2,"content":"generation systems, recently, um,"},{"from":3657.9,"to":3660.8,"location":2,"content":"and this is an example of,"},{"from":3660.8,"to":3662.58,"location":2,"content":"uh, a system which,"},{"from":3662.58,"to":3663.84,"location":2,"content":"uh, prepares a new data set,"},{"from":3663.84,"to":3665.74,"location":2,"content":"where you write a story given a prompt,"},{"from":3665.74,"to":3668.16,"location":2,"content":"and they made this very impressive,"},{"from":3668.16,"to":3670.9,"location":2,"content":"very beefed-up, uh, convolutional language model,"},{"from":3670.9,"to":3673.97,"location":2,"content":"seq-to-seq system that generates the story given the input."},{"from":3673.97,"to":3675.64,"location":2,"content":"I'm not gonna go through all these details,"},{"from":3675.64,"to":3678.07,"location":2,"content":"but I encourage you if you want to check out, uh,"},{"from":3678.07,"to":3680.95,"location":2,"content":"what's the state of the art in story generation, you should check this out."},{"from":3680.95,"to":3683.11,"location":2,"content":"There's a lot of different interesting things going on with"},{"from":3683.11,"to":3685.78,"location":2,"content":"very fancy attention and convolutions and so on,"},{"from":3685.78,"to":3689.7,"location":2,"content":"and they managed to generate some really interesting, um, 
impressive stories."},{"from":3689.7,"to":3691.75,"location":2,"content":"So here, if you look at this example,"},{"from":3691.75,"to":3696.28,"location":2,"content":"we've got some really interesting, um, kind of,"},{"from":3696.28,"to":3699.32,"location":2,"content":"uh, story generation that's kind of diverse, it's non-generic,"},{"from":3699.32,"to":3701.32,"location":2,"content":"it's stylistically dramatic which is good,"},{"from":3701.32,"to":3702.93,"location":2,"content":"and is related to the prompts."},{"from":3702.93,"to":3706.33,"location":2,"content":"Um, but I think you can see here kind of the limits of what"},{"from":3706.33,"to":3709.86,"location":2,"content":"the state of the art story generation system can do which is that- um,"},{"from":3709.86,"to":3711.72,"location":2,"content":"although it's kind of in style,"},{"from":3711.72,"to":3714.63,"location":2,"content":"it's mostly kind of atmospheric and descriptive."},{"from":3714.63,"to":3716.14,"location":2,"content":"It's not really moving the plot forward."},{"from":3716.14,"to":3717.94,"location":2,"content":"There's no kind of events here, right?"},{"from":3717.94,"to":3722.3,"location":2,"content":"Um, so the problem is it gets even worse when you generate for longer."},{"from":3722.3,"to":3724.18,"location":2,"content":"When you generate a long, a long text,"},{"from":3724.18,"to":3728.76,"location":2,"content":"then it will mostly just stay on the same idea without moving forward with new ideas."},{"from":3728.76,"to":3731.5,"location":2,"content":"Okay. So I'm gonna skip forward a lot and,"},{"from":3731.5,"to":3733.51,"location":2,"content":"uh, sorry, ought to have planned better."},{"from":3733.51,"to":3735.16,"location":2,"content":"There's a lot of information here which you wanna check"},{"from":3735.16,"to":3737.34,"location":2,"content":"out about poetry generation and other things."},{"from":3737.34,"to":3739.69,"location":2,"content":"I'm going to skip ahead because I want to get to"},{"from":3739.69,"to":3743.32,"location":2,"content":"the NLG evaluation section because that's pretty important."},{"from":3743.32,"to":3747.66,"location":2,"content":"So, um, we've talked about Automatic Evaluation Metrics fr NLG,"},{"from":3747.66,"to":3750.76,"location":2,"content":"and we know that these words overlap based metrics, such as BLEU,"},{"from":3750.76,"to":3752.16,"location":2,"content":"and ROUGE, and METEOR, uh,"},{"from":3752.16,"to":3754.36,"location":2,"content":"we know they're not ideal for machine translation."},{"from":3754.36,"to":3757.78,"location":2,"content":"Ah, they're kind of even worse for summarization mostly"},{"from":3757.78,"to":3761.77,"location":2,"content":"because summarization is even more open-ended than machine translation."},{"from":3761.77,"to":3764.17,"location":2,"content":"And that means that having this kind of rigid notion,"},{"from":3764.17,"to":3765.84,"location":2,"content":"if you've got to match the N-grams,"},{"from":3765.84,"to":3767.38,"location":2,"content":"is even less useful."},{"from":3767.38,"to":3769.93,"location":2,"content":"And then for something even more open-ended like dialogue,"},{"from":3769.93,"to":3771.58,"location":2,"content":"then it's just kind of a disaster."},{"from":3771.58,"to":3774.22,"location":2,"content":"It's not even a metric that gives you a good signal at all,"},{"from":3774.22,"to":3778.05,"location":2,"content":"and this also applies to anything else open-ended, like story 
generation."},{"from":3778.05,"to":3781.22,"location":2,"content":"So it's been shown, and you can check out the paper at the bottom,"},{"from":3781.22,"to":3785.13,"location":2,"content":"that word overlap metrics are just not a good fit for dialogue."},{"from":3785.13,"to":3787.48,"location":2,"content":"So the orange box is showing you, uh,"},{"from":3787.48,"to":3793.86,"location":2,"content":"some plots of the correlation between human score on a dialog class and BLEU-2,"},{"from":3793.86,"to":3795.41,"location":2,"content":"some variation of BLEU."},{"from":3795.41,"to":3798.49,"location":2,"content":"And the prob- the problem here is you're not seeing much of a correlation at all, right?"},{"from":3798.49,"to":3801.42,"location":2,"content":"It seems that particularly on this dialogue setting, ah,"},{"from":3801.42,"to":3803.37,"location":2,"content":"the correlation between the BLEU metric and"},{"from":3803.37,"to":3806.57,"location":2,"content":"the human judgment of whether it's a good dialogue response is,"},{"from":3806.57,"to":3808.02,"location":2,"content":"uh, the correlation is- I mean,"},{"from":3808.02,"to":3809.04,"location":2,"content":"it looks kind of non-existent."},{"from":3809.04,"to":3810.72,"location":2,"content":"It's at least very weak."},{"from":3810.72,"to":3815.12,"location":2,"content":"So that's pretty unfortunate and there's some other papers that show much the same thing."},{"from":3815.12,"to":3816.64,"location":2,"content":"So you might think, \"Well,"},{"from":3816.64,"to":3818.92,"location":2,"content":"what other automatic metrics can we use?"},{"from":3818.92,"to":3820.6,"location":2,"content":"\"What about perplexity?"},{"from":3820.6,"to":3825.82,"location":2,"content":"Um, so perplexity certainly captures how powerful your language model is,"},{"from":3825.82,"to":3828.09,"location":2,"content":"but it doesn't tell you anything about generation."},{"from":3828.09,"to":3831.97,"location":2,"content":"So for example, if your deca- decoding algorithm is bad in some way,"},{"from":3831.97,"to":3834.7,"location":2,"content":"then perplexity is not gonna tell you anything about that, right?"},{"from":3834.7,"to":3837.64,"location":2,"content":"Because decoding is something you apply to your trained language model."},{"from":3837.64,"to":3840.13,"location":2,"content":"Perplexity can tell if you've got a strong language model or not,"},{"from":3840.13,"to":3841.84,"location":2,"content":"but it's not gonna tell you, um,"},{"from":3841.84,"to":3844.16,"location":2,"content":"necessarily how good your generation is."},{"from":3844.16,"to":3847.33,"location":2,"content":"So some other thoughts you might have about automatic evaluation are,"},{"from":3847.33,"to":3849.46,"location":2,"content":"well, what about word embedding based metrics?"},{"from":3849.46,"to":3852.14,"location":2,"content":"Uh, so the main idea with word embedding based metrics,"},{"from":3852.14,"to":3854.51,"location":2,"content":"uh, you want to compute the similarity of the,"},{"from":3854.51,"to":3858.22,"location":2,"content":"the word embeddings or maybe the average of the word embeddings across a sentence,"},{"from":3858.22,"to":3860.53,"location":2,"content":"not just the overlap of the words themselves."},{"from":3860.53,"to":3862.78,"location":2,"content":"Um, so the idea is that rather than just being"},{"from":3862.78,"to":3865.51,"location":2,"content":"very strict and saying only the exact same word counts,"},{"from":3865.51,"to":3868.89,"location":2,"content":"you say, 
\"Well, if the words are similar and in word embedding space, then they count.\""},{"from":3868.89,"to":3871.9,"location":2,"content":"So this is certainly more flexible, but unfortunately, uh,"},{"from":3871.9,"to":3874.36,"location":2,"content":"the same paper I showed before shows that this doesn't"},{"from":3874.36,"to":3877.32,"location":2,"content":"correlate well either with human judgments of quality,"},{"from":3877.32,"to":3879.95,"location":2,"content":"at least for the- the dialogue task they are looking at."},{"from":3879.95,"to":3883.28,"location":2,"content":"So here, the middle column is showing the correlation between human,"},{"from":3883.28,"to":3887.51,"location":2,"content":"judgments, and some kind of average of word embedding based metric."},{"from":3887.51,"to":3889.54,"location":2,"content":"So, um, yeah, that doesn't look great either,"},{"from":3889.54,"to":3891.4,"location":2,"content":"not a great correlation."},{"from":3891.4,"to":3894.97,"location":2,"content":"So if we have no automatic metrics to adequately"},{"from":3894.97,"to":3898.43,"location":2,"content":"capture overall quality for natural language generation,"},{"from":3898.43,"to":3900.46,"location":2,"content":"um, what, what can we do instead?"},{"from":3900.46,"to":3902.59,"location":2,"content":"So I think often the strategy is,"},{"from":3902.59,"to":3906.28,"location":2,"content":"you end up defining some more kind of focused automatic metrics to"},{"from":3906.28,"to":3910.7,"location":2,"content":"capture the particular aspects of the generated text that you might be interested in."},{"from":3910.7,"to":3913.64,"location":2,"content":"Um, so for example, you might be interested in, uh, fluency,"},{"from":3913.64,"to":3915.54,"location":2,"content":"and you can compute that by just kind of running"},{"from":3915.54,"to":3918.73,"location":2,"content":"a well-trained language model over your text and generating the probability,"},{"from":3918.73,"to":3923.51,"location":2,"content":"and that's kind of a proxy for how well it's written, you know, good, fluent, grammatical text."},{"from":3923.51,"to":3928,"location":2,"content":"Um, if you're particularly interested in maybe generating text in a particular style,"},{"from":3928,"to":3929.86,"location":2,"content":"then you could ta- take a language model that's"},{"from":3929.86,"to":3932.18,"location":2,"content":"trained on the corpus representing that style,"},{"from":3932.18,"to":3935.11,"location":2,"content":"and now the probability tells you not only is it a good text,"},{"from":3935.11,"to":3936.68,"location":2,"content":"but is it in the right style."},{"from":3936.68,"to":3938.88,"location":2,"content":"Um, there are some other things as well that are like,"},{"from":3938.88,"to":3941.18,"location":2,"content":"you know, diversity, um,"},{"from":3941.18,"to":3943.9,"location":2,"content":"and you can can that pretty easily by just having some statistics about,"},{"from":3943.9,"to":3945.94,"location":2,"content":"you know, how much you're using rare words."},{"from":3945.94,"to":3948.25,"location":2,"content":"Um, relevance to input,"},{"from":3948.25,"to":3950.71,"location":2,"content":"you can kind of compute a similarity score with the input,"},{"from":3950.71,"to":3952.55,"location":2,"content":"and there are just some simple things like, you know,"},{"from":3952.55,"to":3955.48,"location":2,"content":"length and repetition that you surely can count, and yes,"},{"from":3955.48,"to":3958.07,"location":2,"content":"it doesn't tell 
you the overall quality,"},{"from":3958.07,"to":3960.53,"location":2,"content":"but these things are worth measuring."},{"from":3960.53,"to":3962.41,"location":2,"content":"So I think my main point is that yes,"},{"from":3962.41,"to":3964.96,"location":2,"content":"we have a really difficult situation with NLG evaluation."},{"from":3964.96,"to":3966.4,"location":2,"content":"There's no kind of overall metric"},{"from":3966.4,"to":3968.86,"location":2,"content":"that captures this overall quality."},{"from":3968.86,"to":3971.36,"location":2,"content":"Um, but if you measure lots of these things,"},{"from":3971.36,"to":3976.07,"location":2,"content":"then they certainly can help you track some important things that you should know."},{"from":3976.07,"to":3981.89,"location":2,"content":"So we talked about how automatic evaluation metrics for NLG are really tough."},{"from":3981.89,"to":3983.71,"location":2,"content":"So let's talk about human evaluation."},{"from":3983.71,"to":3987.4,"location":2,"content":"Uh, human judgments are regarded as the gold standard, right?"},{"from":3987.4,"to":3990.86,"location":2,"content":"But we already know that human evaluation is slow and expensive,"},{"from":3990.86,"to":3994.16,"location":2,"content":"uh, but are those the only problems with human eval?"},{"from":3994.16,"to":3996.91,"location":2,"content":"Let's suppose that you do have access, uh, to,"},{"from":3996.91,"to":4000.06,"location":2,"content":"let's say, the time or money you need to do human evaluations."},{"from":4000.06,"to":4002.11,"location":2,"content":"Um, does that solve all your problems?"},{"from":4002.11,"to":4003.48,"location":2,"content":"Suppose you have unlimited human eval,"},{"from":4003.48,"to":4004.98,"location":2,"content":"does that actually solve your problems?"},{"from":4004.98,"to":4007.59,"location":2,"content":"And my answer is, uh, no."},{"from":4007.59,"to":4009.63,"location":2,"content":"And this is kinda from personal experience."},{"from":4009.63,"to":4013.59,"location":2,"content":"Um, conducting human evaluation in itself is very difficult to get right."},{"from":4013.59,"to":4017.28,"location":2,"content":"It's not easy at all, and this is partially because humans do a lot of weird things."},{"from":4017.28,"to":4019.91,"location":2,"content":"Humans, uh, unlike a metric, uh,"},{"from":4019.91,"to":4022.13,"location":2,"content":"an automatic metric, they're inconsistent,"},{"from":4022.13,"to":4023.67,"location":2,"content":"they could be illogical."},{"from":4023.67,"to":4025.29,"location":2,"content":"Sometimes, they just get bored of your task,"},{"from":4025.29,"to":4026.76,"location":2,"content":"and they don't really pay attention anymore."},{"from":4026.76,"to":4029.58,"location":2,"content":"Uh, they can misinterpret the question you asked,"},{"from":4029.58,"to":4032.4,"location":2,"content":"and sometimes they do things they can't really explain why they did it."},{"from":4032.4,"to":4034.44,"location":2,"content":"So, um, as a kind of case study of"},{"from":4034.44,"to":4036.54,"location":2,"content":"this I'm going to tell you about, um,"},{"from":4036.54,"to":4038.01,"location":2,"content":"a project I did where I was,"},{"from":4038.01,"to":4039.54,"location":2,"content":"uh, building some chatbots,"},{"from":4039.54,"to":4043.49,"location":2,"content":"and it turned out that the human evaluation was kind of the hardest part of the project."},{"from":4043.49,"to":4046.23,"location":2,"content":"So I was trying to build these 
chatbots for the Persona-Chat data"},{"from":4046.23,"to":4049.64,"location":2,"content":"set and in particular investigating controllability."},{"from":4049.64,"to":4052.88,"location":2,"content":"So we're trying to control aspects of the generated texts such as, you know,"},{"from":4052.88,"to":4054.05,"location":2,"content":"whether you repeat yourself,"},{"from":4054.05,"to":4055.39,"location":2,"content":"how generic you are,"},{"from":4055.39,"to":4057.61,"location":2,"content":"kind of these same problems that we noted before."},{"from":4057.61,"to":4060.18,"location":2,"content":"So we built these models that control, you know,"},{"from":4060.18,"to":4062.09,"location":2,"content":"specificity of what we're saying and"},{"from":4062.09,"to":4064.74,"location":2,"content":"how related what we're saying is to what the user said."},{"from":4064.74,"to":4066.09,"location":2,"content":"So here you can see that,"},{"from":4066.09,"to":4068.88,"location":2,"content":"you know, uh, our partner said something like, \"Yes,"},{"from":4068.88,"to":4071.74,"location":2,"content":"I'm studying law at the moment,\" and we can kind of control-"},{"from":4071.74,"to":4074.72,"location":2,"content":"turn this control knob that makes us say something very generic like,"},{"from":4074.72,"to":4077.01,"location":2,"content":"\"Oh,\" and then like 20 dots or something"},{"from":4077.01,"to":4079.47,"location":2,"content":"just completely bonkers that's just all the rare words you know."},{"from":4079.47,"to":4081.51,"location":2,"content":"And there's like a sweet- a sweet spot in between where you say,"},{"from":4081.51,"to":4083.95,"location":2,"content":"\"That sounds like a lot of fun. How long have you been studying?\""},{"from":4083.95,"to":4086.95,"location":2,"content":"And then similarly, we have a knob we can turn to,"},{"from":4086.95,"to":4091.26,"location":2,"content":"uh, determine how semantically related what we say is to what, what they said."},{"from":4091.26,"to":4093.54,"location":2,"content":"So, um, you know, that's kind of interesting."},{"from":4093.54,"to":4096.61,"location":2,"content":"It's, it's a way to control the output of the, uh, NLG system."},{"from":4096.61,"to":4099.52,"location":2,"content":"But actually, I want to tell you about how the human evaluation was so difficult,"},{"from":4099.52,"to":4102.89,"location":2,"content":"so we have these systems that we wanted to evaluate using human eval."},{"from":4102.89,"to":4106.23,"location":2,"content":"So the question is, how do you ask for the human quality judgments here?"},{"from":4106.23,"to":4109.8,"location":2,"content":"Uh, you can ask kind of simple overall quality questions,"},{"from":4109.8,"to":4111.98,"location":2,"content":"like, you know, how well does the conversation go?"},{"from":4111.98,"to":4113.67,"location":2,"content":"Was- was the user engaging?"},{"from":4113.67,"to":4114.99,"location":2,"content":"Um, or maybe comparative,"},{"from":4114.99,"to":4118.6,"location":2,"content":"Which of these users gave the best response? 
Uh, questions like this."},{"from":4118.6,"to":4120.33,"location":2,"content":"And, you know, we tried a lot of them,"},{"from":4120.33,"to":4123,"location":2,"content":"but there were just major problems with all of them."},{"from":4123,"to":4126.96,"location":2,"content":"Like, these questions are necessarily very subjective and also,"},{"from":4126.96,"to":4129.15,"location":2,"content":"the different respondents have different expectations,"},{"from":4129.15,"to":4130.62,"location":2,"content":"and this affects their judgments."},{"from":4130.62,"to":4133.69,"location":2,"content":"So for example, if you ask, do you think this user is a human or a bot?"},{"from":4133.69,"to":4135.65,"location":2,"content":"Then, well, that depends entirely on"},{"from":4135.65,"to":4140.22,"location":2,"content":"the respondent's knowledge of bots or opinion of bots and what they think they can do."},{"from":4140.22,"to":4143.94,"location":2,"content":"Another example is, you'd have kind of catastrophic misunderstanding of the question."},{"from":4143.94,"to":4145.14,"location":2,"content":"So for example, if we ask,"},{"from":4145.14,"to":4147.68,"location":2,"content":"was this user- was this chatbot engaging?"},{"from":4147.68,"to":4149.48,"location":2,"content":"Then someone responded saying, \"Yup,"},{"from":4149.48,"to":4151.02,"location":2,"content":"it was engaging because it always wrote back\","},{"from":4151.02,"to":4152.31,"location":2,"content":"which clearly isn't what we meant."},{"from":4152.31,"to":4154.6,"location":2,"content":"We meant like are they an engaging conversation partner,"},{"from":4154.6,"to":4156.77,"location":2,"content":"but they took a very literal assumption,"},{"from":4156.77,"to":4159.07,"location":2,"content":"uh, of, of what engaging means."},{"from":4159.07,"to":4162.98,"location":2,"content":"So the problem here is that overall quality depends on many underlying factors,"},{"from":4162.98,"to":4165.39,"location":2,"content":"and it's pretty hard to kind of find a single,"},{"from":4165.39,"to":4168.72,"location":2,"content":"overall question that captures just overall quality."},{"from":4168.72,"to":4171.03,"location":2,"content":"So we ended up doing this, we ended up breaking this down"},{"from":4171.03,"to":4174.27,"location":2,"content":"into lots more kind of factors of quality."},{"from":4174.27,"to":4176.2,"location":2,"content":"So, uh, the way we saw it is that,"},{"from":4176.2,"to":4179.82,"location":2,"content":"you have maybe these kind of overall measures of quality of the chatbot,"},{"from":4179.82,"to":4181.38,"location":2,"content":"such as how engaging was it,"},{"from":4181.38,"to":4183.14,"location":2,"content":"how enjoyable was it to talk to,"},{"from":4183.14,"to":4185.69,"location":2,"content":"and kind of maybe how convincing was it that it was human."},{"from":4185.69,"to":4187.23,"location":2,"content":"And then below those,"},{"from":4187.23,"to":4189.81,"location":2,"content":"we kind of broke those down into these more low level, uh,"},{"from":4189.81,"to":4191.69,"location":2,"content":"components of quality such as,"},{"from":4191.69,"to":4193.29,"location":2,"content":"you know, uh, were you interesting?"},{"from":4193.29,"to":4195.15,"location":2,"content":"Were you li- showing that you were listening?"},{"from":4195.15,"to":4196.94,"location":2,"content":"Were you asking enough questions and so on?"},{"from":4196.94,"to":4199.62,"location":2,"content":"And then below that, we had these kind of controllable attributes 
which"},{"from":4199.62,"to":4202.47,"location":2,"content":"were the knobs that we were turning and then the goal was to figure out,"},{"from":4202.47,"to":4204.52,"location":2,"content":"um, how these things affected the output."},{"from":4204.52,"to":4209.74,"location":2,"content":"Um, so let's see."},{"from":4209.74,"to":4213.12,"location":2,"content":"Um, so we had a bunch of findings here, and I think,"},{"from":4213.12,"to":4216.44,"location":2,"content":"maybe the ones which I will highlight were,"},{"from":4216.44,"to":4218.06,"location":2,"content":"uh, these two kind of in the middle."},{"from":4218.06,"to":4219.98,"location":2,"content":"So the overall metric engagingness,"},{"from":4219.98,"to":4223.09,"location":2,"content":"which means enjoyment, that was really easy to maximize."},{"from":4223.09,"to":4224.42,"location":2,"content":"It turned out, uh,"},{"from":4224.42,"to":4228.3,"location":2,"content":"our bots managed to get near human performance in terms of engagingness."},{"from":4228.3,"to":4230.73,"location":2,"content":"Um, but the overall metric humanness,"},{"from":4230.73,"to":4232.44,"location":2,"content":"that is the kind of Turing test metric,"},{"from":4232.44,"to":4234.4,"location":2,"content":"that was not at all easy to maximize."},{"from":4234.4,"to":4235.92,"location":2,"content":"All of our bots were way,"},{"from":4235.92,"to":4238.02,"location":2,"content":"way below humans in terms of humanness, right?"},{"from":4238.02,"to":4240.3,"location":2,"content":"So we were not at all convincing of being human,"},{"from":4240.3,"to":4241.78,"location":2,"content":"and this is kind of interesting, right?"},{"from":4241.78,"to":4244.03,"location":2,"content":"Like, we were as enjoyable as talk to as humans,"},{"from":4244.03,"to":4246.63,"location":2,"content":"but we were clearly not human, right?"},{"from":4246.63,"to":4250.24,"location":2,"content":"So like, humanness is not the same thing as conversational quality."},{"from":4250.24,"to":4252.61,"location":2,"content":"And one of the interesting things we found in this,"},{"from":4252.61,"to":4255.39,"location":2,"content":"um, study, where we not only evaluated our chatbots,"},{"from":4255.39,"to":4257.65,"location":2,"content":"we also actually got humans to evaluate each other,"},{"from":4257.65,"to":4260.93,"location":2,"content":"was that, um, humans are sub-optimal conversationalists."},{"from":4260.93,"to":4265.17,"location":2,"content":"Uh, they scored pretty poorly on interestingness, fluency, listening."},{"from":4265.17,"to":4266.9,"location":2,"content":"They didn't ask each other enough questions,"},{"from":4266.9,"to":4269.46,"location":2,"content":"and this is kind of the reason why we managed to like approach"},{"from":4269.46,"to":4273.15,"location":2,"content":"human performance in kind of enjoyableness to talk to you because we just,"},{"from":4273.15,"to":4276.5,"location":2,"content":"for example, turned up the question asking knob, asked more questions,"},{"from":4276.5,"to":4279.88,"location":2,"content":"and people responded really well to that because people like talking about themselves."},{"from":4279.88,"to":4282.29,"location":2,"content":"So, um, yeah."},{"from":4282.29,"to":4283.61,"location":2,"content":"I think this is kind of interesting, right?"},{"from":4283.61,"to":4286.82,"location":2,"content":"Because it shows that there is no obvious just one question to ask, right?"},{"from":4286.82,"to":4287.99,"location":2,"content":"Because if you just seemed, 
\"Oh,"},{"from":4287.99,"to":4291.87,"location":2,"content":"the one question to ask is clearly engagingness or it's clearly humanness,"},{"from":4291.87,"to":4295.36,"location":2,"content":"then we would have gotten completely different reads on how well we were doing, right?"},{"from":4295.36,"to":4301.73,"location":2,"content":"Whereas asking these multiple questions kind of gives you more of an overview."},{"from":4301.73,"to":4306.78,"location":2,"content":"I am going to skip this just because there's not a lot of time."},{"from":4306.78,"to":4308.6,"location":2,"content":"Okay. So, here's the final section."},{"from":4308.6,"to":4311.67,"location":2,"content":"Uh, this is my kind of wrap-up thoughts on NLG research,"},{"from":4311.67,"to":4314.34,"location":2,"content":"the current trends and where we're going in the future."},{"from":4314.34,"to":4319.02,"location":2,"content":"So, here's kind of three exciting current trends to identify in NLG."},{"from":4319.02,"to":4320.98,"location":2,"content":"And of course your mileage may vary,"},{"from":4320.98,"to":4322.86,"location":2,"content":"you might think that other things are more interesting."},{"from":4322.86,"to":4325.11,"location":2,"content":"So, uh, the ones which I was thinking about, are"},{"from":4325.11,"to":4328.64,"location":2,"content":"firstly incorporating discrete latent variables into NLG."},{"from":4328.64,"to":4331.14,"location":2,"content":"Um, so, you should go check out"},{"from":4331.14,"to":4333.59,"location":2,"content":"the slides I skipped over because there were some examples of this."},{"from":4333.59,"to":4336.96,"location":2,"content":"But the idea is that with some tasks such as for example"},{"from":4336.96,"to":4339.36,"location":2,"content":"storytelling or task oriented dialogue"},{"from":4339.36,"to":4341.1,"location":2,"content":"where you're trying to actually get something done."},{"from":4341.1,"to":4342.81,"location":2,"content":"Um, you probably want a more kind of"},{"from":4342.81,"to":4345.54,"location":2,"content":"concrete hard notion of the things that you're talking about"},{"from":4345.54,"to":4350.16,"location":2,"content":"like you know, entities and people and events and negotiation and so on."},{"from":4350.16,"to":4353.81,"location":2,"content":"So, uh, there's, there's mentioning what kind of modeling"},{"from":4353.81,"to":4358.86,"location":2,"content":"these discrete latent variables inside these continuous, uh, NLG methods."},{"from":4358.86,"to":4362.52,"location":2,"content":"The second one is alternatives to strict left to right generation."},{"from":4362.52,"to":4364.44,"location":2,"content":"And I'm really sorry [LAUGHTER] I skipped over so many things."},{"from":4364.44,"to":4367.02,"location":2,"content":"Um, so, there's some interesting work recently in trying"},{"from":4367.02,"to":4369.81,"location":2,"content":"to generate text in ways other than left to right."},{"from":4369.81,"to":4371.16,"location":2,"content":"So, for example there's some kind of"},{"from":4371.16,"to":4375.73,"location":2,"content":"parallel generation stuff or maybe writing something and iteratively refining it, uh,"},{"from":4375.73,"to":4379.81,"location":2,"content":"there's also the idea of kind of top-down generation, um, for"},{"from":4379.81,"to":4382.8,"location":2,"content":"especially longer pieces of text like maybe tried to decide the contents"},{"from":4382.8,"to":4386.39,"location":2,"content":"of each of the sentences separately before uh, writing the 
words."},{"from":4386.39,"to":4388.62,"location":2,"content":"And then a third one is like"},{"from":4388.62,"to":4391.53,"location":2,"content":"alternatives to maximum likelihood training with teacher forcing."},{"from":4391.53,"to":4394.32,"location":2,"content":"So, to remind you, a maximum likelihood training with teacher forcing is"},{"from":4394.32,"to":4396.42,"location":2,"content":"just the standard method of training"},{"from":4396.42,"to":4399.21,"location":2,"content":"a language model that we've been telling you about in the class so far."},{"from":4399.21,"to":4400.76,"location":2,"content":"Um, so, you know,"},{"from":4400.76,"to":4403.2,"location":2,"content":"there's some interesting work on looking at more kind of holistic,"},{"from":4403.2,"to":4405.73,"location":2,"content":"um, sentence level rather than word level objectives."},{"from":4405.73,"to":4407.55,"location":2,"content":"Uh, so, unfortunately I ran out of time with"},{"from":4407.55,"to":4409.86,"location":2,"content":"this slide, and I didn't have time to put the references in but I will"},{"from":4409.86,"to":4411.99,"location":2,"content":"put the references in later and it"},{"from":4411.99,"to":4414.94,"location":2,"content":"will be on the course website so you can go check them out later."},{"from":4414.94,"to":4419.82,"location":2,"content":"Okay. So, as a kind of overview, NLG research, where are we and where are we going?"},{"from":4419.82,"to":4421.95,"location":2,"content":"Um, so my metaphor is I think that"},{"from":4421.95,"to":4426.21,"location":2,"content":"about five years ago NLP and deep learning research was a kind of a Wild West."},{"from":4426.21,"to":4430.77,"location":2,"content":"Right? Like everything was new and um, we were unsure,"},{"from":4430.77,"to":4432.3,"location":2,"content":"NLP research weren't sure what kind of what"},{"from":4432.3,"to":4435.66,"location":2,"content":"the new research landscape was because uh, you know,"},{"from":4435.66,"to":4438.65,"location":2,"content":"uh, neural methods kind of changed machine translation a lot,"},{"from":4438.65,"to":4441.68,"location":2,"content":"looked like they might change other areas but it was uncertain how much."},{"from":4441.68,"to":4444.69,"location":2,"content":"Um, but these days you know five years later,"},{"from":4444.69,"to":4446.48,"location":2,"content":"um, it's a lot less wild."},{"from":4446.48,"to":4449.13,"location":2,"content":"I'd say, you know things are settled down a lot kind of"},{"from":4449.13,"to":4453.14,"location":2,"content":"standard practices have emerged and sure there's still a lot of things changing."},{"from":4453.14,"to":4455.24,"location":2,"content":"Um, but you know there's more people in the community,"},{"from":4455.24,"to":4456.5,"location":2,"content":"there's more standard practices,"},{"from":4456.5,"to":4458.24,"location":2,"content":"we have things like TensorFlow and PyTorch."},{"from":4458.24,"to":4460.09,"location":2,"content":"So, you don't have to take up gradients anymore."},{"from":4460.09,"to":4462.66,"location":2,"content":"So, I'd say things are a lot less wild now"},{"from":4462.66,"to":4466.37,"location":2,"content":"but I would say NLG does seem to be one of the wildest parts"},{"from":4466.37,"to":4469.88,"location":2,"content":"remaining and part of the reasons for that is because of"},{"from":4469.88,"to":4473.91,"location":2,"content":"the lack of evaluation metrics that makes it so difficult to tell what we're 
doing."},{"from":4473.91,"to":4477.26,"location":2,"content":"It's, uh, quite difficult to identify like what are the main methods that are"},{"from":4477.26,"to":4481.88,"location":2,"content":"working when we don't have any metrics that can clearly tell us what's going on."},{"from":4481.88,"to":4484.71,"location":2,"content":"So, another thing that I'm really glad to see is that"},{"from":4484.71,"to":4487.83,"location":2,"content":"the neural NLG community is rapidly expanding."},{"from":4487.83,"to":4491.04,"location":2,"content":"Um, so, in the early years, uh,"},{"from":4491.04,"to":4495.39,"location":2,"content":"people were mostly transferring successful NMT methods to various NLG tasks."},{"from":4495.39,"to":4498.87,"location":2,"content":"Uh, but now I'm seeing you know, increasingly more inventive NLG techniques"},{"from":4498.87,"to":4502.73,"location":2,"content":"merging which is specific to the non-NMT generation settings."},{"from":4502.73,"to":4505.85,"location":2,"content":"Um, and again I urge you to go back into the slides that I skipped."},{"from":4505.85,"to":4508.65,"location":2,"content":"Um, so, I'm also saying there's increasingly more kind of"},{"from":4508.65,"to":4511.59,"location":2,"content":"neural NLG workshops and competitions especially"},{"from":4511.59,"to":4514.47,"location":2,"content":"focusing on open-ended NLG like those tasks that we"},{"from":4514.47,"to":4518.06,"location":2,"content":"know are not well suited by the automatic metrics that work for NMT."},{"from":4518.06,"to":4522.72,"location":2,"content":"So, there's a neural generation workshop, a storytelling workshop uh,"},{"from":4522.72,"to":4526.47,"location":2,"content":"and various challenges as well where people enter their for example, um,"},{"from":4526.47,"to":4528.87,"location":2,"content":"conversational dialogue agents to be,"},{"from":4528.87,"to":4531.49,"location":2,"content":"um, evaluated against each other."},{"from":4531.49,"to":4533.52,"location":2,"content":"So, I think that these different, um,"},{"from":4533.52,"to":4535.35,"location":2,"content":"kind of community organizing workshops and"},{"from":4535.35,"to":4538.71,"location":2,"content":"competitions are really doing a great job to kind of organize a community,"},{"from":4538.71,"to":4544.08,"location":2,"content":"increase reproducibility and standard evaluate, standardized evaluation."},{"from":4544.08,"to":4546.3,"location":2,"content":"Um, so, this is great but I'd say"},{"from":4546.3,"to":4550.23,"location":2,"content":"the biggest roadblock to progress is definitely still evaluation."},{"from":4550.23,"to":4553.31,"location":2,"content":"Okay. 
So, the last thing that I want to share with you"},{"from":4553.31,"to":4556.26,"location":2,"content":"is eight things that I've learned from working in NLG."},{"from":4556.26,"to":4558.93,"location":2,"content":"So, the first one is the more open-ended the task,"},{"from":4558.93,"to":4560.53,"location":2,"content":"the harder everything becomes."},{"from":4560.53,"to":4563.65,"location":2,"content":"Evaluation becomes harder, defining what you're doing becomes harder,"},{"from":4563.65,"to":4565.9,"location":2,"content":"telling when you're doing a good job becomes harder."},{"from":4565.9,"to":4569.13,"location":2,"content":"So, for this reason constraints can sometimes be welcome."},{"from":4569.13,"to":4575.37,"location":2,"content":"So, if you decide to constrain your task then sometimes it's easier to, to complete it."},{"from":4575.37,"to":4579.12,"location":2,"content":"Uh, the next one is that aiming for a specific improvement can"},{"from":4579.12,"to":4582.68,"location":2,"content":"often be more manageable than aiming to improve overall generation quality."},{"from":4582.68,"to":4585.28,"location":2,"content":"So, for example, if you decide that you want to,"},{"from":4585.28,"to":4587.86,"location":2,"content":"well, for example, increase diversity for your model, like say"},{"from":4587.86,"to":4591.27,"location":2,"content":"more interesting things, that's an easier thing to achieve and measure than just"},{"from":4591.27,"to":4595.88,"location":2,"content":"saying we want to do overall generation quality because of the evaluation problem."},{"from":4595.88,"to":4600.28,"location":2,"content":"The next one is if you're using your language model to do NLG,"},{"from":4600.28,"to":4604.86,"location":2,"content":"then improving the language model, that is, getting better perplexity, will give you"},{"from":4604.86,"to":4606.96,"location":2,"content":"probably better generation quality because you've got"},{"from":4606.96,"to":4611.22,"location":2,"content":"a stronger language model but it's not the only way to improve generation quality,"},{"from":4611.22,"to":4613.06,"location":2,"content":"as we talked about before, uh,"},{"from":4613.06,"to":4616.99,"location":2,"content":"there are also other components that can affect generation apart from just the language model,"},{"from":4616.99,"to":4620.34,"location":2,"content":"and part of the problem is that that's not in the training objective."},{"from":4620.34,"to":4623.65,"location":2,"content":"Um, my next tip is that you should look at your output a lot,"},{"from":4623.65,"to":4627.15,"location":2,"content":"partially because you don't have any single metric that can tell you what's going on."},{"from":4627.15,"to":4630.27,"location":2,"content":"It's pretty important to look at your output a lot to form your own opinions."},{"from":4630.27,"to":4632.74,"location":2,"content":"It can be time consuming but it's probably worth doing."},{"from":4632.74,"to":4634.57,"location":2,"content":"I ended up talking to these chatbots"},{"from":4634.57,"to":4637.44,"location":2,"content":"a huge amount during the time that I was working on the project."},{"from":4637.44,"to":4641.64,"location":2,"content":"Okay. 
Almost done, so, five, you need an automatic metric, even if it's imperfect."},{"from":4641.64,"to":4643.05,"location":2,"content":"So, I know that you already know this because we"},{"from":4643.05,"to":4645.14,"location":2,"content":"wrote it all over the project instructions."},{"from":4645.14,"to":4649.2,"location":2,"content":"Uh, but I'd probably amend that to like maybe you need several automatic metrics."},{"from":4649.2,"to":4650.76,"location":2,"content":"I talked earlier about how you might track"},{"from":4650.76,"to":4653.44,"location":2,"content":"multiple things to get an overall picture of what's going on,"},{"from":4653.44,"to":4656.1,"location":2,"content":"I'd say the more open-ended your NLG task is,"},{"from":4656.1,"to":4659.18,"location":2,"content":"the more likely you probably need several metrics."},{"from":4659.18,"to":4663.19,"location":2,"content":"If you do human eval, you want to make the questions as focused as possible."},{"from":4663.19,"to":4665.1,"location":2,"content":"So, as I found out the hard way if you"},{"from":4665.1,"to":4667.78,"location":2,"content":"define the question as a very kind of overall vague thing,"},{"from":4667.78,"to":4669.89,"location":2,"content":"then you're just opening yourself up to, um,"},{"from":4669.89,"to":4672.98,"location":2,"content":"the respondents kind of misunderstanding you and, uh,"},{"from":4672.98,"to":4674.67,"location":2,"content":"if they are doing that then it's actually not their fault,"},{"from":4674.67,"to":4677.48,"location":2,"content":"it's your fault and you need to fix your questions and that's what I learned."},{"from":4677.48,"to":4679.86,"location":2,"content":"Uh, next thing is reproducibility is"},{"from":4679.86,"to":4683.58,"location":2,"content":"a huge problem in today's NLP and deep learning in general,"},{"from":4683.58,"to":4686.13,"location":2,"content":"and the problem is only bigger in NLG,"},{"from":4686.13,"to":4688.38,"location":2,"content":"I guess it's another way that it's still a wild west."},{"from":4688.38,"to":4690.93,"location":2,"content":"So, I'd say that, uh, it would be really great,"},{"from":4690.93,"to":4693.3,"location":2,"content":"if everybody could publicly release all of"},{"from":4693.3,"to":4695.98,"location":2,"content":"their generated output when they write NLG papers."},{"from":4695.98,"to":4700.15,"location":2,"content":"I think this is a great practice because if you released your generated outputs,"},{"from":4700.15,"to":4703.88,"location":2,"content":"then if someone later let's say comes up with a great automatic metric,"},{"from":4703.88,"to":4707.98,"location":2,"content":"then they can just grab your generated output and then compute the metric on that."},{"from":4707.98,"to":4710.04,"location":2,"content":"Whereas if you never released your output or you"},{"from":4710.04,"to":4712.47,"location":2,"content":"released it with some kind of imperfect metric number,"},{"from":4712.47,"to":4715.02,"location":2,"content":"then future researchers have nothing to compare it against."},{"from":4715.02,"to":4718.57,"location":2,"content":"Uh, so lastly, my last thought"},{"from":4718.57,"to":4722.79,"location":2,"content":"about working in NLG is that it can be very frustrating sometimes,"},{"from":4722.79,"to":4725.74,"location":2,"content":"uh, because things can be difficult and it's hard to know when you're making progress."},{"from":4725.74,"to":4728.81,"location":2,"content":"But the upside is it can also be very 
funny."},{"from":4728.81,"to":4732.74,"location":2,"content":"So this my last slide, here are some bizarre conversations that I've had with my chatbot."},{"from":4732.74,"to":4734,"location":2,"content":"[LAUGHTER] Thanks."},{"from":4734,"to":4777,"location":2,"content":"[NOISE] [LAUGHTER] All right, thanks."}]} \ No newline at end of file diff --git a/bcc-en/16.bcc b/bcc-en/16.bcc new file mode 100644 index 0000000000000000000000000000000000000000..d243cc308dad59e73a31c1157cce17b5d8856d5f --- /dev/null +++ b/bcc-en/16.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.79,"to":9.66,"location":2,"content":"Hi everybody, time to get started."},{"from":9.66,"to":18.75,"location":2,"content":"Okay. Um, so, so today what we're gonna talk about is a topic that's, um,"},{"from":18.75,"to":23.75,"location":2,"content":"coreference resolution and I'll explain in just a minute what that is,"},{"from":23.75,"to":27.06,"location":2,"content":"um, but before getting on to that just a,"},{"from":27.06,"to":29.57,"location":2,"content":"uh, couple of words on the announcements."},{"from":29.57,"to":35.77,"location":2,"content":"Um, so the TAs are feverishly working on getting homework five grades worked out,"},{"from":35.77,"to":39.17,"location":2,"content":"so we hope that we can deliver those to you, um,"},{"from":39.17,"to":41.6,"location":2,"content":"tomorrow just in case you're anxious to know"},{"from":41.6,"to":44.75,"location":2,"content":"them before you make your final decisions about things."},{"from":44.75,"to":48.29,"location":2,"content":"And then, the other thing that you should be remembering"},{"from":48.29,"to":52.58,"location":2,"content":"is that the milestone for the final project is this Tuesday."},{"from":52.58,"to":55.78,"location":2,"content":"Now, I will confess that even to me it seems like,"},{"from":55.78,"to":58.92,"location":2,"content":"\"Boy, boy this milestone came around really quickly.\""},{"from":58.92,"to":61.68,"location":2,"content":"So you probably feel that doubly, I realize."},{"from":61.68,"to":66.53,"location":2,"content":"And so you know, I do apologize for that a little bit,"},{"from":66.53,"to":71.44,"location":2,"content":"but you know, really our hope was that we could actually use this to be helpful,"},{"from":71.44,"to":75.05,"location":2,"content":"and to give you feedback on what you're doing and suggestions,"},{"from":75.05,"to":77.12,"location":2,"content":"and it just really seemed like, well,"},{"from":77.12,"to":79.88,"location":2,"content":"the only chance in which we can kind of, um,"},{"from":79.88,"to":83.9,"location":2,"content":"turn around giving more feedback on the projects, um,"},{"from":83.9,"to":88.83,"location":2,"content":"before it goes into the final week of the quarter is if we can kind of get stuff,"},{"from":88.83,"to":90.71,"location":2,"content":"um, Tuesday, and hope to be then,"},{"from":90.71,"to":92.81,"location":2,"content":"sort of turning it around again by the end of the week."},{"from":92.81,"to":96.04,"location":2,"content":"So the hope is to help you not to just,"},{"from":96.04,"to":99.69,"location":2,"content":"um, create obstacles and roadblocks in your life."},{"from":99.69,"to":104.42,"location":2,"content":"Okay. 
So today what we're gonna do, um, is, uh,"},{"from":104.42,"to":107.75,"location":2,"content":"learn more about a linguistic topic for a change and learn"},{"from":107.75,"to":111.92,"location":2,"content":"some more stuff about what goes on in coreference resolution."},{"from":111.92,"to":114.19,"location":2,"content":"So first of all, I'm gonna talk about the task,"},{"from":114.19,"to":117.38,"location":2,"content":"and then go on to some of the kinds of models that people,"},{"from":117.38,"to":119.99,"location":2,"content":"um, do for coreference resolution."},{"from":119.99,"to":123.03,"location":2,"content":"So first of all, what is it?"},{"from":123.03,"to":128.85,"location":2,"content":"Um, so the idea of coreference resolution is that we have a text,"},{"from":128.85,"to":133.18,"location":2,"content":"\"Barack Obama nominated Hillary Rodham Clinton as his Secretary of State on"},{"from":133.18,"to":138.23,"location":2,"content":"Monday,\" and this text, like most texts, is about entities,"},{"from":138.23,"to":141.2,"location":2,"content":"where entities are commonly human beings,"},{"from":141.2,"to":146,"location":2,"content":"but they can be other things like gods or talking giraffes or whatever it is."},{"from":146,"to":148.58,"location":2,"content":"So it seems like we want to make,"},{"from":148.58,"to":152.22,"location":2,"content":"find where entities are mentioned."},{"from":152.22,"to":154.04,"location":2,"content":"So where entities are mentioned,"},{"from":154.04,"to":155.99,"location":2,"content":"they're referred to as mentions."},{"from":155.99,"to":159.5,"location":2,"content":"So things like Barack Obama and Secretary of State,"},{"from":159.5,"to":163.26,"location":2,"content":"he, her, they are mentions of entities."},{"from":163.26,"to":166.99,"location":2,"content":"And then, when we talk about coreference resolution,"},{"from":166.99,"to":170.27,"location":2,"content":"the task that we're wanting to do is say,"},{"from":170.27,"to":174.68,"location":2,"content":"which of these mentions refer to the same entity,"},{"from":174.68,"to":177.17,"location":2,"content":"the same real thing in the world."},{"from":177.17,"to":182.33,"location":2,"content":"So well, one entity that's mentioned in this text is Barack Obama,"},{"from":182.33,"to":186.92,"location":2,"content":"and then he's referred to later in the text as his and he,"},{"from":186.92,"to":192.43,"location":2,"content":"and so these three red noun phrases are all coreferent to each other."},{"from":192.43,"to":197.27,"location":2,"content":"And that then, refers to this real-world entity."},{"from":197.27,"to":201.11,"location":2,"content":"Um, and then, we have these references Hillary Rodham Clinton,"},{"from":201.11,"to":202.7,"location":2,"content":"Secretary of State, her,"},{"from":202.7,"to":208.44,"location":2,"content":"she, First Lady, they're all references to a different entity."},{"from":208.44,"to":211.52,"location":2,"content":"And so they all refer to this person."},{"from":211.52,"to":214.39,"location":2,"content":"And so those are examples of our coreference."},{"from":214.39,"to":220.75,"location":2,"content":"Um, in a way this is triv- sort of seems obvious to a human being,"},{"from":220.75,"to":222.63,"location":2,"content":"um, looking at things, um,"},{"from":222.63,"to":225.59,"location":2,"content":"but it can actually be kind of tricky and hard."},{"from":225.59,"to":230.48,"location":2,"content":"Um, so, um, I thought we could spend a few minutes 
doing"},{"from":230.48,"to":235.66,"location":2,"content":"interactive working out coreferents together so that you guys can,"},{"from":235.66,"to":237.98,"location":2,"content":"um, think about it all for a few minutes."},{"from":237.98,"to":241.01,"location":2,"content":"Um, so here's part of a little story."},{"from":241.01,"to":244.27,"location":2,"content":"Um, it's a story by Shruthi Rao called The Star."},{"from":244.27,"to":248.09,"location":2,"content":"Um, now, I confess that since this is a CS class,"},{"from":248.09,"to":250.25,"location":2,"content":"um, not a literature class,"},{"from":250.25,"to":251.84,"location":2,"content":"I did a little bit of, um,"},{"from":251.84,"to":255.05,"location":2,"content":"helpful editing of this text to make it shorter,"},{"from":255.05,"to":256.63,"location":2,"content":"so I could fit more of,"},{"from":256.63,"to":258.8,"location":2,"content":"what was going on, um,"},{"from":258.8,"to":262.14,"location":2,"content":"onto the page, um, but, um,"},{"from":262.14,"to":264.44,"location":2,"content":"everything that is a sort of a linguistic"},{"from":264.44,"to":267.3,"location":2,"content":"[inaudible] is something that comes from the original text."},{"from":267.3,"to":271.61,"location":2,"content":"Okay. So, um, in this text,"},{"from":271.61,"to":277.97,"location":2,"content":"um, who is the first entity that's mentioned?"},{"from":277.97,"to":281.1,"location":2,"content":"Vanaja, okay."},{"from":281.1,"to":282.57,"location":2,"content":"Okay. So it's Vanaja."},{"from":282.57,"to":285.25,"location":2,"content":"Now, where, let's do it forward."},{"from":285.25,"to":292.58,"location":2,"content":"Where else is Vanaja mentioned in this text?"},{"from":292.58,"to":294.72,"location":2,"content":"Her son, right?"},{"from":294.72,"to":296.46,"location":2,"content":"So this her not the son,"},{"from":296.46,"to":305.12,"location":2,"content":"but this her is a reference of Vanaja, right?"},{"from":305.12,"to":308.52,"location":2,"content":"Um, she resigned."},{"from":308.52,"to":312.09,"location":2,"content":"Okay. After that?"},{"from":312.09,"to":315.81,"location":2,"content":"She bought."},{"from":315.81,"to":317.54,"location":2,"content":"Okay. So there's another she."},{"from":317.54,"to":320.2,"location":2,"content":"Was there another reference before that?"},{"from":320.2,"to":325.28,"location":2,"content":"Herself, right? So herself is also a reference to Vanaja."},{"from":325.28,"to":327.77,"location":2,"content":"Um, okay. So then, it's again,"},{"from":327.77,"to":332.75,"location":2,"content":"she made this, she, okay."},{"from":332.75,"to":333.98,"location":2,"content":"So we've done Vanaja."},{"from":333.98,"to":336.14,"location":2,"content":"Okay, that's a good start."},{"from":336.14,"to":340.27,"location":2,"content":"Okay. So then, um, we've got Akhila."},{"from":340.27,"to":345.5,"location":2,"content":"Okay. Um, where's Akhila next referred to?"},{"from":345.5,"to":348.45,"location":2,"content":"As Akhila. Okay, there we go."},{"from":348.45,"to":358.28,"location":2,"content":"Um, are there other references, um, to Akhila?"},{"from":358.28,"to":367.82,"location":2,"content":"Maybe not. Okay. What's the next entity that's mentioned?"},{"from":367.82,"to":370.2,"location":2,"content":"Prajwal."},{"from":370.2,"to":379.06,"location":2,"content":"Okay. So what other references are there to Prajwal?"},{"from":379.06,"to":380.18,"location":2,"content":"They."},{"from":380.18,"to":384.33,"location":2,"content":"They? Okay. 
So here's a tricky one, right?"},{"from":384.33,"to":386.43,"location":2,"content":"So this they, I mean,"},{"from":386.43,"to":390.44,"location":2,"content":"who does that refer to?"},{"from":390.44,"to":395.79,"location":2,"content":"It occ- refers to Prajwal and Akash."},{"from":395.79,"to":400.5,"location":2,"content":"Yeah, so this they refers both to Prajwal and this Akash."},{"from":400.5,"to":404.18,"location":2,"content":"So that's, that's something that happens in human languages."},{"from":404.18,"to":406.79,"location":2,"content":"This is referred to as split antecedents,"},{"from":406.79,"to":409.22,"location":2,"content":"where you have one thing that they,"},{"from":409.22,"to":414.25,"location":2,"content":"that's sort of referring to two distributed things that came before it."},{"from":414.25,"to":421.08,"location":2,"content":"Um, so here's one of my first sad admissions of natural language processing technology."},{"from":421.08,"to":424.94,"location":2,"content":"None of the NLP systems that we're gonna talk about later"},{"from":424.94,"to":429.92,"location":2,"content":"today or in general that have been built deal with split antecedents."},{"from":429.92,"to":433.79,"location":2,"content":"They automatically lose as soon as there's split antecedents."},{"from":433.79,"to":435.44,"location":2,"content":"Um, so that's a bit sad,"},{"from":435.44,"to":437.3,"location":2,"content":"um, but that's the state of technology."},{"from":437.3,"to":438.71,"location":2,"content":"So it's something, um,"},{"from":438.71,"to":440.39,"location":2,"content":"we could still work to improve,"},{"from":440.39,"to":446.03,"location":2,"content":"but okay there's this sort of they that's kind of half Prajwal. Um, okay."},{"from":446.03,"to":449.19,"location":2,"content":"So there's directly Prajwal here,"},{"from":449.19,"to":460.19,"location":2,"content":"but was there another place early in the text that Prajwal is effectively mentioned?"},{"from":460.19,"to":466.56,"location":2,"content":"Yeah. So Akhila's son is really another mention of Prajwal, right?"},{"from":466.56,"to":472.59,"location":2,"content":"Okay. Um, okay."},{"from":472.59,"to":476.13,"location":2,"content":"Um, any other mentions of Prajwal? Maybe not."},{"from":476.13,"to":477.76,"location":2,"content":"Okay. Then we go on."},{"from":477.76,"to":480.31,"location":2,"content":"Okay. Who's the next entity?"},{"from":480.31,"to":484.31,"location":2,"content":"Akash. So we have Akash here,"},{"from":484.31,"to":485.7,"location":2,"content":"and that then again,"},{"from":485.7,"to":489.15,"location":2,"content":"we have that her son referring to Akash."},{"from":489.15,"to":492.44,"location":2,"content":"Um, and here was Akash."},{"from":492.44,"to":500.45,"location":2,"content":"Okay. What other, what other mentions of Akash are there?"},{"from":500.45,"to":509.01,"location":2,"content":"Okay so there's another Akash here, um, fourth him."},{"from":509.01,"to":515.34,"location":2,"content":"Okay. Uh, there's another Akash."},{"from":515.34,"to":522.55,"location":2,"content":"Okay, um, but, so, um, here."},{"from":522.55,"to":525.33,"location":2,"content":"Okay. 
So those are the obvious Akash's."},{"from":525.33,"to":527.76,"location":2,"content":"There's sort of a tricky case here which"},{"from":527.76,"to":530.28,"location":2,"content":"you could wonder what the right treatment of this is, right?"},{"from":530.28,"to":535.8,"location":2,"content":"You know, it sort of says Akash was to be a tree, all right."},{"from":535.8,"to":540.2,"location":2,"content":"So in some sense the tree is Akash."},{"from":540.2,"to":545.43,"location":2,"content":"Um, so really in terms of reference in this story,"},{"from":545.43,"to":549.99,"location":2,"content":"the reference of the tree is the same as Akash."},{"from":549.99,"to":552.79,"location":2,"content":"And you could think, um,"},{"from":552.79,"to":556.44,"location":2,"content":"that means you should treat the instances of,"},{"from":556.44,"to":558.64,"location":2,"content":"um, the tree, the,"},{"from":558.64,"to":560.92,"location":2,"content":"the instances here of the tree,"},{"from":560.92,"to":565.5,"location":2,"content":"and later on with the nicest tree, right, that really,"},{"from":565.5,"to":568.22,"location":2,"content":"that's sort of this Akash as well."},{"from":568.22,"to":570.25,"location":2,"content":"That doesn't quite feel right,"},{"from":570.25,"to":573.03,"location":2,"content":"but this is something that comes up in coreference, right?"},{"from":573.03,"to":578.14,"location":2,"content":"So here we have a sort of a predicative construction um,"},{"from":578.14,"to":579.77,"location":2,"content":"with, you know, 'be'."},{"from":579.77,"to":581.81,"location":2,"content":"And when you say,"},{"from":581.81,"to":586.76,"location":2,"content":"when you have sentences such as like, um, you know,"},{"from":586.76,"to":592.27,"location":2,"content":"my child is the smartest kid in the class or something like that, in some sense,"},{"from":592.27,"to":594.96,"location":2,"content":"you're sort of saying that the smartest kid in"},{"from":594.96,"to":598.56,"location":2,"content":"the class has the same reference as my child."},{"from":598.56,"to":604.63,"location":2,"content":"And some systems count links over that kind of predication,"},{"from":604.63,"to":607.22,"location":2,"content":"and say that is coreference whereas"},{"from":607.22,"to":610.89,"location":2,"content":"other ones don't and think that that's not quite reasonable."},{"from":610.89,"to":612.78,"location":2,"content":"So different things go on."},{"from":612.78,"to":614.99,"location":2,"content":"Okay. 
So, um, those,"},{"from":614.99,"to":617.92,"location":2,"content":"those are a fair number of entities."},{"from":617.92,"to":623.02,"location":2,"content":"I mean, so there are obviously lots of other things that are mentioned,"},{"from":623.02,"to":625.44,"location":2,"content":"um that sort of, um, right?"},{"from":625.44,"to":627.66,"location":2,"content":"So there's the local park, right,"},{"from":627.66,"to":630.45,"location":2,"content":"that's a mention of some entity."},{"from":630.45,"to":636.36,"location":2,"content":"Um, there's, um, the school, um right?"},{"from":636.36,"to":644.48,"location":2,"content":"So there's this school here and so that the school is coreferent with pre,"},{"from":644.48,"to":647.37,"location":2,"content":"the preschool right here, right?"},{"from":647.37,"to":649.38,"location":2,"content":"Um, and then there's,"},{"from":649.38,"to":652.15,"location":2,"content":"um, again this sort of tricky one,"},{"from":652.15,"to":656.09,"location":2,"content":"of how to treat the naughty child Lord Krishna because,"},{"from":656.09,"to":659.52,"location":2,"content":"you know, in some sense Prajwal is representing that."},{"from":659.52,"to":662.51,"location":2,"content":"And then there are lots of other entities that are mentioned, right?"},{"from":662.51,"to":665.2,"location":2,"content":"There's a t-shirt, and there's trousers,"},{"from":665.2,"to":668.36,"location":2,"content":"um, and, um, things like that."},{"from":668.36,"to":672.06,"location":2,"content":"Another tricky thing that turns up here when you get later on into"},{"from":672.06,"to":676.95,"location":2,"content":"the story is you can have entities that have parts."},{"from":676.95,"to":679.42,"location":2,"content":"So we not only have a tree,"},{"from":679.42,"to":681.84,"location":2,"content":"but that tree then has a lot of parts, right?"},{"from":681.84,"to":683.37,"location":2,"content":"So the tree has a trunk,"},{"from":683.37,"to":685.17,"location":2,"content":"and the tree has foliage,"},{"from":685.17,"to":688.27,"location":2,"content":"um, and things like that."},{"from":688.27,"to":691.17,"location":2,"content":"And there are these red balls that are representing fruits, right?"},{"from":691.17,"to":695.4,"location":2,"content":"So there's a lot of stuff that's somehow connected together and somehow separate."},{"from":695.4,"to":699.33,"location":2,"content":"And that sort of, that doesn't fit terribly well with the kind of models we"},{"from":699.33,"to":703.42,"location":2,"content":"use with coreference either because really we make our coreference,"},{"from":703.42,"to":708.01,"location":2,"content":"um, reference models basically out of this notion of entities."},{"from":708.01,"to":710.33,"location":2,"content":"Um, but somehow there's this complexity that,"},{"from":710.33,"to":712.29,"location":2,"content":"you know, human beings have parts too, right?"},{"from":712.29,"to":713.74,"location":2,"content":"We have hands and faces,"},{"from":713.74,"to":716.14,"location":2,"content":"and we can't say, oh, that's a separate entity,"},{"from":716.14,"to":719.89,"location":2,"content":"but they're somehow in, um, involved with the other entity."},{"from":719.89,"to":724.74,"location":2,"content":"Okay. 
Um, hope that's sort of useful to give some idea."},{"from":724.74,"to":727.66,"location":2,"content":"Why is coreference resolution useful?"},{"from":727.66,"to":731.89,"location":2,"content":"Um, so there are all kinds of things that we'd like to do well"},{"from":731.89,"to":736.38,"location":2,"content":"in natural language processing that you really can't do well unless,"},{"from":736.38,"to":739.85,"location":2,"content":"uh, you know how to do coreference resolution."},{"from":739.85,"to":745.41,"location":2,"content":"So anything that we want to do in terms of question-answering, summarization,"},{"from":745.41,"to":748.5,"location":2,"content":"extracting facts from texts or anything like that,"},{"from":748.5,"to":752.99,"location":2,"content":"there are places we are gonna fail unless we can do coreference resolution."},{"from":752.99,"to":755.02,"location":2,"content":"Because if we're reading a piece of text,"},{"from":755.02,"to":758.64,"location":2,"content":"and it says he was born in 1961, um,"},{"from":758.64,"to":761.61,"location":2,"content":"we can get a fact out or answer a question,"},{"from":761.61,"to":763.85,"location":2,"content":"if we can work out who he was,"},{"from":763.85,"to":769.04,"location":2,"content":"but we probably can't otherwise."},{"from":769.04,"to":773.58,"location":2,"content":"Um, there are, there's sort of another place that where"},{"from":773.58,"to":777.48,"location":2,"content":"this is very useful is in machine translation,"},{"from":777.48,"to":782.13,"location":2,"content":"so that lots of languages drop pronouns."},{"from":782.13,"to":784.93,"location":2,"content":"So you don't have to give explicit pronouns,"},{"from":784.93,"to":789.11,"location":2,"content":"but you need to be able to work out how to fill them in."},{"from":789.11,"to":792.76,"location":2,"content":"And this is making coreference decisions about,"},{"from":792.76,"to":795.08,"location":2,"content":"um, arguments of verbs."},{"from":795.08,"to":798.03,"location":2,"content":"And so here are a couple of examples,"},{"from":798.03,"to":802.54,"location":2,"content":"um, that, um, covering from Spanish to English."},{"from":802.54,"to":807.66,"location":2,"content":"So in Spanish, you can freely drop the subjects of verbs and in these sentences,"},{"from":807.66,"to":809.52,"location":2,"content":"in the because clause,"},{"from":809.52,"to":811.73,"location":2,"content":"there's no overt subject."},{"from":811.73,"to":815.63,"location":2,"content":"And so it gets Alicia likes Juan because he's smart."},{"from":815.63,"to":820.62,"location":2,"content":"And so Google Translate has stuck in a he and that is right."},{"from":820.62,"to":822.6,"location":2,"content":"And to stick in that he,"},{"from":822.6,"to":827.7,"location":2,"content":"it's implicitly making a coreference decision and saying, \"Okay well,"},{"from":827.7,"to":829.61,"location":2,"content":"the subject of this, um,"},{"from":829.61,"to":834.22,"location":2,"content":"adjective smart should be Juan who's male,"},{"from":834.22,"to":836.99,"location":2,"content":"and therefore, I should say he.\""},{"from":836.99,"to":840.06,"location":2,"content":"But, you know, the reality is Google Translate knows"},{"from":840.06,"to":843.62,"location":2,"content":"nothing about coreference and making these coreference decisions."},{"from":843.62,"to":845.28,"location":2,"content":"And as has been um,"},{"from":845.28,"to":850.37,"location":2,"content":"covered quite a bit in the media now and I think came up 
earlier in an earlier class,"},{"from":850.37,"to":855.24,"location":2,"content":"that, um, Google Translate mainly just defaults to male default."},{"from":855.24,"to":858.08,"location":2,"content":"Um, so if you sort of swap- sweep it, uh,"},{"from":858.08,"to":859.77,"location":2,"content":"if you flip it around and say,"},{"from":859.77,"to":863.37,"location":2,"content":"Juan likes Alicia, it also says because he's smart."},{"from":863.37,"to":868,"location":2,"content":"Uh, whereas probably it should be because she's smart in that case."},{"from":868,"to":871.75,"location":2,"content":"And indeed you notice the bad effects of that everywhere."},{"from":871.75,"to":876.23,"location":2,"content":"So many languages, um, Turkish, Indonesian, um,"},{"from":876.23,"to":878.07,"location":2,"content":"don't actually have gender,"},{"from":878.07,"to":881.91,"location":2,"content":"so that they're much less sexist languages than English,"},{"from":881.91,"to":883.35,"location":2,"content":"French or German are."},{"from":883.35,"to":885.5,"location":2,"content":"But what happens, um,"},{"from":885.5,"to":888.06,"location":2,"content":"when you then translate where you just have"},{"from":888.06,"to":892.27,"location":2,"content":"a generic pronoun that means third person pronoun, um,"},{"from":892.27,"to":896.46,"location":2,"content":"that Google Translate is essentially using its language model,"},{"from":896.46,"to":899.05,"location":2,"content":"which means that it reconstructs, um,"},{"from":899.05,"to":901.63,"location":2,"content":"the worst of stereotypes of she is a cook,"},{"from":901.63,"to":903.86,"location":2,"content":"and he is an engineer, he is a doctor."},{"from":903.86,"to":907.5,"location":2,"content":"And well, in a connected piece of discourse,"},{"from":907.5,"to":910.9,"location":2,"content":"if you'd like Google Translate to be able to do better than that,"},{"from":910.9,"to":914.16,"location":2,"content":"well again, what would be required is that you could actually do"},{"from":914.16,"to":919.75,"location":2,"content":"coreference resolution and track along the actors in the text as you go along."},{"from":919.75,"to":923.97,"location":2,"content":"Um, one final example we haven't really talked about yet,"},{"from":923.97,"to":927.48,"location":2,"content":"but we'll get back to soon now because the class is almost over"},{"from":927.48,"to":931.44,"location":2,"content":"is doing things with dialogue agents or chat systems."},{"from":931.44,"to":936.24,"location":2,"content":"That, as soon as you are going to do anything more than a single turn,"},{"from":936.24,"to":939.84,"location":2,"content":"um, dialog, that you need to start dealing with reference."},{"from":939.84,"to":941.65,"location":2,"content":"So if you've got something like, um,"},{"from":941.65,"to":944.12,"location":2,"content":"book tickets to see James Bond,"},{"from":944.12,"to":946.2,"location":2,"content":"um, then you want to say something like,"},{"from":946.2,"to":949.05,"location":2,"content":"\"Spectre is playing near you at 2:00 and 3:00 today."},{"from":949.05,"to":951.03,"location":2,"content":"How many tickets would you like?\""},{"from":951.03,"to":954.27,"location":2,"content":"Um, two tickets for the showing at three."},{"from":954.27,"to":956.26,"location":2,"content":"That as shown in the color,"},{"from":956.26,"to":962.34,"location":2,"content":"there are various kinds of reference going on here where things have related 
reference,"},{"from":962.34,"to":964.68,"location":2,"content":"but it's kind of complicated here."},{"from":964.68,"to":967.47,"location":2,"content":"And this is something that we'll come back to in a moment."},{"from":967.47,"to":973.03,"location":2,"content":"So James Bond and Spectre aren't obviously the same thing,"},{"from":973.03,"to":976.53,"location":2,"content":"but in a context like, um, booking movies,"},{"from":976.53,"to":983.46,"location":2,"content":"they are the same thing because one is the name of a character in a movie series,"},{"from":983.46,"to":988.53,"location":2,"content":"and the other is the name of a movie that's currently showing that belongs to that,"},{"from":988.53,"to":990.96,"location":2,"content":"so that they're sort of associated, um,"},{"from":990.96,"to":993.9,"location":2,"content":"in a sort of subtle way that isn't exact identity,"},{"from":993.9,"to":997.13,"location":2,"content":"but is relevant to a lot of the things that we want to do."},{"from":997.13,"to":999.48,"location":2,"content":"I'll come back to that in a little bit when we"},{"from":999.48,"to":1001.65,"location":2,"content":"talk a bit more about the linguistics of this."},{"from":1001.65,"to":1005.76,"location":2,"content":"Okay. So if we want to do the task of coreference resolution,"},{"from":1005.76,"to":1007.66,"location":2,"content":"there are essentially two steps."},{"from":1007.66,"to":1011.42,"location":2,"content":"So the first step is gee, we want to work out"},{"from":1011.42,"to":1015.84,"location":2,"content":"what mentions there are in the text that we should be doing something with."},{"from":1015.84,"to":1018.64,"location":2,"content":"And this one is effectively pretty easy,"},{"from":1018.64,"to":1021.43,"location":2,"content":"but I'll have just a few slides on that immediately."},{"from":1021.43,"to":1024.84,"location":2,"content":"And then what the bulk of the class is gonna be on is,"},{"from":1024.84,"to":1028.81,"location":2,"content":"um, working out coreference between mentions."},{"from":1028.81,"to":1030.4,"location":2,"content":"And if you think about this,"},{"from":1030.4,"to":1033.36,"location":2,"content":"coreference is essentially a clustering task."},{"from":1033.36,"to":1035.28,"location":2,"content":"Because if you do the first task,"},{"from":1035.28,"to":1038.77,"location":2,"content":"you have a set of mentions and then you want to be saying well,"},{"from":1038.77,"to":1042.71,"location":2,"content":"how can I group these into clusters that have the same reference?"},{"from":1042.71,"to":1045.75,"location":2,"content":"And so that's what we're going to look more at doing."},{"from":1045.75,"to":1048.08,"location":2,"content":"So quickly on mention detection."},{"from":1048.08,"to":1049.73,"location":2,"content":"So, um, for mention,"},{"from":1049.73,"to":1054.09,"location":2,"content":"we wanna find all the spans that are candidates for,"},{"from":1054.09,"to":1056.11,"location":2,"content":"um, referring to some entity."},{"from":1056.11,"to":1058.72,"location":2,"content":"And the answer to what these, um,"},{"from":1058.72,"to":1063.62,"location":2,"content":"candidates are is basically they're all the noun phrases in the text."},{"from":1063.62,"to":1068.13,"location":2,"content":"And so normally people think of there being three types of mentions that we identify."},{"from":1068.13,"to":1069.73,"location":2,"content":"There are pronouns, I,"},{"from":1069.73,"to":1070.99,"location":2,"content":"you, he, she, 
it,"},{"from":1070.99,"to":1072.79,"location":2,"content":"etc., that are, um,"},{"from":1072.79,"to":1074.61,"location":2,"content":"referring to different entities."},{"from":1074.61,"to":1077.17,"location":2,"content":"They're explicit names of people like that was"},{"from":1077.17,"to":1079.96,"location":2,"content":"that Barack Obama and Hillary Clinton examples."},{"from":1079.96,"to":1082.21,"location":2,"content":"And then many of the tricky examples,"},{"from":1082.21,"to":1084.68,"location":2,"content":"and then when we have common noun phrases"},{"from":1084.68,"to":1087.67,"location":2,"content":"like a dog or the big fluffy cat stuck in the tree."},{"from":1087.67,"to":1091.69,"location":2,"content":"That the big fluffy cat stuck in the tree is a mention."},{"from":1091.69,"to":1094.4,"location":2,"content":"Um, it's actually a complex mention because it"},{"from":1094.4,"to":1097.97,"location":2,"content":"also has embedded inside it other mentions."},{"from":1097.97,"to":1102.5,"location":2,"content":"Um, so the tree is also a mention."},{"from":1102.5,"to":1106.56,"location":2,"content":"Okay. So how can we detect mentions?"},{"from":1106.56,"to":1108.18,"location":2,"content":"Well, one answer is to say,"},{"from":1108.18,"to":1110.06,"location":2,"content":"well we've looked at, um,"},{"from":1110.06,"to":1112.74,"location":2,"content":"various other NLP systems on and off."},{"from":1112.74,"to":1119.25,"location":2,"content":"And we can just use those NLP systems as preprocessing systems to find mentions."},{"from":1119.25,"to":1123.9,"location":2,"content":"So for pronouns, they're part of speech taggers that say what's a noun,"},{"from":1123.9,"to":1125.47,"location":2,"content":"or a verb, or a pronoun,"},{"from":1125.47,"to":1128.91,"location":2,"content":"and so we can run those and find all the pronouns and we're done."},{"from":1128.91,"to":1132.18,"location":2,"content":"From- for, um, the names of things like Barack Obama."},{"from":1132.18,"to":1134.97,"location":2,"content":"We've talked a couple of times about named entity recognizers,"},{"from":1134.97,"to":1137.73,"location":2,"content":"so we can run those and find all the named entities."},{"from":1137.73,"to":1140.47,"location":2,"content":"Um, then for common noun phrases,"},{"from":1140.47,"to":1143.4,"location":2,"content":"that's sort of where we need parsers to find"},{"from":1143.4,"to":1146.92,"location":2,"content":"the structure of the sentence and find where the noun phrases are."},{"from":1146.92,"to":1149.74,"location":2,"content":"And we have talked about dependency parsers and well,"},{"from":1149.74,"to":1154.38,"location":2,"content":"one choice is you can use a dependency parser to find the sort of nominal arguments,"},{"from":1154.38,"to":1155.67,"location":2,"content":"and work with them."},{"from":1155.67,"to":1159.18,"location":2,"content":"That's sort of actually a little bit subtler than just sort of wanting to pick"},{"from":1159.18,"to":1162.53,"location":2,"content":"out spans that refer to common noun phrases."},{"from":1162.53,"to":1165.86,"location":2,"content":"So the other notion of parsing which we come back to,"},{"from":1165.86,"to":1168.15,"location":2,"content":"um, next week is constituency parsing."},{"from":1168.15,"to":1170.28,"location":2,"content":"In some sense, constituency parsers are"},{"from":1170.28,"to":1174.46,"location":2,"content":"the simplest way to find mentions for this process."},{"from":1174.46,"to":1179.31,"location":2,"content":"Um, most of it 
seems and is easy,"},{"from":1179.31,"to":1184.2,"location":2,"content":"um, there are sort of tricky cases as to what counts as a mention or not."},{"from":1184.2,"to":1187.79,"location":2,"content":"So, um, if it's kind of it is sunny,"},{"from":1187.79,"to":1189.97,"location":2,"content":"I mean, is it a mention of something?"},{"from":1189.97,"to":1191.88,"location":2,"content":"It sort of seems like it's not really,"},{"from":1191.88,"to":1195.87,"location":2,"content":"it's just it seems like it's an it that you stick at the start of the sentence,"},{"from":1195.87,"to":1197.34,"location":2,"content":"um, that doesn't mean anything."},{"from":1197.34,"to":1199.21,"location":2,"content":"So that's maybe not a mention."},{"from":1199.21,"to":1200.73,"location":2,"content":"Um, every student."},{"from":1200.73,"to":1202.69,"location":2,"content":"Is every student a mention?"},{"from":1202.69,"to":1207.81,"location":2,"content":"I mean, it's certainly, at best it's some kind of collective,"},{"from":1207.81,"to":1211.96,"location":2,"content":"um, but it's not sort of a very clear concrete reference, um."},{"from":1211.96,"to":1215.52,"location":2,"content":"That goes further, if I sort of use different quantifiers,"},{"from":1215.52,"to":1217.92,"location":2,"content":"so if it was like, every and no are called quantifiers."},{"from":1217.92,"to":1221.07,"location":2,"content":"I mean no student definitely doesn't have reference,"},{"from":1221.07,"to":1223.56,"location":2,"content":"because it's not pointing at anything, right?"},{"from":1223.56,"to":1225.99,"location":2,"content":"It's asserting a claim of nonexistence."},{"from":1225.99,"to":1228.02,"location":2,"content":"So that there's definitely, um,"},{"from":1228.02,"to":1231.09,"location":2,"content":"no- it isn't a mention of anything."},{"from":1231.09,"to":1234.42,"location":2,"content":"Um, yeah, the best donut in the world."},{"from":1234.42,"to":1237.67,"location":2,"content":"Um, does that have reference?"},{"from":1237.67,"to":1240.13,"location":2,"content":"Um, that's unclear."},{"from":1240.13,"to":1244.46,"location":2,"content":"This is the kind of thing that actual philosophers of language debate over, right?"},{"from":1244.46,"to":1248.32,"location":2,"content":"So if there was agreement on what the best donut in the world is,"},{"from":1248.32,"to":1250.54,"location":2,"content":"then maybe it has reference, um,"},{"from":1250.54,"to":1252.82,"location":2,"content":"but I can say sentences like,"},{"from":1252.82,"to":1256.13,"location":2,"content":"I'm searching everywhere to find the best donut in the world."},{"from":1256.13,"to":1257.96,"location":2,"content":"And then in that sentence,"},{"from":1257.96,"to":1259.34,"location":2,"content":"it doesn't have any reference, right?"},{"from":1259.34,"to":1263.97,"location":2,"content":"It's sort of an intensional description of what I'm hoping to find,"},{"from":1263.97,"to":1266.99,"location":2,"content":"that there's no concrete thing it refers to."},{"from":1266.99,"to":1271.61,"location":2,"content":"Um, things like quantities, 100 miles."},{"from":1271.61,"to":1274.17,"location":2,"content":"That sort of behaves like a noun phrase,"},{"from":1274.17,"to":1277.99,"location":2,"content":"but it is in- it's sort of really a quantity that doesn't really have reference."},{"from":1277.99,"to":1283.2,"location":2,"content":"Um, and so then there's the question of how can you deal with this stuff?"},{"from":1283.2,"to":1287.36,"location":2,"content":"Um, well, 
um, our tool whenever we want to deal with stuff,"},{"from":1287.36,"to":1290.27,"location":2,"content":"is we train classifiers,"},{"from":1290.27,"to":1294.15,"location":2,"content":"as in they pick out things that are mentions and things that aren't."},{"from":1294.15,"to":1298.8,"location":2,"content":"And so that's something that you could do is write a classifier that filters out,"},{"from":1298.8,"to":1302.94,"location":2,"content":"um, these spurious things that you want to say aren't really mentions."},{"from":1302.94,"to":1304.98,"location":2,"content":"And people absolutely have done that."},{"from":1304.98,"to":1307.65,"location":2,"content":"But commonly actually people skip that step,"},{"from":1307.65,"to":1314.3,"location":2,"content":"and you just sort of instead have your mention detector find all candidate mentions."},{"from":1314.3,"to":1317.61,"location":2,"content":"Because it turns out that that tends to work pretty well."},{"from":1317.61,"to":1321.51,"location":2,"content":"Because after we found all of our mentions, um,"},{"from":1321.51,"to":1325.82,"location":2,"content":"we're then going to be doing this clustering process to find coreferent mentions."},{"from":1325.82,"to":1328.44,"location":2,"content":"And if there are just a few stray mentions like"},{"from":1328.44,"to":1332.65,"location":2,"content":"no student and we don't cluster them wrongly with anything else,"},{"from":1332.65,"to":1338.49,"location":2,"content":"it kind of doesn't do any harm because we are mainly involved in this clustering process."},{"from":1338.49,"to":1343.44,"location":2,"content":"Okay. Um, something you might be wondering is,"},{"from":1343.44,"to":1345.09,"location":2,"content":"well I've sort of implied now,"},{"from":1345.09,"to":1346.61,"location":2,"content":"we have a pipeline."},{"from":1346.61,"to":1349.77,"location":2,"content":"I'm saying we're going to run a part of speech tagger,"},{"from":1349.77,"to":1351.81,"location":2,"content":"and we're going to run a named entity recognizer,"},{"from":1351.81,"to":1353.27,"location":2,"content":"and we're going to run a parser."},{"from":1353.27,"to":1355.45,"location":2,"content":"And we're going to run a, um,"},{"from":1355.45,"to":1358.18,"location":2,"content":"a named mention detector."},{"from":1358.18,"to":1361.53,"location":2,"content":"And then eventually, we're going to run this coref clustering system,"},{"from":1361.53,"to":1364.29,"location":2,"content":"so we have a sort of a five-step pipeline."},{"from":1364.29,"to":1370.95,"location":2,"content":"Um, is that the only way you can do, um, coreference resolution?"},{"from":1370.95,"to":1373.02,"location":2,"content":"And the traditional answer was yup,"},{"from":1373.02,"to":1375.03,"location":2,"content":"that's the way you did coreference resolution."},{"from":1375.03,"to":1379.28,"location":2,"content":"That essentially, all systems for coreference resolution,"},{"from":1379.28,"to":1385.8,"location":2,"content":"until approximately 2016 were a pipeline that went through about those stages."},{"from":1385.8,"to":1389.73,"location":2,"content":"Um, but just recently and I will dico- cover one such system,"},{"from":1389.73,"to":1392.18,"location":2,"content":"um, later in the class, um,"},{"from":1392.18,"to":1395.57,"location":2,"content":"that people in the neural world have started doing what's been"},{"from":1395.57,"to":1399.56,"location":2,"content":"effective in a lot of places in the neural network world of 
saying,"},{"from":1399.56,"to":1402.82,"location":2,"content":"can we just build an end-to-end coreference system"},{"from":1402.82,"to":1406.38,"location":2,"content":"that starts with just plain text of a paragraph,"},{"from":1406.38,"to":1413.68,"location":2,"content":"and feeds out coreference clusters without there being any intervening pipeline steps?"},{"from":1413.68,"to":1416.63,"location":2,"content":"And I'll show you a bit more about how that works."},{"from":1416.63,"to":1420.09,"location":2,"content":"Um, but before we get into systems,"},{"from":1420.09,"to":1425.22,"location":2,"content":"I just wanted to say a little bit more about the linguistics of coreference."},{"from":1425.22,"to":1429.57,"location":2,"content":"Um, there's actually quite a lot of interesting stuff here,"},{"from":1429.57,"to":1432.36,"location":2,"content":"and to a fair degree,"},{"from":1432.36,"to":1435.81,"location":2,"content":"it's not actually stuff that's been thought about"},{"from":1435.81,"to":1439.07,"location":2,"content":"very much by people who build NLP systems, right?"},{"from":1439.07,"to":1441.15,"location":2,"content":"I already mentioned, um,"},{"from":1441.15,"to":1443.51,"location":2,"content":"from the Shruthi Rao story, um,"},{"from":1443.51,"to":1446.25,"location":2,"content":"the example of split antecedents, right?"},{"from":1446.25,"to":1449.97,"location":2,"content":"That that's just a clear linguistic phenomenon that happens,"},{"from":1449.97,"to":1452.16,"location":2,"content":"and it's not even incredibly rare, right?"},{"from":1452.16,"to":1454.1,"location":2,"content":"Um, that, you know, um,"},{"from":1454.1,"to":1459.59,"location":2,"content":"people build these simple machine learning models that just can't deal with that."},{"from":1459.59,"to":1462.63,"location":2,"content":"And there's really quite a bit more structure"},{"from":1462.63,"to":1465.75,"location":2,"content":"to what happens in the linguistics of coreference,"},{"from":1465.75,"to":1470.28,"location":2,"content":"it isn't really being exploited in most of the systems people bui- build."},{"from":1470.28,"to":1473.14,"location":2,"content":"So I just wanted to show people a bit more of that."},{"from":1473.14,"to":1478.2,"location":2,"content":"And essentially, to sort of understanding, um,"},{"from":1478.2,"to":1481.1,"location":2,"content":"more about how people see things linguistically,"},{"from":1481.1,"to":1486.94,"location":2,"content":"there are two concepts that are related and commonly confused,"},{"from":1486.94,"to":1488.37,"location":2,"content":"that are really different."},{"from":1488.37,"to":1490.63,"location":2,"content":"So one is coreference."},{"from":1490.63,"to":1494.7,"location":2,"content":"So we say that things are coreferent when there are"},{"from":1494.7,"to":1499.23,"location":2,"content":"two mentions and they refer to the same entity in the world."},{"from":1499.23,"to":1500.97,"location":2,"content":"So if it's sort of,"},{"from":1500.97,"to":1505.14,"location":2,"content":"um, Donald Trump and the current president, right?"},{"from":1505.14,"to":1509.2,"location":2,"content":"They're two mentions and they refer to the same person in the world."},{"from":1509.2,"to":1512.19,"location":2,"content":"And so that is a relationship of coreference."},{"from":1512.19,"to":1516.35,"location":2,"content":"Um, and that's then contrasted, um, with anaphora."},{"from":1516.35,"to":1524.6,"location":2,"content":"And so the idea of anaphora is some terms in text don't have 
independent reference,"},{"from":1524.6,"to":1530.84,"location":2,"content":"and you work out their reference by relating them back to another thing in the text."},{"from":1530.84,"to":1532.8,"location":2,"content":"So if we have the sentence,"},{"from":1532.8,"to":1535.5,"location":2,"content":"Barack Obama said he would sign the bill."},{"from":1535.5,"to":1537.22,"location":2,"content":"He is an anaphor."},{"from":1537.22,"to":1539.43,"location":2,"content":"And if I just say, he,"},{"from":1539.43,"to":1541.9,"location":2,"content":"what does he refer to in the abstract?"},{"from":1541.9,"to":1545.78,"location":2,"content":"Well, you know, apart from saying something male, right?"},{"from":1545.78,"to":1547.05,"location":2,"content":"You've got no idea, right?"},{"from":1547.05,"to":1550.11,"location":2,"content":"Because you can't work out what he means just by knowing he."},{"from":1550.11,"to":1554.37,"location":2,"content":"You have to be looking at a text and interpreting it relative to the text."},{"from":1554.37,"to":1556.52,"location":2,"content":"And then if you're interpreting it,"},{"from":1556.52,"to":1558.8,"location":2,"content":"um, relative to the text,"},{"from":1558.8,"to":1560.37,"location":2,"content":"you're then in this situation of,"},{"from":1560.37,"to":1563.88,"location":2,"content":"okay I see, this refers back to Barack Obama."},{"from":1563.88,"to":1567.6,"location":2,"content":"So he is another mention of Barack Obama,"},{"from":1567.6,"to":1570.91,"location":2,"content":"then- and this then is this concept of anaphora."},{"from":1570.91,"to":1573.62,"location":2,"content":"So the picture we have is sort of like this,"},{"from":1573.62,"to":1577.63,"location":2,"content":"that you can either have these independent mentions,"},{"from":1577.63,"to":1579.69,"location":2,"content":"which do refer, um,"},{"from":1579.69,"to":1581.31,"location":2,"content":"to the same thing in the world."},{"from":1581.31,"to":1582.77,"location":2,"content":"They're coreferent."},{"from":1582.77,"to":1584.58,"location":2,"content":"But in many cases,"},{"from":1584.58,"to":1587.97,"location":2,"content":"such as when they're full mentions like President Obama,"},{"from":1587.97,"to":1591.99,"location":2,"content":"versus Barack Obama, they don't have any textual relationship."},{"from":1591.99,"to":1595.45,"location":2,"content":"It's just they happen to refer to the same thing in the world."},{"from":1595.45,"to":1600.83,"location":2,"content":"And that then contrast with cases like Barack Obama said he would do something,"},{"from":1600.83,"to":1605.27,"location":2,"content":"where the he has a textual relationship back to Barack Obama."},{"from":1605.27,"to":1607.53,"location":2,"content":"And that's an example of anaphora."},{"from":1607.53,"to":1614.98,"location":2,"content":"Um, this might up until now feel like an almost meaningless distinction."},{"from":1614.98,"to":1619.42,"location":2,"content":"But something that maybe gives you more of a sense that there's something useful here is,"},{"from":1619.42,"to":1624.42,"location":2,"content":"um, these textual relationships exist even when there isn't coreference."},{"from":1624.42,"to":1626.79,"location":2,"content":"So we sort of mentioned before,"},{"from":1626.79,"to":1629.43,"location":2,"content":"these cases like no dancer, right?"},{"from":1629.43,"to":1632.61,"location":2,"content":"So no dancer doesn't have reference, right?"},{"from":1632.61,"to":1634.31,"location":2,"content":"It refers to 
nothing."},{"from":1634.31,"to":1636.81,"location":2,"content":"Um, but if you have a sentence like,"},{"from":1636.81,"to":1642.62,"location":2,"content":"\"no dancer twisted her knee,\" well we have an anaphor here."},{"from":1642.62,"to":1646.26,"location":2,"content":"And that anaphor is referring back to \"no"},{"from":1646.26,"to":1650.52,"location":2,"content":"dancer\" despite the fact that \"no dancer\" doesn't have reference."},{"from":1650.52,"to":1654.15,"location":2,"content":"So we can still have the anaphoric textual relationship."},{"from":1654.15,"to":1656.07,"location":2,"content":"And indeed, you know,"},{"from":1656.07,"to":1659.28,"location":2,"content":"her knee is then a part of her."},{"from":1659.28,"to":1662.04,"location":2,"content":"And so these are the sort of part relationships again."},{"from":1662.04,"to":1665.37,"location":2,"content":"But her knee, in a sense that I'll just come back to,"},{"from":1665.37,"to":1671.33,"location":2,"content":"is also an anaphor which is interpreted with respect, um, to the dancer."},{"from":1671.33,"to":1674.37,"location":2,"content":"So we have two anaphoric relationships here,"},{"from":1674.37,"to":1677.25,"location":2,"content":"even though we have no reference."},{"from":1677.25,"to":1680.87,"location":2,"content":"There's another interesting case of"},{"from":1680.87,"to":1684.8,"location":2,"content":"anaphoric relationships which aren't the same as reference,"},{"from":1684.8,"to":1688.38,"location":2,"content":"which is you could have looser forms of anaphoric relationships."},{"from":1688.38,"to":1690.81,"location":2,"content":"So you get lots of sentences like this."},{"from":1690.81,"to":1693.13,"location":2,"content":"\"We went to see a concert last night,"},{"from":1693.13,"to":1695.25,"location":2,"content":"the tickets were really expensive.\""},{"from":1695.25,"to":1698.92,"location":2,"content":"So we have this mentioned here of the tickets."},{"from":1698.92,"to":1702.46,"location":2,"content":"Um, but really to interpret the tickets,"},{"from":1702.46,"to":1706.99,"location":2,"content":"we have to interpret them with respect to this,"},{"from":1706.99,"to":1708.73,"location":2,"content":"um, mention back here,"},{"from":1708.73,"to":1711.46,"location":2,"content":"a concept, because really what this is saying,"},{"from":1711.46,"to":1715.1,"location":2,"content":"the tickets for the concert were really expensive."},{"from":1715.1,"to":1719.2,"location":2,"content":"So this is also referred to as an anaphoric relationship,"},{"from":1719.2,"to":1722.19,"location":2,"content":"where the meaning of the tickets has to be interpreted"},{"from":1722.19,"to":1726.6,"location":2,"content":"textually based on another, um, noun phrase."},{"from":1726.6,"to":1729.69,"location":2,"content":"But it's not a coreference relationship that"},{"from":1729.69,"to":1733.34,"location":2,"content":"the concert and the tickets are clearly two different entities."},{"from":1733.34,"to":1736.98,"location":2,"content":"So these kinda looser cases are referred to as bridging anaphora,"},{"from":1736.98,"to":1740.81,"location":2,"content":"because you sort of have to supply for yourself the bridge,"},{"from":1740.81,"to":1747.11,"location":2,"content":"the relation that connects together the antecedent and the anaphor."},{"from":1747.11,"to":1750.78,"location":2,"content":"Okay. 
So that's how- we then have these pictures,"},{"from":1750.78,"to":1754.89,"location":2,"content":"that we have this sort of not in- not complete crossovers"},{"from":1754.89,"to":1759.51,"location":2,"content":"between coreference and anaphora that we've sort of talked about."},{"from":1759.51,"to":1764.2,"location":2,"content":"Um, I have one other note on anaphora. Um,"},{"from":1764.2,"to":1768.83,"location":2,"content":"Who- has anyone here ever done any Ancient Greek?"},{"from":1768.83,"to":1774.02,"location":2,"content":"Any Ancient Greek? [LAUGHTER] Yes."},{"from":1774.02,"to":1776.04,"location":2,"content":"Okay. Um, so, um,"},{"from":1776.04,"to":1780.74,"location":2,"content":"from- from the origins of the word anaphora,"},{"from":1780.74,"to":1787.13,"location":2,"content":"anaphora is meant to be that you're finding your textual reference before you."},{"from":1787.13,"to":1793.3,"location":2,"content":"Um, and so there's actually a- a complementary, um,"},{"from":1793.3,"to":1796.94,"location":2,"content":"term of art which is referred to as"},{"from":1796.94,"to":1802.14,"location":2,"content":"cataphora where you're finding your reference after you."},{"from":1802.14,"to":1805.7,"location":2,"content":"Um, so here is a beautiful example of cataphora."},{"from":1805.7,"to":1807.38,"location":2,"content":"So this is from Oscar Wilde's,"},{"from":1807.38,"to":1809.33,"location":2,"content":"The Picture of Dorian Gray."},{"from":1809.33,"to":1814.34,"location":2,"content":"\"From the corner of the divan of Persian saddle-bags on which he was lying,"},{"from":1814.34,"to":1816.62,"location":2,"content":"smoking, as was his custom,"},{"from":1816.62,"to":1820.69,"location":2,"content":"innumerable cigarettes, Lord Henry Wotton could just catch"},{"from":1820.69,"to":1825.47,"location":2,"content":"the gleam of the honey-sweet and honey-colored blossoms of a laburnum.\""},{"from":1825.47,"to":1828.87,"location":2,"content":"Um, right. So here we have this, um, mention,"},{"from":1828.87,"to":1833.06,"location":2,"content":"Lord Henry Wotton and there are two anaphors,"},{"from":1833.06,"to":1835.94,"location":2,"content":"um, that refer to Lord Henry Wotton."},{"from":1835.94,"to":1839.3,"location":2,"content":"Um, he and his,"},{"from":1839.3,"to":1841.73,"location":2,"content":"and that they both come before,"},{"from":1841.73,"to":1843.99,"location":2,"content":"um, Lord Henry Wotton."},{"from":1843.99,"to":1847.01,"location":2,"content":"And so these are referred to, um,"},{"from":1847.01,"to":1853.67,"location":2,"content":"as instances of cataphora among a certain kind of classical scholar."},{"from":1853.67,"to":1856.46,"location":2,"content":"Um, and in case you don't know what a laburnum is,"},{"from":1856.46,"to":1858.68,"location":2,"content":"um, this is a laburnum."},{"from":1858.68,"to":1861.89,"location":2,"content":"[LAUGHTER] Right. 
But, yeah,"},{"from":1861.89,"to":1863.37,"location":2,"content":"so thi- this is cataphora."},{"from":1863.37,"to":1865.94,"location":2,"content":"Now- now there are two sad things to say."},{"from":1865.94,"to":1869.63,"location":2,"content":"Um, the first sad thing is in modern linguistics,"},{"from":1869.63,"to":1872.38,"location":2,"content":"the term cataphora is completely disused."},{"from":1872.38,"to":1878.18,"location":2,"content":"And we mean- we just use the word, um, anaphor everywhere as meaning"},{"from":1878.18,"to":1881.3,"location":2,"content":"a word that gets referenced from some other mention in"},{"from":1881.3,"to":1884.53,"location":2,"content":"the text and it doesn't matter what side it's on."},{"from":1884.53,"to":1888.56,"location":2,"content":"Um, so, um, that we go downhill one stage to"},{"from":1888.56,"to":1894.06,"location":2,"content":"linguistics but then we get to NLP and we go downhill a second stage."},{"from":1894.06,"to":1898.16,"location":2,"content":"Because what you'll see is that in general,"},{"from":1898.16,"to":1900.48,"location":2,"content":"the systems that people are building for,"},{"from":1900.48,"to":1907.36,"location":2,"content":"um, reference resolution, they don't make any distinction of direction at all."},{"from":1907.36,"to":1909.15,"location":2,"content":"That once you find a mention,"},{"from":1909.15,"to":1912.23,"location":2,"content":"you're always looking backwards for its reference."},{"from":1912.23,"to":1914.88,"location":2,"content":"Um, and you've got no idea that,"},{"from":1914.88,"to":1917.63,"location":2,"content":"well, maybe sometimes you could look forwards."},{"from":1917.63,"to":1919.28,"location":2,"content":"So effectively, what it means,"},{"from":1919.28,"to":1921.37,"location":2,"content":"that the systems end up doing is saying,"},{"from":1921.37,"to":1922.93,"location":2,"content":"well, there's a he here,"},{"from":1922.93,"to":1925.95,"location":2,"content":"there are various other things, there's a his, etc.,"},{"from":1925.95,"to":1929.65,"location":2,"content":"and you'll eventually get to Lord Henry Wotton and you'll be able to"},{"from":1929.65,"to":1933.99,"location":2,"content":"be trying to find its reference by looking backwards,"},{"from":1933.99,"to":1937.76,"location":2,"content":"even though that's sort of ill-formed from any kind of linguistic sense"},{"from":1937.76,"to":1942.31,"location":2,"content":"whereas really he and his should have been looking for their reference forward."},{"from":1942.31,"to":1949.84,"location":2,"content":"Okay. Um, is everyone good up to there, any questions?"},{"from":1949.84,"to":1954.11,"location":2,"content":"Okay. 
We'll move ahead and, um,"},{"from":1954.11,"to":1958.61,"location":2,"content":"try and move on to kinds of coreference, um, models."},{"from":1958.61,"to":1961.71,"location":2,"content":"So I wanted to, um, tell you, um,"},{"from":1961.71,"to":1965.3,"location":2,"content":"as much as I can and I have 45 minutes, um,"},{"from":1965.3,"to":1969.06,"location":2,"content":"left about, so the kinda models people build with coreference."},{"from":1969.06,"to":1973.67,"location":2,"content":"And I hope to mention quickly four different ways that people have looked at coreference."},{"from":1973.67,"to":1977.8,"location":2,"content":"I wanna tell you a teeny bit about classical rule-based coreference."},{"from":1977.8,"to":1982.01,"location":2,"content":"Um, then, um, mention- mention pair coreference."},{"from":1982.01,"to":1984.93,"location":2,"content":"Spend the most time on mention ranking systems which have"},{"from":1984.93,"to":1987.99,"location":2,"content":"tended to be the easiest simple systems."},{"from":1987.99,"to":1989.58,"location":2,"content":"And then just say a little bit about"},{"from":1989.58,"to":1992.78,"location":2,"content":"clustering systems which should be the right way to do"},{"from":1992.78,"to":1998.15,"location":2,"content":"it but in practice has been a way that's been hard to get the best performance from."},{"from":1998.15,"to":2000.9,"location":2,"content":"Okay. So here's a bit of history."},{"from":2000.9,"to":2002.98,"location":2,"content":"Um, this guy here is Jerry Hobbs."},{"from":2002.98,"to":2008.32,"location":2,"content":"He just had his retirement party from University of Southern California last month."},{"from":2008.32,"to":2009.67,"location":2,"content":"Um, so Jerry Hobbs,"},{"from":2009.67,"to":2011.82,"location":2,"content":"way back when, um,"},{"from":2011.82,"to":2013.66,"location":2,"content":"wrote a famous paper,"},{"from":2013.66,"to":2017.9,"location":2,"content":"it was in 1976 on coreference resolution."},{"from":2017.9,"to":2021.52,"location":2,"content":"And in that paper, um, he proposed,"},{"from":2021.52,"to":2025.6,"location":2,"content":"um, what's normally now referred to as the Hobbs Algorithm."},{"from":2025.6,"to":2027.89,"location":2,"content":"But actually, um, in his paper,"},{"from":2027.89,"to":2031.18,"location":2,"content":"he refers to it as a naive algorithm."},{"from":2031.18,"to":2034.68,"location":2,"content":"Um, and I'll come back to that distinction in just a moment."},{"from":2034.68,"to":2037.63,"location":2,"content":"Um, but what the Hobbs algorithm was,"},{"from":2037.63,"to":2041.98,"location":2,"content":"is if you have a sentence- so actually I should say this,"},{"from":2041.98,"to":2045.43,"location":2,"content":"this algorithm is just for finding the reference of pronouns."},{"from":2045.43,"to":2048.5,"location":2,"content":"So one can extend out to other cases but the part I'm gonna show"},{"from":2048.5,"to":2051.76,"location":2,"content":"you is just the part for doing the reference of pronouns."},{"from":2051.76,"to":2052.93,"location":2,"content":"So when you find out,"},{"from":2052.93,"to":2057.93,"location":2,"content":"find a pronoun and you wanna say what is it, um, coreferent with?"},{"from":2057.93,"to":2062.7,"location":2,"content":"What you're going to do is run this mechanical algorithm"},{"from":2062.7,"to":2067.95,"location":2,"content":"that's looking at a parse of a sentence and is working out what to do with it."},{"from":2067.95,"to":2070.82,"location":2,"content":"Begin 
at the NP immediately dominating the pronoun,"},{"from":2070.82,"to":2075.32,"location":2,"content":"go up the tree to the first NP or S. Call this X and the path p,"},{"from":2075.32,"to":2077.76,"location":2,"content":"traverse along, ah, it goes on and on."},{"from":2077.76,"to":2079.01,"location":2,"content":"Um, there's more of it."},{"from":2079.01,"to":2080.11,"location":2,"content":"That was only the beginning of it."},{"from":2080.11,"to":2081.47,"location":2,"content":"There are a lot more stages."},{"from":2081.47,"to":2083.23,"location":2,"content":"Um, but, you know,"},{"from":2083.23,"to":2086.89,"location":2,"content":"I'm not- I don't really wanna go into the details of this."},{"from":2086.89,"to":2090.07,"location":2,"content":"Um, but, you know, to try and explain the flavor of it,"},{"from":2090.07,"to":2091.66,"location":2,"content":"here's a piece of text."},{"from":2091.66,"to":2093.98,"location":2,"content":"\"Niall Ferguson is prolific,"},{"from":2093.98,"to":2096.22,"location":2,"content":"well-paid, and a snappy dresser."},{"from":2096.22,"to":2098.39,"location":2,"content":"Stephen Moss hated him.\""},{"from":2098.39,"to":2102.85,"location":2,"content":"Um, and so if you can remember any of the steps of that algorithm,"},{"from":2102.85,"to":2106.28,"location":2,"content":"here's our, um, pronoun him."},{"from":2106.28,"to":2111.53,"location":2,"content":"Um, and then, what it said to do was begin at the NP,"},{"from":2111.53,"to":2114.21,"location":2,"content":"the noun phrase above the pronoun."},{"from":2114.21,"to":2119.01,"location":2,"content":"And then it said, to go up to the first noun phrase or S above that,"},{"from":2119.01,"to":2121.37,"location":2,"content":"um, here is the S above that."},{"from":2121.37,"to":2124.76,"location":2,"content":"Um, and then what you're meant to do is, from there,"},{"from":2124.76,"to":2130.18,"location":2,"content":"you're meant to go left to right through stuff that came before that."},{"from":2130.18,"to":2132.98,"location":2,"content":"So there's a lot of cleverness in this handwritten algorithm."},{"from":2132.98,"to":2136.44,"location":2,"content":"You know, this is in the space of clever handwritten algorithms."},{"from":2136.44,"to":2140.14,"location":2,"content":"And so what this is reflecting is that you might just think you"},{"from":2140.14,"to":2144.05,"location":2,"content":"should go to the closest thing to find reference,"},{"from":2144.05,"to":2148.74,"location":2,"content":"but actually if you have reference within the same sentence,"},{"from":2148.74,"to":2151.73,"location":2,"content":"it's much more common for the sort of"},{"from":2151.73,"to":2156.09,"location":2,"content":"highest syntactic roles to be what you're coreferent with."},{"from":2156.09,"to":2159.78,"location":2,"content":"So you're more likely to be coreferent with a subject than an object,"},{"from":2159.78,"to":2163.76,"location":2,"content":"and you're more likely to be coreferent with an object than something like"},{"from":2163.76,"to":2168.93,"location":2,"content":"a noun phrase that's inside a prepositional phrase that follows the object."},{"from":2168.93,"to":2171.86,"location":2,"content":"So we're gonna start from the left here and we're gonna"},{"from":2171.86,"to":2174.71,"location":2,"content":"say here's a noun phrase, Stephen Moss."},{"from":2174.71,"to":2176.52,"location":2,"content":"That's the first one we come to."},{"from":2176.52,"to":2180.26,"location":2,"content":"And then there's this clever bit of 
text that says,"},{"from":2180.26,"to":2184.82,"location":2,"content":"um, traverse all branches, um, below X,"},{"from":2184.82,"to":2187.08,"location":2,"content":"that are to the left- left to right,"},{"from":2187.08,"to":2191.04,"location":2,"content":"propose as antecedent any noun phrase, um,"},{"from":2191.04,"to":2197.22,"location":2,"content":"that has a noun phrase or sentence between it and the S. So it was saying,"},{"from":2197.22,"to":2198.99,"location":2,"content":"this will be a candidate,"},{"from":2198.99,"to":2200.52,"location":2,"content":"if and only if,"},{"from":2200.52,"to":2204.34,"location":2,"content":"there's some other noun phrase or S in-between."},{"from":2204.34,"to":2208.87,"location":2,"content":"Um, and so what that's saying is Stephen Moss hated him."},{"from":2208.87,"to":2212.21,"location":2,"content":"It- this him cannot refer back to"},{"from":2212.21,"to":2215.83,"location":2,"content":"Stephen Moss and that's sort of pretty much a fact of English syntax."},{"from":2215.83,"to":2220.09,"location":2,"content":"But what it's wanting to do is distinguish between,"},{"from":2220.09,"to":2222.94,"location":2,"content":"another thing that we could have had here was"},{"from":2222.94,"to":2229.1,"location":2,"content":"a noun phrase that had another possessive noun phrase inside it."},{"from":2229.1,"to":2237.59,"location":2,"content":"Um, so if we had something like Stephen Moss's mother hated him, right?"},{"from":2237.59,"to":2242.62,"location":2,"content":"Then the Stephen mother- Moss's mother hated him, then that would,"},{"from":2242.62,"to":2248.05,"location":2,"content":"in that case, it would be perfectly okay for him to be coreferent with Stephen Moss."},{"from":2248.05,"to":2251.12,"location":2,"content":"And the algorithm allows that because relative to"},{"from":2251.12,"to":2255.76,"location":2,"content":"this noun phrase is another noun phrase above it and between."},{"from":2255.76,"to":2258.2,"location":2,"content":"Okay. 
So that didn't work, um,"},{"from":2258.2,"to":2260.74,"location":2,"content":"as an antece- as an antecedent,"},{"from":2260.74,"to":2263.41,"location":2,"content":"so then we go onto the next step of the algorithm."},{"from":2263.41,"to":2264.92,"location":2,"content":"And then, the next step says,"},{"from":2264.92,"to":2269.09,"location":2,"content":"we should proceed backwards through preceding sentences,"},{"from":2269.09,"to":2270.59,"location":2,"content":"um, right to left."},{"from":2270.59,"to":2275.09,"location":2,"content":"And so that captures an important heuristic that proximity is actually"},{"from":2275.09,"to":2278.09,"location":2,"content":"a good heuristic to find coreference"},{"from":2278.09,"to":2282.11,"location":2,"content":"because coreference for pronouns is usually close by overall."},{"from":2282.11,"to":2285.05,"location":2,"content":"And so we go to the first sentence back."},{"from":2285.05,"to":2287.89,"location":2,"content":"And then in this sentence, again,"},{"from":2287.89,"to":2289.61,"location":2,"content":"we go into within the sentence,"},{"from":2289.61,"to":2293.34,"location":2,"content":"go left to right because there's the same kind of subject prominence role."},{"from":2293.34,"to":2295.68,"location":2,"content":"And so we're gonna start in this sentence,"},{"from":2295.68,"to":2296.99,"location":2,"content":"and we're gonna say okay,"},{"from":2296.99,"to":2298.71,"location":2,"content":"here's a noun phrase."},{"from":2298.71,"to":2301.32,"location":2,"content":"And now because we're in a different sentence,"},{"from":2301.32,"to":2303.36,"location":2,"content":"there's nothing wrong with this one."},{"from":2303.36,"to":2304.72,"location":2,"content":"So we say, aha,"},{"from":2304.72,"to":2307.34,"location":2,"content":"we have a candidate, Niall Ferguson,"},{"from":2307.34,"to":2312.39,"location":2,"content":"um, is a possible antecedent and it's the first one we found."},{"from":2312.39,"to":2315.7,"location":2,"content":"And therefore, we say that him refers back to Niall Ferguson."},{"from":2315.7,"to":2318.64,"location":2,"content":"And this algorithm actually gives the right answer,"},{"from":2318.64,"to":2320.75,"location":2,"content":"if you could follow along all of that."},{"from":2320.75,"to":2323.32,"location":2,"content":"Um, though that sounds like, um,"},{"from":2323.32,"to":2327.02,"location":2,"content":"horrible handwritten stuff."},{"from":2327.02,"to":2336.81,"location":2,"content":"But, um, so Jerry Hobbs was aware that this was horrible handwritten stuff,"},{"from":2336.81,"to":2341.8,"location":2,"content":"but he was interested in this algorithm for a couple of reasons."},{"from":2341.8,"to":2344.97,"location":2,"content":"I mean, reason one is, you know,"},{"from":2344.97,"to":2349.98,"location":2,"content":"this is actually one of the first places in natural language processing,"},{"from":2349.98,"to":2352.74,"location":2,"content":"that someone produced a baseline, right."},{"from":2352.74,"to":2355.62,"location":2,"content":"For final projects and elsewhere,"},{"from":2355.62,"to":2357.75,"location":2,"content":"um, and stuff we gave you, right,"},{"from":2357.75,"to":2361.26,"location":2,"content":"it's seen now in NLP and other areas,"},{"from":2361.26,"to":2362.49,"location":2,"content":"that anything you are doing,"},{"from":2362.49,"to":2364.62,"location":2,"content":"the first thing you should do is have a baseline,"},{"from":2364.62,"to":2367.44,"location":2,"content":"a simple system and see how 
well it works."},{"from":2367.44,"to":2371.95,"location":2,"content":"And this was his simple rule-based system for doing coreference,"},{"from":2371.95,"to":2376.57,"location":2,"content":"um, and he wanted to observe that actually this baseline was pretty good."},{"from":2376.57,"to":2380.93,"location":2,"content":"It actually gave the right answer a lot of the time."},{"from":2380.93,"to":2387.36,"location":2,"content":"And so the challenge was how to build a system that did better than this baseline."},{"from":2387.36,"to":2389.22,"location":2,"content":"And so he was well aware of it,"},{"from":2389.22,"to":2390.78,"location":2,"content":"you know, it was a dumb algorithm,"},{"from":2390.78,"to":2395.58,"location":2,"content":"but he proposed that as a good baseline for doing coreference resolution."},{"from":2395.58,"to":2397.92,"location":2,"content":"So what he was interested in,"},{"from":2397.92,"to":2400.98,"location":2,"content":"um, remember that we're back in the 1970s here,"},{"from":2400.98,"to":2406.86,"location":2,"content":"was how to do knowledge-based pronominal coreference resolution."},{"from":2406.86,"to":2412.25,"location":2,"content":"And so, um, essentially what he was noticing is well,"},{"from":2412.25,"to":2416.36,"location":2,"content":"these kinds of syntactic factors that I was mentioning prefer subjects,"},{"from":2416.36,"to":2418.43,"location":2,"content":"prefer close by, etc,"},{"from":2418.43,"to":2420.78,"location":2,"content":"they're all useful predictors."},{"from":2420.78,"to":2423.83,"location":2,"content":"But there are lots of cases where they don't give the right answer,"},{"from":2423.83,"to":2425.75,"location":2,"content":"and to know when they give, when,"},{"from":2425.75,"to":2429.01,"location":2,"content":"to know what's really the coreferent thing,"},{"from":2429.01,"to":2433,"location":2,"content":"you have to actually understand what's being described in the world."},{"from":2433,"to":2435.11,"location":2,"content":"So if I have this sentence,"},{"from":2435.11,"to":2439.01,"location":2,"content":"she poured water from the pitcher into the cup until it was full."},{"from":2439.01,"to":2444.53,"location":2,"content":"What is it coreferent with?"},{"from":2444.53,"to":2445.74,"location":2,"content":"Cup."},{"from":2445.74,"to":2446.7,"location":2,"content":"[NOISE] The cup."},{"from":2446.7,"to":2448.35,"location":2,"content":"Thank you. [LAUGHTER] Okay."},{"from":2448.35,"to":2450.87,"location":2,"content":"So that, it refers to the cup."},{"from":2450.87,"to":2453.22,"location":2,"content":"But then let's look at this example."},{"from":2453.22,"to":2457.53,"location":2,"content":"She poured water from the pitcher into the cup until it was empty."},{"from":2457.53,"to":2459.38,"location":2,"content":"What does it refer to?"},{"from":2459.38,"to":2459.9,"location":2,"content":"The [OVERLAPPING]."},{"from":2459.9,"to":2461.78,"location":2,"content":"The pitcher. 
[LAUGHTER] Okay."},{"from":2461.78,"to":2466.13,"location":2,"content":"So the crucial thing to notice in these two sentences is,"},{"from":2466.13,"to":2470.94,"location":2,"content":"these sentences have identical syntactic structure, right."},{"from":2470.94,"to":2475.7,"location":2,"content":"So Jerry Hobbs's algorithm can't possibly work,"},{"from":2475.7,"to":2478.36,"location":2,"content":"um, for both of these sentences."},{"from":2478.36,"to":2480.61,"location":2,"content":"It's gonna work for one of them,"},{"from":2480.61,"to":2482.55,"location":2,"content":"but not the other one."},{"from":2482.55,"to":2486.03,"location":2,"content":"Um, since it's working from left to right within a sentence,"},{"from":2486.03,"to":2488.86,"location":2,"content":"it's gonna say the pitcher both times actually, right."},{"from":2488.86,"to":2495.36,"location":2,"content":"So you can't get the answer right by Jerry Hobbs' algorithm and Jerry believed,"},{"from":2495.36,"to":2497.41,"location":2,"content":"and still believes, um,"},{"from":2497.41,"to":2500.47,"location":2,"content":"that the only way to get these kind of examples right,"},{"from":2500.47,"to":2503.41,"location":2,"content":"is actually if you understand the world,"},{"from":2503.41,"to":2506.64,"location":2,"content":"and you actually know what's going on in the world,"},{"from":2506.64,"to":2509.04,"location":2,"content":"so you can see what, what this is talking about."},{"from":2509.04,"to":2511.02,"location":2,"content":"And there are lots of examples like this."},{"from":2511.02,"to":2513.76,"location":2,"content":"Um, this is another very famous example."},{"from":2513.76,"to":2518.31,"location":2,"content":"The city council refused the women a permit because they feared violence."},{"from":2518.31,"to":2520.14,"location":2,"content":"Um, who does that they refer to?"},{"from":2520.14,"to":2521.55,"location":2,"content":"[inaudible]."},{"from":2521.55,"to":2523.13,"location":2,"content":"The city councilors."},{"from":2523.13,"to":2525.36,"location":2,"content":"Um, but here's another sentence."},{"from":2525.36,"to":2530.41,"location":2,"content":"The city council refused the women a permit because they advocated violence."},{"from":2530.41,"to":2532.78,"location":2,"content":"Who does that they refer to?"},{"from":2532.78,"to":2534,"location":2,"content":"The women."},{"from":2534,"to":2536.34,"location":2,"content":"The women. Okay. 
So this time it refers to the women."},{"from":2536.34,"to":2538.24,"location":2,"content":"Um, and again, you know,"},{"from":2538.24,"to":2544.18,"location":2,"content":"identical syntactic structure, it couldn't possibly be done right by the Hobbs algorithm."},{"from":2544.18,"to":2547.36,"location":2,"content":"Um, so this particular pair of examples,"},{"from":2547.36,"to":2549.27,"location":2,"content":"um, comes from Terry Winograd."},{"from":2549.27,"to":2551.61,"location":2,"content":"Um, how long ti- uh,"},{"from":2551.61,"to":2555.15,"location":2,"content":"so Terry Winograd was originally an NLP faculty, um,"},{"from":2555.15,"to":2559.41,"location":2,"content":"he sort of got disillusioned with NLP because it wasn't making much progress, um,"},{"from":2559.41,"to":2562.07,"location":2,"content":"and ventured off into the land of HCI,"},{"from":2562.07,"to":2563.88,"location":2,"content":"um, that became his career."},{"from":2563.88,"to":2566.25,"location":2,"content":"Um, but in his early work, um,"},{"from":2566.25,"to":2568.11,"location":2,"content":"he was interested in these phenomena,"},{"from":2568.11,"to":2570.15,"location":2,"content":"and came up with this example."},{"from":2570.15,"to":2572.95,"location":2,"content":"And so this example really stuck with people."},{"from":2572.95,"to":2576.03,"location":2,"content":"And so these kind of contrasts are referred to by"},{"from":2576.03,"to":2579.61,"location":2,"content":"other people as Winograd sentences or Winograd schema."},{"from":2579.61,"to":2584.51,"location":2,"content":"And so this is actually something that's interesting that's revived recently."},{"from":2584.51,"to":2587.19,"location":2,"content":"Um, so Hector Le- Levesque, um,"},{"from":2587.19,"to":2589.88,"location":2,"content":"wrote a paper, I guess five years ago now,"},{"from":2589.88,"to":2594.03,"location":2,"content":"where he was trying to advocate for a return to doing"},{"from":2594.03,"to":2598.51,"location":2,"content":"more in the way of knowledge and world modeling in artificial intelligence,"},{"from":2598.51,"to":2601.71,"location":2,"content":"and arguing that there are lots of problems that you just"},{"from":2601.71,"to":2605.59,"location":2,"content":"can't solve by the kind of crude statistical methods,"},{"from":2605.59,"to":2608.25,"location":2,"content":"that our machine learning systems are using."},{"from":2608.25,"to":2611.37,"location":2,"content":"And that you really needed to do more world understanding."},{"from":2611.37,"to":2613.08,"location":2,"content":"And so he proposed that"},{"from":2613.08,"to":2618.24,"location":2,"content":"these Winograd schema would be a good te- alternative to the Turing test,"},{"from":2618.24,"to":2620.76,"location":2,"content":"as a way of measuring intelligence."},{"from":2620.76,"to":2623.85,"location":2,"content":"And actually they're just coreference decisions, right."},{"from":2623.85,"to":2627.16,"location":2,"content":"So, um, so there's sort of a claim here that,"},{"from":2627.16,"to":2630.68,"location":2,"content":"if you can do a coreference right 100 percent of the time,"},{"from":2630.68,"to":2634.11,"location":2,"content":"you've solved artificial intelligence in that you're, sort of you can,"},{"from":2634.11,"to":2638.74,"location":2,"content":"can code knowledge of the world into coreference problems."},{"from":2638.74,"to":2644.01,"location":2,"content":"Um, yes so people have then tried to work on these Winograd 
schemas,"},{"from":2644.01,"to":2647.07,"location":2,"content":"and Levesque's feeling was, you know,"},{"from":2647.07,"to":2649.66,"location":2,"content":"you just couldn't do these,"},{"from":2649.66,"to":2651.24,"location":2,"content":"um, using kind of,"},{"from":2651.24,"to":2654.36,"location":2,"content":"the kind of statistical factors, um,"},{"from":2654.36,"to":2657.87,"location":2,"content":"that people put into their machine learning systems."},{"from":2657.87,"to":2662.52,"location":2,"content":"He was partly wrong about that because subsequent work, um,"},{"from":2662.52,"to":2667.05,"location":2,"content":"both neural systems and otherwise has shown that actually you can"},{"from":2667.05,"to":2672.16,"location":2,"content":"get f- a nontrivial distance with these kind of problems because, you know,"},{"from":2672.16,"to":2673.71,"location":2,"content":"if it is the case,"},{"from":2673.71,"to":2675.27,"location":2,"content":"um, that, you know,"},{"from":2675.27,"to":2678.12,"location":2,"content":"you can somehow see enough examples,"},{"from":2678.12,"to":2681.12,"location":2,"content":"where the city council refuses permits,"},{"from":2681.12,"to":2682.76,"location":2,"content":"fearing violence, you know."},{"from":2682.76,"to":2684.39,"location":2,"content":"If you've go- if you're collecting"},{"from":2684.39,"to":2688.56,"location":2,"content":"your neural language model over tens of billions of words,"},{"from":2688.56,"to":2691.53,"location":2,"content":"you might have seen some instances of things like that,"},{"from":2691.53,"to":2694.84,"location":2,"content":"and you could sort of predict it just on statistical patterning."},{"from":2694.84,"to":2696.01,"location":2,"content":"But the question is, you know,"},{"from":2696.01,"to":2698.34,"location":2,"content":"how far can you actually get doing that,"},{"from":2698.34,"to":2700.61,"location":2,"content":"without having a bit more of a world model?"},{"from":2700.61,"to":2702.03,"location":2,"content":"And so that was, you know,"},{"from":2702.03,"to":2705.7,"location":2,"content":"what Hobbs was interested in way back in 1978."},{"from":2705.7,"to":2710.01,"location":2,"content":"So he wrote, the naive approach is quite good,"},{"from":2710.01,"to":2714.81,"location":2,"content":"computationally speaking it will be a long time before a semantically based algorithm,"},{"from":2714.81,"to":2717.59,"location":2,"content":"is sophisticated enough to perform as well."},{"from":2717.59,"to":2721.65,"location":2,"content":"And these results set a very high standard for any other approach to aim for."},{"from":2721.65,"to":2723.99,"location":2,"content":"He was totally right about that, um,"},{"from":2723.99,"to":2728.66,"location":2,"content":"that it really wasn't until the 2010s that anybody"},{"from":2728.66,"to":2733.83,"location":2,"content":"managed to produce an algorithm for pronominal anaphora resolution,"},{"from":2733.83,"to":2735.9,"location":2,"content":"that outperformed the Hobbs algorithm."},{"from":2735.9,"to":2738.42,"location":2,"content":"Even though it was just, uh,"},{"from":2738.42,"to":2740.53,"location":2,"content":"what he called a naive algorithm,"},{"from":2740.53,"to":2744,"location":2,"content":"or he might call a crude set of linguistic rules."},{"from":2744,"to":2746.32,"location":2,"content":"Um, but he says,"},{"from":2746.32,"to":2750.09,"location":2,"content":"yet there is every reason to pursue a semantically based 
approach,"},{"from":2750.09,"to":2752.32,"location":2,"content":"the naive algorithm does not work."},{"from":2752.32,"to":2755.05,"location":2,"content":"Anyone can think of examples where it fails."},{"from":2755.05,"to":2757.41,"location":2,"content":"In these cases it not only fails,"},{"from":2757.41,"to":2759.66,"location":2,"content":"it gives no indication that it has failed,"},{"from":2759.66,"to":2763.17,"location":2,"content":"and offers no help in finding the real antecedent."},{"from":2763.17,"to":2765.51,"location":2,"content":"Um, so food for thought there."},{"from":2765.51,"to":2768.14,"location":2,"content":"Um, but, um, notwithstanding that,"},{"from":2768.14,"to":2770.4,"location":2,"content":"I'm gonna just rush ahead at this point,"},{"from":2770.4,"to":2772.35,"location":2,"content":"and tell you about some of the, um,"},{"from":2772.35,"to":2774.45,"location":2,"content":"statistical and neural algorithms,"},{"from":2774.45,"to":2777.12,"location":2,"content":"um, that have been used for coreference resolution."},{"from":2777.12,"to":2780.95,"location":2,"content":"So the simplest form of algorithm that's commonly used,"},{"from":2780.95,"to":2784.22,"location":2,"content":"is what is called mention pair models."},{"from":2784.22,"to":2789.14,"location":2,"content":"So what we mean by mention pair models is, um,"},{"from":2789.14,"to":2792.03,"location":2,"content":"we are gonna take pairs of mentions,"},{"from":2792.03,"to":2796.11,"location":2,"content":"and we're gonna train a binary classifier that says,"},{"from":2796.11,"to":2799.23,"location":2,"content":"is coreferent or isn't coreferent."},{"from":2799.23,"to":2804.11,"location":2,"content":"And so then we're gonna proceed left to right through the text."},{"from":2804.11,"to":2809.32,"location":2,"content":"And every time we get to a new mention,"},{"from":2809.32,"to":2815.89,"location":2,"content":"we're gonna then evaluate our classifier with respect to every preceding mention,"},{"from":2815.89,"to":2818.84,"location":2,"content":"and we're gonna say, are they coreferent?"},{"from":2818.84,"to":2821.24,"location":2,"content":"And it's gonna say yes or no."},{"from":2821.24,"to":2823.5,"location":2,"content":"And we're gonna find out that some of them."},{"from":2823.5,"to":2826.16,"location":2,"content":"It says yes for, um,"},{"from":2826.16,"to":2830.16,"location":2,"content":"I voted for Nader because he was like, most aligned with my value."},{"from":2830.16,"to":2833.1,"location":2,"content":"She said, if we have a good classifier,"},{"from":2833.1,"to":2838.47,"location":2,"content":"it will say yes to the two bu- blue ones and not to the rest of them."},{"from":2838.47,"to":2842.19,"location":2,"content":"Um, and so then we'll have at training time,"},{"from":2842.19,"to":2846.54,"location":2,"content":"negative examples that Nader and he are negative examples."},{"from":2846.54,"to":2850.86,"location":2,"content":"[NOISE] So if you have data marked for coreference,"},{"from":2850.86,"to":2853.53,"location":2,"content":"we have the sort of positive and negative examples,"},{"from":2853.53,"to":2855.12,"location":2,"content":"and we can train a model."},{"from":2855.12,"to":2856.98,"location":2,"content":"And so for training a model,"},{"from":2856.98,"to":2862.23,"location":2,"content":"we have a sort of the classifier outcome is one or zero,"},{"from":2862.23,"to":2865.66,"location":2,"content":"based on whether two mentions are 
coreferent."},{"from":2865.66,"to":2867.96,"location":2,"content":"We're gonna have a coreference model that"},{"from":2867.96,"to":2870.96,"location":2,"content":"predicts the probability of them being coreferent."},{"from":2870.96,"to":2874.35,"location":2,"content":"And we're gonna train it with the same kind of cross entropy loss,"},{"from":2874.35,"to":2877.28,"location":2,"content":"we've used other places and, um,"},{"from":2877.28,"to":2881.16,"location":2,"content":"try and learn a model that predicts coreference."},{"from":2881.16,"to":2884.55,"location":2,"content":"And so then when we get to test time, um,"},{"from":2884.55,"to":2888.49,"location":2,"content":"and we have a piece of text with mentions, um,"},{"from":2888.49,"to":2892.23,"location":2,"content":"we're gonna run this classifier and it's gonna say,"},{"from":2892.23,"to":2896.4,"location":2,"content":"um, yes or no, with some probability."},{"from":2896.4,"to":2899.2,"location":2,"content":"And if we pick a threshold like 0.5,"},{"from":2899.2,"to":2902.57,"location":2,"content":"we'll add certain coreference links."},{"from":2902.57,"to":2905.49,"location":2,"content":"And that sort of looks pretty good."},{"from":2905.49,"to":2909.48,"location":2,"content":"Um, but we're gonna sort of complete it off by saying well,"},{"from":2909.48,"to":2914.28,"location":2,"content":"if A is coreferent to B and B is K coreferent to C. Then really"},{"from":2914.28,"to":2920.04,"location":2,"content":"also A is coreferent to C. So we're gonna do a transitive closure,"},{"from":2920.04,"to":2922.41,"location":2,"content":"and that will give us our clustering."},{"from":2922.41,"to":2926.16,"location":2,"content":"Um, note here that there's a certain danger in this."},{"from":2926.16,"to":2928.65,"location":2,"content":"Because this means, if we make,"},{"from":2928.65,"to":2931.76,"location":2,"content":"since we're sor- with the transitive closure,"},{"from":2931.76,"to":2934.43,"location":2,"content":"that's always adding clustering links."},{"from":2934.43,"to":2938.31,"location":2,"content":"And so that means the danger is that we're gonna over cluster,"},{"from":2938.31,"to":2944.4,"location":2,"content":"because if we make a single mistake and we link things that should be kept separate."},{"from":2944.4,"to":2946.92,"location":2,"content":"So for example, if we wrongly said,"},{"from":2946.92,"to":2948.87,"location":2,"content":"he and my are coreferent,"},{"from":2948.87,"to":2950.57,"location":2,"content":"then everything of this, um,"},{"from":2950.57,"to":2953.77,"location":2,"content":"discourse would collapse together into one cluster,"},{"from":2953.77,"to":2956.92,"location":2,"content":"and everything would be deemed coreferent."},{"from":2956.92,"to":2960.99,"location":2,"content":"Okay, um, and this,"},{"from":2960.99,"to":2965.48,"location":2,"content":"something that I haven't really emphasized, but comes up,"},{"from":2965.48,"to":2970.07,"location":2,"content":"is well, there's some mentions that are coreferent to nothing, right."},{"from":2970.07,"to":2972.89,"location":2,"content":"In the Shruthi Rao story, there was a park,"},{"from":2972.89,"to":2975.88,"location":2,"content":"which was just mentioned once in the text, and so on,"},{"from":2975.88,"to":2978.19,"location":2,"content":"in this form of algorithm,"},{"from":2978.19,"to":2981.54,"location":2,"content":"what we'd like the classifier to say is, no,"},{"from":2981.54,"to":2982.76,"location":2,"content":"no, no, no, 
no,"},{"from":2982.76,"to":2984.61,"location":2,"content":"for all of the decisions."},{"from":2984.61,"to":2986.97,"location":2,"content":"And so it's deemed coreferent to nothing."},{"from":2986.97,"to":2989.91,"location":2,"content":"And then it's just a singleton mention."},{"from":2989.91,"to":2992.36,"location":2,"content":"This sort of works,"},{"from":2992.36,"to":2998.66,"location":2,"content":"but it hasn't proven to be the best way of doing coreference."},{"from":2998.66,"to":3003.59,"location":2,"content":"And a lot of the reason why it's not the best way to do coreference"},{"from":3003.59,"to":3009.41,"location":2,"content":"is because we have this phenomenon of anaphora where we have textural dependence."},{"from":3009.41,"to":3011.07,"location":2,"content":"A lot of the time,"},{"from":3011.07,"to":3014.53,"location":2,"content":"it seems that we're not really,"},{"from":3014.53,"to":3019.82,"location":2,"content":"um, what- sort of wanting to make this all coreference decisions."},{"from":3019.82,"to":3024.18,"location":2,"content":"We'd like to make the anaphora decisions of textural dependence."},{"from":3024.18,"to":3027.41,"location":2,"content":"So we'd like to say that he is,"},{"from":3027.41,"to":3033.14,"location":2,"content":"um, dependent on Nader and my is dependent on I."},{"from":3033.14,"to":3035.11,"location":2,"content":"These are anaphora relationships."},{"from":3035.11,"to":3041.28,"location":2,"content":"So we'd like to just choose one example of what is this anaphora relationship."},{"from":3041.28,"to":3044.87,"location":2,"content":"And so that's led to people then looking at what is called,"},{"from":3044.87,"to":3047.47,"location":2,"content":"um, Mention Pair Models, right?"},{"from":3047.47,"to":3052.1,"location":2,"content":"That the problem is that if we have a long document with lots of mentions,"},{"from":3052.1,"to":3057.05,"location":2,"content":"um, that we want to not be saying- trying to find all of them and say, yes."},{"from":3057.05,"to":3060.16,"location":2,"content":"We just want to be saying there's a particular- we"},{"from":3060.16,"to":3063.7,"location":2,"content":"just want to be saying that there's a particular one."},{"from":3063.7,"to":3065.77,"location":2,"content":"So for the he at the end here,"},{"from":3065.77,"to":3071.35,"location":2,"content":"its anaphor relationship is back to Nader and you don't wanna be trying to say this"},{"from":3071.35,"to":3077.6,"location":2,"content":"he is also coreferent back to all of these other things that are earlier in the text."},{"from":3077.6,"to":3082.36,"location":2,"content":"So it's not something that's been explored much."},{"from":3082.36,"to":3084.94,"location":2,"content":"But arguably, this is a case again,"},{"from":3084.94,"to":3090.91,"location":2,"content":"where you should be separating coreference from anaphors because for anaphors it seems like"},{"from":3090.91,"to":3092.98,"location":2,"content":"the right way to think is that they have"},{"from":3092.98,"to":3097.63,"location":2,"content":"one prior thing in the text that they're textually dependent on."},{"from":3097.63,"to":3103.34,"location":2,"content":"Whereas true coreferents, when you just have various mentions in the text of Ralph Nader,"},{"from":3103.34,"to":3104.65,"location":2,"content":"this Ralph Nader that,"},{"from":3104.65,"to":3108.05,"location":2,"content":"Nader did that, those aren't textually dependent"},{"from":3108.05,"to":3112.07,"location":2,"content":"and they should all be being 
grouped together as coreferents."},{"from":3112.07,"to":3118.52,"location":2,"content":"Um, but our models sort of don't normally try and do some one way and some the other way,"},{"from":3118.52,"to":3120.72,"location":2,"content":"but you choose one of the models."},{"from":3120.72,"to":3122.82,"location":2,"content":"So in the other one,"},{"from":3122.82,"to":3126.01,"location":2,"content":"we do it for- to do the other way,"},{"from":3126.01,"to":3128.16,"location":2,"content":"you do what's called mention ranking."},{"from":3128.16,"to":3129.7,"location":2,"content":"So for mention ranking,"},{"from":3129.7,"to":3133.4,"location":2,"content":"the idea is for each mention,"},{"from":3133.4,"to":3135.55,"location":2,"content":"we're going to find- try and find it"},{"from":3135.55,"to":3140.64,"location":2,"content":"an antecedent that comes before- before it in the text,"},{"from":3140.64,"to":3142.65,"location":2,"content":"that is- that it is, um,"},{"from":3142.65,"to":3146.81,"location":2,"content":"coreferent with, and we're going to make a one of N decision."},{"from":3146.81,"to":3149.51,"location":2,"content":"So that when we see she here,"},{"from":3149.51,"to":3150.82,"location":2,"content":"we're going to say,"},{"from":3150.82,"to":3155.4,"location":2,"content":"\"Okay, um, what is this coreferent with?\""},{"from":3155.4,"to":3157.66,"location":2,"content":"And we're going to pick one thing that it's coreferent"},{"from":3157.66,"to":3161.13,"location":2,"content":"with even though there might be others in the text."},{"from":3161.13,"to":3164.16,"location":2,"content":"Um, so if we're doing that,"},{"from":3164.16,"to":3167.49,"location":2,"content":"we then have a problem with singleton mentions because if"},{"from":3167.49,"to":3171.16,"location":2,"content":"we're trying to- for every mention we find say,"},{"from":3171.16,"to":3175.43,"location":2,"content":"choose the thing that came before it in the text with which it's coreferent,"},{"from":3175.43,"to":3178.57,"location":2,"content":"the right answer might be that there's no such thing."},{"from":3178.57,"to":3180.58,"location":2,"content":"So what we do is we add"},{"from":3180.58,"to":3186.41,"location":2,"content":"one additional dummy mention right at the front here, the NA mention."},{"from":3186.41,"to":3191.34,"location":2,"content":"So one choice is you're gonna say there isn't anything preceding."},{"from":3191.34,"to":3193.93,"location":2,"content":"So effectively, when you get to I,"},{"from":3193.93,"to":3197.53,"location":2,"content":"since this is, um, the first, um,"},{"from":3197.53,"to":3199.26,"location":2,"content":"real mention in the text,"},{"from":3199.26,"to":3201.78,"location":2,"content":"you're necessarily gonna choose as,"},{"from":3201.78,"to":3204.26,"location":2,"content":"um, its antecedent NA."},{"from":3204.26,"to":3207.82,"location":2,"content":"You then go on to Nader and you have two choices."},{"from":3207.82,"to":3214.11,"location":2,"content":"You can either say it's coreferent to I or it's coreferent to NA."},{"from":3214.11,"to":3218.41,"location":2,"content":"i.e., it's a new mention- a new entity that's being mentioned in the text and"},{"from":3218.41,"to":3223.28,"location":2,"content":"the right answer is it's a new mention in- a new entity being mentioned in the text."},{"from":3223.28,"to":3226.93,"location":2,"content":"Then you get to he and now you have three choices,"},{"from":3226.93,"to":3231.11,"location":2,"content":"and the right thing is to say that it's 
coreferent to Nader."},{"from":3231.11,"to":3235.2,"location":2,"content":"Okay. Um, so this time,"},{"from":3235.2,"to":3237.64,"location":2,"content":"it's- for training our models,"},{"from":3237.64,"to":3240.53,"location":2,"content":"it's sort of the same, um,"},{"from":3240.53,"to":3244.51,"location":2,"content":"apart from this, sort of this different one of semantics."},{"from":3244.51,"to":3249.82,"location":2,"content":"So now- previously, we wanted to say that for our, um,"},{"from":3249.82,"to":3255.03,"location":2,"content":"mention pair classifier that is going to try and classify I and she,"},{"from":3255.03,"to":3256.36,"location":2,"content":"and my and she,"},{"from":3256.36,"to":3259.22,"location":2,"content":"and both of them had to get a high score,"},{"from":3259.22,"to":3262.03,"location":2,"content":"where now it's sufficient that just one of them gets"},{"from":3262.03,"to":3266.15,"location":2,"content":"a high score because that's sort of enough for us to do."},{"from":3266.15,"to":3270.61,"location":2,"content":"So what we're gonna use is our good old softmax and so for she,"},{"from":3270.61,"to":3274.55,"location":2,"content":"we're gonna put a softmax over the antecedents."},{"from":3274.55,"to":3279.66,"location":2,"content":"And our hope is simply that we get a high probability with one of the antecedents,"},{"from":3279.66,"to":3283.7,"location":2,"content":"if it has an antecedent or a high score with NA,"},{"from":3283.7,"to":3286.76,"location":2,"content":"if it doesn't have any prior referents."},{"from":3286.76,"to":3291.36,"location":2,"content":"And so then when we're doing classification at run-time,"},{"from":3291.36,"to":3296.36,"location":2,"content":"we're going to sort of add only the highest scoring coreference link."},{"from":3296.36,"to":3298.99,"location":2,"content":"So that means we train it just slightly"},{"from":3298.99,"to":3303.18,"location":2,"content":"differently because now what we're going to do is that,"},{"from":3303.18,"to":3306.2,"location":2,"content":"when we're- what we're wanting to say is,"},{"from":3306.2,"to":3313.03,"location":2,"content":"we want a high score of coreference between at least one of the antecedents."},{"from":3313.03,"to":3315.01,"location":2,"content":"And so one possible model is,"},{"from":3315.01,"to":3317.3,"location":2,"content":"we can maximize this probability."},{"from":3317.3,"to":3321.11,"location":2,"content":"So for the ones that are coreferent in the gold standard data,"},{"from":3321.11,"to":3324.89,"location":2,"content":"we want the sum of their assigned probabilities to be high."},{"from":3324.89,"to":3331.14,"location":2,"content":"And so what that means is that it's sort of sufficient if we have,"},{"from":3331.14,"to":3334.3,"location":2,"content":"um, one of them giving"},{"from":3334.3,"to":3338.38,"location":2,"content":"a high probability and they don't all have to give a high probability."},{"from":3338.38,"to":3341.22,"location":2,"content":"So providing it's giving 0.9 probability,"},{"from":3341.22,"to":3343.53,"location":2,"content":"say it a one of the correct antecedents,"},{"from":3343.53,"to":3345.7,"location":2,"content":"we're getting a high score."},{"from":3345.7,"to":3349.66,"location":2,"content":"Okay. 
So we're gonna turn that into a loss function in the kind of"},{"from":3349.66,"to":3353.59,"location":2,"content":"standard way we do in which we take log probabilities,"},{"from":3353.59,"to":3356.26,"location":2,"content":"um, and then we want to, um,"},{"from":3356.26,"to":3358.59,"location":2,"content":"or negative log probabilities to give us"},{"from":3358.59,"to":3362.15,"location":2,"content":"a loss and then we're wanting to minimize that loss."},{"from":3362.15,"to":3365.91,"location":2,"content":"So with the mention ranking model,"},{"from":3365.91,"to":3367.87,"location":2,"content":"um, at test time,"},{"from":3367.87,"to":3369.36,"location":2,"content":"it's pretty much the same,"},{"from":3369.36,"to":3376.28,"location":2,"content":"but our softmax classifier is just going to assign one antecedent for each mention."},{"from":3376.28,"to":3380.47,"location":2,"content":"And so we're then gonna hope that those sort of give us the kind"},{"from":3380.47,"to":3385.82,"location":2,"content":"of clusters that we want and there's no subsequent clustering phase."},{"from":3385.82,"to":3390.88,"location":2,"content":"So there's a big part of this that I left out which was,"},{"from":3390.88,"to":3392.51,"location":2,"content":"I've just said, \"Okay,"},{"from":3392.51,"to":3398.59,"location":2,"content":"we have this probability of MI and MJ as the- are they coreferent?\""},{"from":3398.59,"to":3401.05,"location":2,"content":"But I've sort of said, zero as to"},{"from":3401.05,"to":3403.76,"location":2,"content":"how you can determine whether they're coreferent or not."},{"from":3403.76,"to":3406.64,"location":2,"content":"Um, so briefly, um,"},{"from":3406.64,"to":3409.78,"location":2,"content":"here- here's the classical way of doing it."},{"from":3409.78,"to":3411.89,"location":2,"content":"The classical way of doing it is,"},{"from":3411.89,"to":3416.09,"location":2,"content":"you had a whole bunch of features and you had"},{"from":3416.09,"to":3420.48,"location":2,"content":"a feature based statistical classifier which gave a score."},{"from":3420.48,"to":3422.65,"location":2,"content":"And these are the kind of features you could use."},{"from":3422.65,"to":3426.49,"location":2,"content":"So there are sort of strong features of person, number, gender agreement."},{"from":3426.49,"to":3429.72,"location":2,"content":"So if you have a masculine or feminine pronoun,"},{"from":3429.72,"to":3432.29,"location":2,"content":"you wanna find an appropriate antecedent for it."},{"from":3432.29,"to":3436.63,"location":2,"content":"There are weaker, um, semantic compatibility features."},{"from":3436.63,"to":3438.98,"location":2,"content":"So the mining conglomerate, the company,"},{"from":3438.98,"to":3441.76,"location":2,"content":"the conglomerate might be sort of similar to a company."},{"from":3441.76,"to":3445.7,"location":2,"content":"You could use something like word2vec similarity and assess that."},{"from":3445.7,"to":3448.12,"location":2,"content":"There are syntactic constraints."},{"from":3448.12,"to":3450.73,"location":2,"content":"So this is then kind of like, um,"},{"from":3450.73,"to":3454.57,"location":2,"content":"what Hobbs's algorithm was all about us working out"},{"from":3454.57,"to":3458.7,"location":2,"content":"how likely different syntactic configurations are gonna mean coreference."},{"from":3458.7,"to":3460.95,"location":2,"content":"And indeed it is the case, you know,"},{"from":3460.95,"to":3466.15,"location":2,"content":"that a lot of these feature-based 
systems used Hobbs' algorithm as a feature inside"},{"from":3466.15,"to":3471.86,"location":2,"content":"the system that was weighted and was normally a very strong feature to decide coreference."},{"from":3471.86,"to":3475.43,"location":2,"content":"Um, there are lots of other things you can put in as features."},{"from":3475.43,"to":3476.86,"location":2,"content":"Um, recency."},{"from":3476.86,"to":3478.24,"location":2,"content":"So John went to a movie,"},{"from":3478.24,"to":3479.29,"location":2,"content":"Jack went as well,"},{"from":3479.29,"to":3480.6,"location":2,"content":"he was not busy."},{"from":3480.6,"to":3485.18,"location":2,"content":"The most likely referent for he is the closer candidate Jack."},{"from":3485.18,"to":3490.12,"location":2,"content":"Um, I've mentioned subjects are more likely to be, um, the antecedent."},{"from":3490.12,"to":3491.55,"location":2,"content":"John went to a movie with Jack,"},{"from":3491.55,"to":3492.93,"location":2,"content":"he was not busy."},{"from":3492.93,"to":3495.99,"location":2,"content":"Um, John seems a more likely antecedent."},{"from":3495.99,"to":3498.16,"location":2,"content":"So that's the sort of subject preference."},{"from":3498.16,"to":3500.39,"location":2,"content":"There's also a parallelism preference."},{"from":3500.39,"to":3502.3,"location":2,"content":"So John went with Jack to a movie,"},{"from":3502.3,"to":3504.17,"location":2,"content":"Joe went with him to a bar."},{"from":3504.17,"to":3508.41,"location":2,"content":"I think it's sort of reasonable to think that him there is probably Jack,"},{"from":3508.41,"to":3512.82,"location":2,"content":"and that's sort of for parallelism reasons as opposed to going with the subject."},{"from":3512.82,"to":3516.48,"location":2,"content":"So there are various kind of linguistic features and constraints and so on,"},{"from":3516.48,"to":3519.97,"location":2,"content":"and you can throw these all into a statistical classifier and that's"},{"from":3519.97,"to":3524.91,"location":2,"content":"sort of 2000s decade coref systems as to how they're built."},{"from":3524.91,"to":3529.35,"location":2,"content":"Um, more recently, people have built neural systems."},{"from":3529.35,"to":3530.56,"location":2,"content":"And so for these,"},{"from":3530.56,"to":3533.81,"location":2,"content":"we are kind of normally using the same kind of embeddings."},{"from":3533.81,"to":3538.25,"location":2,"content":"So we'll have a candidate antecedent that will have embeddings,"},{"from":3538.25,"to":3540.51,"location":2,"content":"we'll have a mention that has embeddings."},{"from":3540.51,"to":3541.78,"location":2,"content":"And this will be something like"},{"from":3541.78,"to":3545.59,"location":2,"content":"average word vectors or something like that for the mention."},{"from":3545.59,"to":3549.93,"location":2,"content":"And we're gonna feed these into a neural network that will give us our score."},{"from":3549.93,"to":3553.07,"location":2,"content":"But what you find is that"},{"from":3553.07,"to":3556.99,"location":2,"content":"most of these systems as well as having something like word vectors,"},{"from":3556.99,"to":3560.61,"location":2,"content":"they also have additional features, um,"},{"from":3560.61,"to":3563.91,"location":2,"content":"and these features still capture some of"},{"from":3563.91,"to":3568.26,"location":2,"content":"the things that were in the feature-based statistical classifiers."},{"from":3568.26,"to":3571.86,"location":2,"content":"So there will be often features 
that reflect things like,"},{"from":3571.86,"to":3577.27,"location":2,"content":"what grammatical relation does this mention have? Is it a subject?"},{"from":3577.27,"to":3578.51,"location":2,"content":"Is it an object?"},{"from":3578.51,"to":3582.82,"location":2,"content":"That's something you could put into the features of a mention."},{"from":3582.82,"to":3586.57,"location":2,"content":"But then, closer things are more likely to be coreferent."},{"from":3586.57,"to":3591.27,"location":2,"content":"So you might have additional features here which record how far apart dimensions are,"},{"from":3591.27,"to":3594.18,"location":2,"content":"and those things get thrown in as well."},{"from":3594.18,"to":3600.93,"location":2,"content":"Um, and so these kind of features are still important even in neural systems."},{"from":3600.93,"to":3607.16,"location":2,"content":"And so I'll skip ahead now and show you a bit about, um,"},{"from":3607.16,"to":3611.45,"location":2,"content":"what is the kind of current state of the art for coreference resolution,"},{"from":3611.45,"to":3614.95,"location":2,"content":"and this was a system that was done at the University of Washington in"},{"from":3614.95,"to":3620.49,"location":2,"content":"2017 by Kenton Lee and assorted other, um, authors."},{"from":3620.49,"to":3626.91,"location":2,"content":"Um, so the goal here was to produce an end-to-end coreference system that it was text in,"},{"from":3626.91,"to":3630.72,"location":2,"content":"um, mention clusters that are coreferent out."},{"from":3630.72,"to":3635.34,"location":2,"content":"Um, and so they're wanting to use sort of a more complex"},{"from":3635.34,"to":3640.42,"location":2,"content":"neural network that can do the whole thing end-to-end. So I'll go through,"},{"from":3640.42,"to":3641.73,"location":2,"content":"um, the steps of that."},{"from":3641.73,"to":3645.99,"location":2,"content":"So the first step is we just start off with words."},{"from":3645.99,"to":3647.7,"location":2,"content":"And so for each word,"},{"from":3647.7,"to":3653.02,"location":2,"content":"we're going to look up a word embedding for it and that's in other stuff we've seen."},{"from":3653.02,"to":3655.8,"location":2,"content":"We're also going to put in a character level CNN,"},{"from":3655.8,"to":3660.89,"location":2,"content":"and the two of those concatenated are going to give the representation of each token."},{"from":3660.89,"to":3662.6,"location":2,"content":"That much should look familiar."},{"from":3662.6,"to":3665.07,"location":2,"content":"Okay. 
Then after that,"},{"from":3665.07,"to":3671.26,"location":2,"content":"we're going to run a deep bidirectional LSTM back and forth across the sentence."},{"from":3671.26,"to":3675.44,"location":2,"content":"Again, that should look familiar from stuff that we've seen before."},{"from":3675.44,"to":3681.7,"location":2,"content":"Um, the next step gets us a bit into doing something more special, um,"},{"from":3681.7,"to":3684.11,"location":2,"content":"For coreference."},{"from":3684.11,"to":3690.79,"location":2,"content":"So what they wanted to do after that is have a representation for spans."},{"from":3690.79,"to":3692.64,"location":2,"content":"And so by span,"},{"from":3692.64,"to":3698.05,"location":2,"content":"we mean any contiguous subphrase of the word, of the sentence."},{"from":3698.05,"to":3700.03,"location":2,"content":"So this is a span."},{"from":3700.03,"to":3701.38,"location":2,"content":"This is a span."},{"from":3701.38,"to":3702.67,"location":2,"content":"This is a span."},{"from":3702.67,"to":3706.32,"location":2,"content":"Electric said the postal is a span, every sub-sequence."},{"from":3706.32,"to":3708.25,"location":2,"content":"Um, so I'll come back to that."},{"from":3708.25,"to":3710.22,"location":2,"content":"But, you know, they'll- in principle,"},{"from":3710.22,"to":3713.16,"location":2,"content":"you're working this out for every sub-sequence."},{"from":3713.16,"to":3715.36,"location":2,"content":"So for every sub-sequence,"},{"from":3715.36,"to":3718.68,"location":2,"content":"they want to come up with a span representation."},{"from":3718.68,"to":3724.97,"location":2,"content":"And so this span representation is going to be in three parts,"},{"from":3724.97,"to":3729.32,"location":2,"content":"um, that represent one of these sub-sequences."},{"from":3729.32,"to":3733.66,"location":2,"content":"Um, so each of these will get its own representation."},{"from":3733.66,"to":3735.64,"location":2,"content":"And so the question is, what?"},{"from":3735.64,"to":3738.91,"location":2,"content":"And so we have this span representation,"},{"from":3738.91,"to":3742.38,"location":2,"content":"and it's gonna be in these three parts here."},{"from":3742.38,"to":3746.77,"location":2,"content":"Um, so what these parts are is,"},{"from":3746.77,"to":3748.42,"location":2,"content":"well, first of all,"},{"from":3748.42,"to":3751.45,"location":2,"content":"we're going to have a representation, um,"},{"from":3751.45,"to":3755.03,"location":2,"content":"which is just looking at the first word of"},{"from":3755.03,"to":3760.45,"location":2,"content":"the span and the last word of the span according to the BiLSTM."},{"from":3760.45,"to":3763.14,"location":2,"content":"So if we're looking at the span, the postal service,"},{"from":3763.14,"to":3765.67,"location":2,"content":"we're going to take this BiLSTM and"},{"from":3765.67,"to":3770.24,"location":2,"content":"this BiLSTM and use them as part of the representation of the span."},{"from":3770.24,"to":3772.22,"location":2,"content":"Um, that's a good start,"},{"from":3772.22,"to":3774.73,"location":2,"content":"but then they actually do something a little tricky."},{"from":3774.73,"to":3779.71,"location":2,"content":"So kind of like when we're doing dependency parsing, the idea was,"},{"from":3779.71,"to":3782.82,"location":2,"content":"well, phrases are going to have a headword,"},{"from":3782.82,"to":3785.05,"location":2,"content":"um, so that if it's,"},{"from":3785.05,"to":3790.42,"location":2,"content":"um, you know, my 
younger sister that the headword of that is sister,"},{"from":3790.42,"to":3794.91,"location":2,"content":"and there- if it's something like the goat in the corner of the field,"},{"from":3794.91,"to":3796.96,"location":2,"content":"the headword of that is going to be goat."},{"from":3796.96,"to":3801.52,"location":2,"content":"So they want to find a way of capturing headwords out of the text."},{"from":3801.52,"to":3806.2,"location":2,"content":"Um, and so what they're going to do for that is use attention."},{"from":3806.2,"to":3810.82,"location":2,"content":"So they're going to say we have this span, the postal service,"},{"from":3810.82,"to":3813.64,"location":2,"content":"and we're going to use attention as"},{"from":3813.64,"to":3818.05,"location":2,"content":"a span internal mechanism to sort of approximate a head."},{"from":3818.05,"to":3822.2,"location":2,"content":"So what we're going to do, uh, here,"},{"from":3822.2,"to":3825.52,"location":2,"content":"what we're going to do is we're going to want to"},{"from":3825.52,"to":3830.02,"location":2,"content":"learn attention weights, I'm just gonna, yeah."},{"from":3830.02,"to":3834.22,"location":2,"content":"Um, what we're gonna do is for this span, um,"},{"from":3834.22,"to":3838.81,"location":2,"content":"we're going to be learning based on the, um,"},{"from":3838.81,"to":3843.25,"location":2,"content":"the ends of the span which words to pay how much attention to."},{"from":3843.25,"to":3846.97,"location":2,"content":"So we're gonna put attention weights on the different words,"},{"from":3846.97,"to":3849.73,"location":2,"content":"and then we're going to, in the usual attention way,"},{"from":3849.73,"to":3855.22,"location":2,"content":"make this weighted sum of having put the word pair-"},{"from":3855.22,"to":3859.36,"location":2,"content":"the bidirectional LSTM pairs through a feed-forward network and end"},{"from":3859.36,"to":3863.7,"location":2,"content":"up with this new representation of a weighted representation."},{"from":3863.7,"to":3865.45,"location":2,"content":"And the hope is that in this case,"},{"from":3865.45,"to":3868.63,"location":2,"content":"most of the weight will go on this final service,"},{"from":3868.63,"to":3870.45,"location":2,"content":"which will be the headword."},{"from":3870.45,"to":3872.76,"location":2,"content":"But it'll be sort of distributed across it."},{"from":3872.76,"to":3876.05,"location":2,"content":"And so that gives them a model of"},{"from":3876.05,"to":3881.88,"location":2,"content":"sort of mentions that use both ends and hope to find the key word of the mention."},{"from":3881.88,"to":3886.01,"location":2,"content":"Okay. 
Um, so, um,"},{"from":3886.01,"to":3888.01,"location":2,"content":"that's two-thirds of the span,"},{"from":3888.01,"to":3891.24,"location":2,"content":"but they still have over here these additional features."},{"from":3891.24,"to":3894.24,"location":2,"content":"And so they still have some additional features."},{"from":3894.24,"to":3898.2,"location":2,"content":"They want to be able to mark speakers and addressees."},{"from":3898.2,"to":3902.2,"location":2,"content":"Um, they want to mark other things like the grammatical role."},{"from":3902.2,"to":3903.97,"location":2,"content":"But if things occur, you know,"},{"from":3903.97,"to":3907.24,"location":2,"content":"it is still useful to have some additional features."},{"from":3907.24,"to":3908.62,"location":2,"content":"And so what they do is,"},{"from":3908.62,"to":3911.86,"location":2,"content":"this is a representation of each span,"},{"from":3911.86,"to":3916.39,"location":2,"content":"and then they're going to want to say are two spans coreferent."},{"from":3916.39,"to":3922.12,"location":2,"content":"And so they're going to have one score for the two, two split, each of two spans,"},{"from":3922.12,"to":3923.36,"location":2,"content":"which is essentially saying,"},{"from":3923.36,"to":3924.85,"location":2,"content":"is that a good mention?"},{"from":3924.85,"to":3926.92,"location":2,"content":"And then you're going to have scores of,"},{"from":3926.92,"to":3929.17,"location":2,"content":"do they look coreferent?"},{"from":3929.17,"to":3934.87,"location":2,"content":"And so having calculated these representations for each span,"},{"from":3934.87,"to":3939.22,"location":2,"content":"you're running three- through things through a fully connected feed-forward network,"},{"from":3939.22,"to":3941.24,"location":2,"content":"multiplying by a weight factor,"},{"from":3941.24,"to":3942.49,"location":2,"content":"and that's giving you, uh,"},{"from":3942.49,"to":3944.64,"location":2,"content":"is that a good mention score?"},{"from":3944.64,"to":3946.96,"location":2,"content":"And then for are they coreferent,"},{"from":3946.96,"to":3949.33,"location":2,"content":"you're taking two spans,"},{"from":3949.33,"to":3953.65,"location":2,"content":"the pointwise Hadamard product of two spans and"},{"from":3953.65,"to":3956.14,"location":2,"content":"some extra features like distance apart in"},{"from":3956.14,"to":3959.47,"location":2,"content":"the text and putting them through another neural network,"},{"from":3959.47,"to":3961.28,"location":2,"content":"and that's then giving you, are"},{"from":3961.28,"to":3963.47,"location":2,"content":"these two spans coreferent?"},{"from":3963.47,"to":3965.59,"location":2,"content":"But all of these pieces,"},{"from":3965.59,"to":3970.48,"location":2,"content":"um, give you an overall loss function."},{"from":3970.48,"to":3974.82,"location":2,"content":"So you can say that your model is, um, okay."},{"from":3974.82,"to":3976.89,"location":2,"content":"We're going to run these LSTMs,"},{"from":3976.89,"to":3978.88,"location":2,"content":"we're going to take all spans,"},{"from":3978.88,"to":3980.83,"location":2,"content":"we're going to score this,"},{"from":3980.83,"to":3984.37,"location":2,"content":"and we know the gold answer for our coreference system."},{"from":3984.37,"to":3989.2,"location":2,"content":"And so we want to be predicting things that are coreferent and have"},{"from":3989.2,"to":3994.38,"location":2,"content":"a loss based on the probability that we calculate with these 
scores,"},{"from":3994.38,"to":3995.77,"location":2,"content":"um, as I had mentioned,"},{"from":3995.77,"to":3999.2,"location":2,"content":"ranking model using a softmax loss like before."},{"from":3999.2,"to":4002.78,"location":2,"content":"So if you put all of this together and train it end to end,"},{"from":4002.78,"to":4008.68,"location":2,"content":"you've got a whole coreference system that goes from words to coreference decisions."},{"from":4008.68,"to":4012.05,"location":2,"content":"Um, there's a huge problem with that,"},{"from":4012.05,"to":4015.81,"location":2,"content":"um, which is if you actually applied this naively, well,"},{"from":4015.81,"to":4018.93,"location":2,"content":"the problem is the number of spans in a piece of"},{"from":4018.93,"to":4023.05,"location":2,"content":"text is the square of the length of the text in words."},{"from":4023.05,"to":4026.4,"location":2,"content":"And so therefore, if you're making coreference decisions,"},{"from":4026.4,"to":4030.06,"location":2,"content":"which are between, um, pairs of spans,"},{"from":4030.06,"to":4033.06,"location":2,"content":"you've then got an algorithm that's, um,"},{"from":4033.06,"to":4035.41,"location":2,"content":"O- OT to the fourth,"},{"from":4035.41,"to":4038.03,"location":2,"content":"where the length of the text is T words."},{"from":4038.03,"to":4042.38,"location":2,"content":"So that's sort of really, really computationally impractical."},{"from":4042.38,"to":4043.51,"location":2,"content":"So at this point,"},{"from":4043.51,"to":4046.35,"location":2,"content":"they sort of say, well, actually,"},{"from":4046.35,"to":4049.91,"location":2,"content":"we do want to use our mouths a little and we want to work out"},{"from":4049.91,"to":4054.09,"location":2,"content":"how likely different things are to be mentions."},{"from":4054.09,"to":4058.65,"location":2,"content":"So effectively, um, then they're putting in a lot of pruning to"},{"from":4058.65,"to":4063.76,"location":2,"content":"decide which spans are actually things that they want to consider in their model."},{"from":4063.76,"to":4065.73,"location":2,"content":"And so at this point, in some sense,"},{"from":4065.73,"to":4067.17,"location":2,"content":"it's a little bit of a cheat, right?"},{"from":4067.17,"to":4070.44,"location":2,"content":"Because really this pruning step here is okay,"},{"from":4070.44,"to":4071.76,"location":2,"content":"we're going to stick in"},{"from":4071.76,"to":4074.2,"location":2,"content":"a mention detection module,"},{"from":4074.2,"to":4077.04,"location":2,"content":"um, just like a conventional system."},{"from":4077.04,"to":4081.22,"location":2,"content":"Um, but the prettiness of it is in terms of"},{"from":4081.22,"to":4085.44,"location":2,"content":"the algor- in terms of the loss function that's defined."},{"from":4085.44,"to":4089.73,"location":2,"content":"The loss function is really defined end to end from just a sequence of"},{"from":4089.73,"to":4094.38,"location":2,"content":"tokens through to the mention ranking decisions."},{"from":4094.38,"to":4098.2,"location":2,"content":"And so it is an end-to-end model,"},{"from":4098.2,"to":4100.32,"location":2,"content":"even though in practice to make it practical,"},{"from":4100.32,"to":4106.1,"location":2,"content":"you have to have something like a mention detector to get it to work."},{"from":4106.1,"to":4110.52,"location":2,"content":"Okay. Pause for breath. 
Um, yeah,"},{"from":4110.52,"to":4114.51,"location":2,"content":"so there's one last."},{"from":4114.51,"to":4120.18,"location":2,"content":"So we've done sort of mention pair model and mention ranking model."},{"from":4120.18,"to":4122.37,"location":2,"content":"Um, and so for both of those,"},{"from":4122.37,"to":4124.83,"location":2,"content":"you're just taking individual mentions and saying,"},{"from":4124.83,"to":4126.87,"location":2,"content":"here's another mention, what,"},{"from":4126.87,"to":4128.55,"location":2,"content":"what shall I do with it?"},{"from":4128.55,"to":4133.13,"location":2,"content":"Let's look at mentions and see if we're coreferent to each other."},{"from":4133.13,"to":4140.28,"location":2,"content":"And that there's no real concept of entities which are clusters of mentions."},{"from":4140.28,"to":4142.68,"location":2,"content":"You're just making these sort of one-off decisions"},{"from":4142.68,"to":4146.04,"location":2,"content":"between pairs of mentions, and somehow,"},{"from":4146.04,"to":4149.61,"location":2,"content":"sort of the entities as clusters just"},{"from":4149.61,"to":4154.05,"location":2,"content":"emerge as a consequence of those mention pair decisions."},{"from":4154.05,"to":4159.56,"location":2,"content":"So there's been this sort of long-standing feeling that,"},{"from":4159.56,"to":4162.57,"location":2,"content":"oh that can't really be right,"},{"from":4162.57,"to":4168.06,"location":2,"content":"the right way to do coreference must be really to do it as a clustering task,"},{"from":4168.06,"to":4170.04,"location":2,"content":"and people often refer to this as saying,"},{"from":4170.04,"to":4173.23,"location":2,"content":"we want entities as first-class citizens."},{"from":4173.23,"to":4174.6,"location":2,"content":"So we want to be,"},{"from":4174.6,"to":4180.3,"location":2,"content":"sort of putting together mentions into clusters that represent the entities."},{"from":4180.3,"to":4185.01,"location":2,"content":"And the obvious way to do that is to do a kind of bottom-up agglomerative clustering."},{"from":4185.01,"to":4186.9,"location":2,"content":"So you start off by saying,"},{"from":4186.9,"to":4189.85,"location":2,"content":"each mention is its own singleton cluster,"},{"from":4189.85,"to":4195.59,"location":2,"content":"and then you're making decisions to merge clu- clusters which is initially,"},{"from":4195.59,"to":4198.15,"location":2,"content":"um, saying two mentions are coreferent."},{"from":4198.15,"to":4199.69,"location":2,"content":"But as you go on with it,"},{"from":4199.69,"to":4204.28,"location":2,"content":"you're then making decisions that two clusters are coreferent or not."},{"from":4204.28,"to":4207.12,"location":2,"content":"So the idea here is you'll have a piece of text,"},{"from":4207.12,"to":4209.27,"location":2,"content":"Google recently blah blah blah blah,"},{"from":4209.27,"to":4212.06,"location":2,"content":"the company announced Google Plus, blah blah blah blah,"},{"from":4212.06,"to":4214.38,"location":2,"content":"the product features blah blah blah blah."},{"from":4214.38,"to":4217.17,"location":2,"content":"And so you have here some mentions."},{"from":4217.17,"to":4220.95,"location":2,"content":"And so what you're going to do is start off saying that okay,"},{"from":4220.95,"to":4224.44,"location":2,"content":"there are these four mentions that each their own cluster."},{"from":4224.44,"to":4226.17,"location":2,"content":"And then what we're gonna 
do,"},{"from":4226.17,"to":4228.52,"location":2,"content":"is we're going to make some decisions."},{"from":4228.52,"to":4232.74,"location":2,"content":"Um, so we might decide that these two clusters"},{"from":4232.74,"to":4237.38,"location":2,"content":"are coreferent and merge them into one cluster."},{"from":4237.38,"to":4241.97,"location":2,"content":"And then we might decide that these two,"},{"from":4241.97,"to":4248.1,"location":2,"content":"um, clusters are coreferent and merge them into one cluster."},{"from":4248.1,"to":4251.26,"location":2,"content":"And so we're progressively clustering."},{"from":4251.26,"to":4254.03,"location":2,"content":"And so then, we're going to look at these two clusters,"},{"from":4254.03,"to":4256.84,"location":2,"content":"cluster one and cluster two, and say,"},{"from":4256.84,"to":4260.59,"location":2,"content":"no we don't think those ones are coreferent,"},{"from":4260.59,"to":4263.03,"location":2,"content":"and therefore we're going to keep them apart."},{"from":4263.03,"to":4270.65,"location":2,"content":"And so your, your coreference algorithm stops when there's nothing left to merge."},{"from":4270.65,"to":4275.43,"location":2,"content":"And the reason why people think that this is the right thing to do is,"},{"from":4275.43,"to":4279.93,"location":2,"content":"the feeling is that if we sort of build partial clusters like this,"},{"from":4279.93,"to":4282.4,"location":2,"content":"that you'll be able to do a better job."},{"from":4282.4,"to":4284.04,"location":2,"content":"Because if I just sort of say,"},{"from":4284.04,"to":4285.61,"location":2,"content":"well here are two mentions,"},{"from":4285.61,"to":4287.46,"location":2,"content":"Google and Google Plus,"},{"from":4287.46,"to":4292.02,"location":2,"content":"should they be regarded as co- coreferent or not?"},{"from":4292.02,"to":4294.45,"location":2,"content":"Um, well, since you're smart human beings,"},{"from":4294.45,"to":4296.64,"location":2,"content":"and know what Google is and know what Google Plus is,"},{"from":4296.64,"to":4299.07,"location":2,"content":"of course you'll answer no, of course not."},{"from":4299.07,"to":4300.49,"location":2,"content":"Um, but, you know,"},{"from":4300.49,"to":4303.18,"location":2,"content":"if you're just a computer trying to make a decision,"},{"from":4303.18,"to":4305.32,"location":2,"content":"it's sort of hard to know the right answer,"},{"from":4305.32,"to":4309.24,"location":2,"content":"because there are lots of other cases when there are shortenings,"},{"from":4309.24,"to":4312.03,"location":2,"content":"where the right answer is that they're coreferent, right."},{"from":4312.03,"to":4316.2,"location":2,"content":"Because if this is being Google and Google Corp,"},{"from":4316.2,"to":4319.2,"location":2,"content":"then it would have been right to regard them as coreferent."},{"from":4319.2,"to":4321.44,"location":2,"content":"Or if it was sort of, um,"},{"from":4321.44,"to":4324.21,"location":2,"content":"something like Hillary Clinton and Hillary,"},{"from":4324.21,"to":4326.61,"location":2,"content":"it would have been right to regard them as coreferent."},{"from":4326.61,"to":4329.9,"location":2,"content":"So it can often be kind of hard to tell what's coreferent."},{"from":4329.9,"to":4331.83,"location":2,"content":"Um, but the hope is that,"},{"from":4331.83,"to":4334.98,"location":2,"content":"if you've made some of the easy decisions first,"},{"from":4334.98,"to":4337.83,"location":2,"content":"so if you decide Google and the 
company are coreferent"},{"from":4337.83,"to":4340.95,"location":2,"content":"and Google Plus and the product are coreferent,"},{"from":4340.95,"to":4344.58,"location":2,"content":"then it should be much easier to tell and to say,"},{"from":4344.58,"to":4345.99,"location":2,"content":"well product and company,"},{"from":4345.99,"to":4347.86,"location":2,"content":"they're definitely different things."},{"from":4347.86,"to":4351.51,"location":2,"content":"And therefore we should keep these things separate."},{"from":4351.51,"to":4354.4,"location":2,"content":"Um, and so that is the goal,"},{"from":4354.4,"to":4356.95,"location":2,"content":"and so to follow that goal,"},{"from":4356.95,"to":4359.16,"location":2,"content":"the kind of models people build."},{"from":4359.16,"to":4363.68,"location":2,"content":"And this was actually a model that Kevin Clark is one of the PhD students here,"},{"from":4363.68,"to":4366.08,"location":2,"content":"um, and we did a couple of years ago."},{"from":4366.08,"to":4367.41,"location":2,"content":"The idea was well,"},{"from":4367.41,"to":4369.35,"location":2,"content":"what we're going to do is,"},{"from":4369.35,"to":4372.86,"location":2,"content":"we're initially going to consider mentioned pairs,"},{"from":4372.86,"to":4377.18,"location":2,"content":"and build some kind of distributed, mention pair representation,"},{"from":4377.18,"to":4381.81,"location":2,"content":"which is kind of similar to what we were doing previously with the previous models."},{"from":4381.81,"to":4387.9,"location":2,"content":"But we're then going to go beyond that and come up with cluster representations."},{"from":4387.9,"to":4391.1,"location":2,"content":"And then we can look at cluster pair representations."},{"from":4391.1,"to":4396.05,"location":2,"content":"And we would hope that by looking at these cluster representations,"},{"from":4396.05,"to":4401.76,"location":2,"content":"we'll be able to make better decisions of what to merge or what next to merge."},{"from":4401.76,"to":4407.76,"location":2,"content":"Um, I have a few more slides that go through the Clark and Manning algorithm."},{"from":4407.76,"to":4410.49,"location":2,"content":"Um, but I also have just a few minutes left."},{"from":4410.49,"to":4413.68,"location":2,"content":"And so I think I'll skip the details."},{"from":4413.68,"to":4417.12,"location":2,"content":"Um, I think the main thing that's interesting here,"},{"from":4417.12,"to":4421.74,"location":2,"content":"is the idea of clustering based coreference algorithms,"},{"from":4421.74,"to":4423.47,"location":2,"content":"and why in principle,"},{"from":4423.47,"to":4425.49,"location":2,"content":"it should give you extra oomph."},{"from":4425.49,"to":4429.14,"location":2,"content":"Um, and that's sort of the main useful thing to get through."},{"from":4429.14,"to":4431.28,"location":2,"content":"Because what I want to make sure we have covered in"},{"from":4431.28,"to":4434.11,"location":2,"content":"the last few minutes that I've said nothing at all about,"},{"from":4434.11,"to":4438.63,"location":2,"content":"is how do you evaluate coreference resolution and how well does it work?"},{"from":4438.63,"to":4441.33,"location":2,"content":"So let me skip ahead to that."},{"from":4441.33,"to":4447.12,"location":2,"content":"Um, so if you look at coreference resolution papers,"},{"from":4447.12,"to":4448.68,"location":2,"content":"or something like that,"},{"from":4448.68,"to":4455.25,"location":2,"content":"um, there are many metrics that people have 
used to evaluate coreference,"},{"from":4455.25,"to":4457.68,"location":2,"content":"and they have a long alphabet soup of names."},{"from":4457.68,"to":4460.14,"location":2,"content":"So there's MUC, and CEAF, and LEA,"},{"from":4460.14,"to":4462.51,"location":2,"content":"and B-CUBED, and BLANC and,"},{"from":4462.51,"to":4464.01,"location":2,"content":"um, things like that."},{"from":4464.01,"to":4469.08,"location":2,"content":"Um, so effectively part of it is that if you look in the clustering literature,"},{"from":4469.08,"to":4472.22,"location":2,"content":"there are lots of ways that people try and evaluate clustering,"},{"from":4472.22,"to":4476.61,"location":2,"content":"and essentially any of those metrics and some other ones, you can, um,"},{"from":4476.61,"to":4481.59,"location":2,"content":"port over, um, to, um, coreference evaluation."},{"from":4481.59,"to":4485.9,"location":2,"content":"I mean, why it's kind of difficult is the situation you have,"},{"from":4485.9,"to":4489.9,"location":2,"content":"is that you have a gold standard which picks out certain clusters,"},{"from":4489.9,"to":4492.89,"location":2,"content":"and the system picks out certain clusters,"},{"from":4492.89,"to":4498,"location":2,"content":"and you get some result like this and you have to decide how good it is."},{"from":4498,"to":4501.38,"location":2,"content":"So I'm going to show you just quickly one particular algorithm."},{"from":4501.38,"to":4504.63,"location":2,"content":"So the B-CUBED algorithm uses"},{"from":4504.63,"to":4508.73,"location":2,"content":"precision and recall and F-measure like we thought of before."},{"from":4508.73,"to":4511.29,"location":2,"content":"So it looks at, uh,"},{"from":4511.29,"to":4513.87,"location":2,"content":"a cluster identified by the system."},{"from":4513.87,"to":4519.1,"location":2,"content":"And it says, well this cluster is four-fifths,"},{"from":4519.1,"to":4520.89,"location":2,"content":"um, gold cluster one,"},{"from":4520.89,"to":4523.47,"location":2,"content":"so the precision is four-fifths."},{"from":4523.47,"to":4528.24,"location":2,"content":"But actually, um, there are six things in gold cluster one."},{"from":4528.24,"to":4533.76,"location":2,"content":"So it only has a recall of four-sixths of that cluster."},{"from":4533.76,"to":4536.98,"location":2,"content":"And then it similarly does for the other one,"},{"from":4536.98,"to":4539.44,"location":2,"content":"the same kind of calculation."},{"from":4539.44,"to":4543.99,"location":2,"content":"And then it's going to average across the precisions and recalls,"},{"from":4543.99,"to":4549.05,"location":2,"content":"um, and it's going to come up with an overall, um, B-CUBED score."},{"from":4549.05,"to":4554.4,"location":2,"content":"Um, in- if you think about this from an algorithm's perspective,"},{"from":4554.4,"to":4557.2,"location":2,"content":"this is actually tricky because I sort of said,"},{"from":4557.2,"to":4560.46,"location":2,"content":"um, okay, this cluster is mainly gold cluster one."},{"from":4560.46,"to":4563.73,"location":2,"content":"So use that as its reference,"},{"from":4563.73,"to":4566.82,"location":2,"content":"but that means you have to do a bipartite graph alignment"},{"from":4566.82,"to":4569.52,"location":2,"content":"between system clusters and gold clusters."},{"from":4569.52,"to":4572.95,"location":2,"content":"So hidden in- hidden inside this evaluation,"},{"from":4572.95,"to":4576.21,"location":2,"content":"um, system is actually an NP-complete 
problem."},{"from":4576.21,"to":4579.52,"location":2,"content":"But in practice you can normally do it heuristically well enough,"},{"from":4579.52,"to":4581.61,"location":2,"content":"that the evaluation method, um,"},{"from":4581.61,"to":4583.24,"location":2,"content":"runs and works."},{"from":4583.24,"to":4585.44,"location":2,"content":"Um, okay."},{"from":4585.44,"to":4588.21,"location":2,"content":"And so the kind of thing to notice is that,"},{"from":4588.21,"to":4589.97,"location":2,"content":"if you under cluster,"},{"from":4589.97,"to":4592.35,"location":2,"content":"you automatically get great precision,"},{"from":4592.35,"to":4594.14,"location":2,"content":"but you get bad recall."},{"from":4594.14,"to":4595.81,"location":2,"content":"And if you over cluster,"},{"from":4595.81,"to":4600.4,"location":2,"content":"you get- get great recall because everything that should be in the same cluster is,"},{"from":4600.4,"to":4602.91,"location":2,"content":"um, but you get terrible precision."},{"from":4602.91,"to":4608.18,"location":2,"content":"And so what you want to be doing is balancing those two things."},{"from":4608.18,"to":4611.28,"location":2,"content":"Okay. Last two minutes,"},{"from":4611.28,"to":4613.38,"location":2,"content":"just to give you some idea of performance."},{"from":4613.38,"to":4618.28,"location":2,"content":"So these are results from the OntoNotes dataset which is about 3,000 documents."},{"from":4618.28,"to":4621.49,"location":2,"content":"Chinese, English, labeled for coreference."},{"from":4621.49,"to":4625.98,"location":2,"content":"Um, the scores I'm reporting is actually an average over three metrics."},{"from":4625.98,"to":4629.3,"location":2,"content":"One of which is the one I just showed you for B-CUBED,"},{"from":4629.3,"to":4631.97,"location":2,"content":"um, here are some numbers."},{"from":4631.97,"to":4637.24,"location":2,"content":"Um, so Lee et al 2010 was the Stanford system."},{"from":4637.24,"to":4641.4,"location":2,"content":"So there- there was this shared task evaluation of coreference systems."},{"from":4641.4,"to":4644.27,"location":2,"content":"And we believe that Jerry Hobbs, um,"},{"from":4644.27,"to":4648.89,"location":2,"content":"was still right, and you could do fine with rule-based coreference."},{"from":4648.89,"to":4650.89,"location":2,"content":"And so in 2010,"},{"from":4650.89,"to":4656.03,"location":2,"content":"we managed to beat all machine learning systems with a rule-based coreference system,"},{"from":4656.03,"to":4657.73,"location":2,"content":"and we were proud of it."},{"from":4657.73,"to":4660.45,"location":2,"content":"Um, and that's its performance right here."},{"from":4660.45,"to":4662.24,"location":2,"content":"Um, in subsequent years,"},{"from":4662.24,"to":4665.15,"location":2,"content":"people did start to do a bit better, um,"},{"from":4665.15,"to":4670.11,"location":2,"content":"with, um, with, uh, machine learning systems."},{"from":4670.11,"to":4671.82,"location":2,"content":"But as you see, not very much,"},{"from":4671.82,"to":4677.04,"location":2,"content":"right for these 2012 systems that this one's somewhat,"},{"from":4677.04,"to":4678.93,"location":2,"content":"better this one really wasn't better,"},{"from":4678.93,"to":4682.34,"location":2,"content":"um, this, um, but making a bit of progress."},{"from":4682.34,"to":4687.98,"location":2,"content":"Starting in 2015, there started to be neural systems."},{"from":4687.98,"to":4691.2,"location":2,"content":"Um, so Wiseman et al was sort of the 
first neural system,"},{"from":4691.2,"to":4694.24,"location":2,"content":"I vaguely mentioned this Clark & Manning system,"},{"from":4694.24,"to":4696.94,"location":2,"content":"and the numbers are going up into the mid-sixties."},{"from":4696.94,"to":4701.35,"location":2,"content":"And this is the Kenton Lee system that has the end-to-end neural coreference,"},{"from":4701.35,"to":4703.81,"location":2,"content":"and on English is getting about 67."},{"from":4703.81,"to":4705.8,"location":2,"content":"So something you'll notice from this,"},{"from":4705.8,"to":4708.07,"location":2,"content":"is the numbers aren't great."},{"from":4708.07,"to":4711.42,"location":2,"content":"So coreference is still far from a solved problem."},{"from":4711.42,"to":4713.79,"location":2,"content":"Um, so if you want to have a bit of fun, um,"},{"from":4713.79,"to":4717.45,"location":2,"content":"you can go out and try coreference systems for yourself."},{"from":4717.45,"to":4721.47,"location":2,"content":"Um, there's a Stanford one on the first link or the one from Hugging Face"},{"from":4721.47,"to":4724.97,"location":2,"content":"is a good modern coreference system as well."},{"from":4724.97,"to":4727.74,"location":2,"content":"And if you just try these out with some pieces of text,"},{"from":4727.74,"to":4730.38,"location":2,"content":"you'll notice they still get lots of things wrong."},{"from":4730.38,"to":4732.48,"location":2,"content":"Um, so there's still more work to do,"},{"from":4732.48,"to":4735.27,"location":2,"content":"because this is just a harder language understanding task,"},{"from":4735.27,"to":4737.53,"location":2,"content":"[NOISE] which is just kind of like, um,"},{"from":4737.53,"to":4741.27,"location":2,"content":"Jerry Hobbs and Terry- Terry Winograd earlier observed."},{"from":4741.27,"to":4744.68,"location":2,"content":"Okay, um, but I'll stop there for now. 
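To make the B-CUBED arithmetic walked through above concrete, here is a minimal sketch in Python of one common per-mention formulation; the `b_cubed` function and the toy clusters are illustrative assumptions, not code from the lecture or from any existing coreference scorer:

```python
# Minimal sketch of a B-CUBED scorer: per-mention precision/recall against
# gold clusters, averaged over all mentions, then combined into an F-score.

def b_cubed(system_clusters, gold_clusters):
    """Each argument is a list of sets of mention ids."""
    sys_of = {m: c for c in system_clusters for m in c}
    gold_of = {m: c for c in gold_clusters for m in c}
    precision = recall = 0.0
    for m in gold_of:
        # Mentions the system missed count as singleton clusters.
        sys_cluster = sys_of.get(m, {m})
        overlap = len(sys_cluster & gold_of[m])
        precision += overlap / len(sys_cluster)
        recall += overlap / len(gold_of[m])
    precision /= len(gold_of)
    recall /= len(gold_of)
    return precision, recall, 2 * precision * recall / (precision + recall)

# Toy example mirroring the slide: one system cluster captures four of the
# six mentions in gold cluster one, plus one stray mention.
gold = [{1, 2, 3, 4, 5, 6}, {7, 8}]
system = [{1, 2, 3, 4, 7}, {5, 6, 8}]
print(b_cubed(system, gold))
```

On this toy input, each of the four correctly grouped mentions gets precision 4/5 and recall 4/6, matching the four-fifths and four-sixths figures above; note this simple per-mention average sidesteps the cluster alignment step, which is one reason several variants of the metric exist.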
Thanks a lot."},{"from":4744.68,"to":4749.74,"location":2,"content":"Um, oh yeah, I should have a reminder, invited speaker next Tuesday."},{"from":4749.74,"to":4751.57,"location":2,"content":"Um, so I'll be taking,"},{"from":4751.57,"to":4754.72,"location":2,"content":"um, attendance for invited speakers."}]} \ No newline at end of file diff --git a/bcc-en/17.bcc b/bcc-en/17.bcc new file mode 100644 index 0000000000000000000000000000000000000000..3f9443e908ae167ab05c4a3118351f74242b8c3e --- /dev/null +++ b/bcc-en/17.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.19,"to":8.79,"location":2,"content":"So today, we're very pleased to have as our second, um,"},{"from":8.79,"to":10.98,"location":2,"content":"invited speaker, Richard Socher,"},{"from":10.98,"to":14.09,"location":2,"content":"he is the chief scientist at Salesforce."},{"from":14.09,"to":18.04,"location":2,"content":"Um, Richard actually also has a lot more connection to this class,"},{"from":18.04,"to":21.84,"location":2,"content":"um, because, um, for several years, um,"},{"from":21.84,"to":24.97,"location":2,"content":"Richard was involved either as instructor or, um,"},{"from":24.97,"to":29.11,"location":2,"content":"co-instructor in teaching this material at Stanford,"},{"from":29.11,"to":32.6,"location":2,"content":"um, so he sort of knows the course, um, pretty well."},{"from":32.6,"to":34.03,"location":2,"content":"Um, and so today,"},{"from":34.03,"to":38.83,"location":2,"content":"he's going to be talking about some of the challenges and recent work"},{"from":38.83,"to":43.69,"location":2,"content":"in doing multitask learning in natural language processing. So welcome, Richard."},{"from":43.69,"to":46.59,"location":2,"content":"Thank you. Hello, everybody. 
I'm excited to be here."},{"from":46.59,"to":49.38,"location":2,"content":"Uh, yeah, I want to talk to you today about what we,"},{"from":49.38,"to":51.28,"location":2,"content":"in short, called decaNLP."},{"from":51.28,"to":54.63,"location":2,"content":"I want to first give a big shout out to Bryan McCann."},{"from":54.63,"to":56.9,"location":2,"content":"He's the first author of this, uh, paper,"},{"from":56.9,"to":60.2,"location":2,"content":"and I've pitched this idea to a lot of people in the last, like,"},{"from":60.2,"to":61.28,"location":2,"content":"three to four years,"},{"from":61.28,"to":62.41,"location":2,"content":"and most people were like,"},{"from":62.41,"to":64.73,"location":2,"content":"\"This is too much pre-processing because you're trying to"},{"from":64.73,"to":67.29,"location":2,"content":"do 10 different tasks in one model.\""},{"from":67.29,"to":69.51,"location":2,"content":"That's sort of where the decathlon, uh,"},{"from":69.51,"to":71.81,"location":2,"content":"wording comes in, uh, but he,"},{"from":71.81,"to":73.31,"location":2,"content":"he really stuck to it, uh,"},{"from":73.31,"to":76.73,"location":2,"content":"did all the pre-processing and all the things that you now know like tokenization,"},{"from":76.73,"to":78.5,"location":2,"content":"and it turns out a lot of different data sets,"},{"from":78.5,"to":80.27,"location":2,"content":"have a different conception of what a word is."},{"from":80.27,"to":81.71,"location":2,"content":"This wasn't two words,"},{"from":81.71,"to":83.48,"location":2,"content":"uh, or one word,"},{"from":83.48,"to":85.36,"location":2,"content":"and things like that, and that changes how you"},{"from":85.36,"to":87.47,"location":2,"content":"write all your evaluation scripts and all of that."},{"from":87.47,"to":89.17,"location":2,"content":"So Bryan, uh, is,"},{"from":89.17,"to":90.77,"location":2,"content":"is a really phenomenal researcher,"},{"from":90.77,"to":91.98,"location":2,"content":"uh, with us in the group,"},{"from":91.98,"to":95.34,"location":2,"content":"and Nitish has helped us a lot on the optimization side of this,"},{"from":95.34,"to":96.48,"location":2,"content":"uh, and then Caiming Xiong,"},{"from":96.48,"to":98.42,"location":2,"content":"the Director of Research, has done a lot of, uh,"},{"from":98.42,"to":101.73,"location":2,"content":"really phenomenal work that's kind of helpful in pretty much all our projects."},{"from":101.73,"to":104.83,"location":2,"content":"So I'm going to tell you a couple of different, uh,"},{"from":104.83,"to":108.56,"location":2,"content":"lines of reasoning that led us to,"},{"from":108.56,"to":110.53,"location":2,"content":"uh, this idea of multitask learning."},{"from":110.53,"to":114.17,"location":2,"content":"And the first one was sort of trying to take a step back and looking at the field,"},{"from":114.17,"to":118.95,"location":2,"content":"and I noticed not like that much of a historical class but basically pre-2010,"},{"from":118.95,"to":124.34,"location":2,"content":"most natural language processing had kind of these very hand-designed features,"},{"from":124.34,"to":125.66,"location":2,"content":"and we basically just had,"},{"from":125.66,"to":128.71,"location":2,"content":"uh, machine learning kind of learned weights,"},{"from":128.71,"to":132.68,"location":2,"content":"uh, in the optimization procedure for these human-designed features."},{"from":132.68,"to":140.78,"location":2,"content":"And so in 2010, Chris and I and others sort of started to work in deep 
learning for feature learning."},{"from":140.78,"to":142.15,"location":2,"content":"So everything was a word vector and now,"},{"from":142.15,"to":145.91,"location":2,"content":"we can back-propagate into them and actually learn those representations."},{"from":145.91,"to":147.41,"location":2,"content":"And I think currently,"},{"from":147.41,"to":148.88,"location":2,"content":"we're kind of in a state where we do a lot of"},{"from":148.88,"to":151.94,"location":2,"content":"deep architecture engineering for specific tasks,"},{"from":151.94,"to":153.11,"location":2,"content":"and you've seen this already."},{"from":153.11,"to":154.7,"location":2,"content":"You have like an NER model,"},{"from":154.7,"to":156.35,"location":2,"content":"you have a question answering model,"},{"from":156.35,"to":157.75,"location":2,"content":"you have a translation model,"},{"from":157.75,"to":159.11,"location":2,"content":"and basically now,"},{"from":159.11,"to":161.99,"location":2,"content":"each of these communities has at least, uh,"},{"from":161.99,"to":164.66,"location":2,"content":"converged on probably some kind of neural network,"},{"from":164.66,"to":167.57,"location":2,"content":"but there's still a lot of different kinds of architectures of"},{"from":167.57,"to":171.04,"location":2,"content":"these neural networks that you're working on for each different task."},{"from":171.04,"to":172.59,"location":2,"content":"And so the question is like, okay,"},{"from":172.59,"to":174.13,"location":2,"content":"we're gonna probably do that for"},{"from":174.13,"to":177.17,"location":2,"content":"another couple of years because we're making good progress,"},{"from":177.17,"to":178.55,"location":2,"content":"but what's sort of next,"},{"from":178.55,"to":179.99,"location":2,"content":"uh, on the research side?"},{"from":179.99,"to":182.48,"location":2,"content":"And what I actually love about this class so much is that"},{"from":182.48,"to":185,"location":2,"content":"you go from like maybe not knowing much about NLP at"},{"from":185,"to":187.72,"location":2,"content":"all to you can basically understand"},{"from":187.72,"to":190.88,"location":2,"content":"the state-of-the-art research papers as they come out now,"},{"from":190.88,"to":192.95,"location":2,"content":"uh, and this, this is one of those."},{"from":192.95,"to":195.48,"location":2,"content":"Uh, so [NOISE] why,"},{"from":195.48,"to":197.84,"location":2,"content":"why not continue to work in this multitask regime?"},{"from":197.84,"to":199.28,"location":2,"content":"In some ways, I feel like, uh,"},{"from":199.28,"to":200.96,"location":2,"content":"the community is a little bit, uh,"},{"from":200.96,"to":202.7,"location":2,"content":"like this cute dog, where we, kind of,"},{"from":202.7,"to":205.96,"location":2,"content":"randomly restart, uh, after every project."},{"from":205.96,"to":209.84,"location":2,"content":"And it's kind of clear to me that if you have a lot of training data, uh,"},{"from":209.84,"to":214.92,"location":2,"content":"and you define a specific data set and task on that data set,"},{"from":214.92,"to":219.08,"location":2,"content":"you start to architecture engineer your model to hill-climb on a particular metric,"},{"from":219.08,"to":221.42,"location":2,"content":"or leaderboard, or publications,"},{"from":221.42,"to":223.66,"location":2,"content":"or products, or whatever it is, uh,"},{"from":223.66,"to":225.71,"location":2,"content":"then as long as your data set 
has"},{"from":225.71,"to":228.09,"location":2,"content":"roughly a good representative set of"},{"from":228.09,"to":230.88,"location":2,"content":"1,000 times the number of output classes that you have,"},{"from":230.88,"to":236.21,"location":2,"content":"you'll probably get it into a regi- regime where you're in the 80 to 90 percent accuracy,"},{"from":236.21,"to":239.36,"location":2,"content":"or if one, where you're basically doing pretty okay."},{"from":239.36,"to":242.3,"location":2,"content":"And of course, now when you look at trends on ImageNet,"},{"from":242.3,"to":245,"location":2,"content":"you have 1,000 different classes in computer vision,"},{"from":245,"to":248.64,"location":2,"content":"1,000 different classes, each has 1,000 images."},{"from":248.64,"to":251.46,"location":2,"content":"So if you have roughly a million images, you do pretty well."},{"from":251.46,"to":253.74,"location":2,"content":"And in machine translation, ideally,"},{"from":253.74,"to":256.25,"location":2,"content":"you know, I have many more, I have like hundreds of thousands of words,"},{"from":256.25,"to":261.73,"location":2,"content":"so you want many millions of examples of each of the word in their,"},{"from":261.73,"to":263.09,"location":2,"content":"uh, words in their context."},{"from":263.09,"to":264.83,"location":2,"content":"And of course, you know, that the caveat is"},{"from":264.83,"to":267.62,"location":2,"content":"machine translation doesn't work to the level of humans,"},{"from":267.62,"to":270.11,"location":2,"content":"but it works well enough to have it at least in products,"},{"from":270.11,"to":274.75,"location":2,"content":"and even the best human translators use it as sort of a pre-translation and then,"},{"from":274.75,"to":277.03,"location":2,"content":"uh, sort of, clean it up."},{"from":277.03,"to":279.99,"location":2,"content":"And so it's also clear to me that in this regime,"},{"from":279.99,"to":281.48,"location":2,"content":"and if we want to get to, sort of,"},{"from":281.48,"to":283.55,"location":2,"content":"more general AI features, uh,"},{"from":283.55,"to":287.36,"location":2,"content":"we need to have some kind of more continuous learning of a single model."},{"from":287.36,"to":289.84,"location":2,"content":"Because if we keep restarting at every project,"},{"from":289.84,"to":291.83,"location":2,"content":"we're never going to get to a single model that, kind of,"},{"from":291.83,"to":295.71,"location":2,"content":"encompasses more and more of the complexity of natural language."},{"from":295.71,"to":299.12,"location":2,"content":"And, uh, when I say we start from random,"},{"from":299.12,"to":301.3,"location":2,"content":"you of course know that that's not quite true"},{"from":301.3,"to":304.19,"location":2,"content":"because we do have some things that we pre-train,"},{"from":304.19,"to":306.29,"location":2,"content":"namely word vectors, and in computer vision,"},{"from":306.29,"to":307.52,"location":2,"content":"we have even more things."},{"from":307.52,"to":309.02,"location":2,"content":"And so in some ways that is, ah,"},{"from":309.02,"to":311.75,"location":2,"content":"an aspiring ideal for NLP,"},{"from":311.75,"to":313.86,"location":2,"content":"because in computer vision, you would be, kind of,"},{"from":313.86,"to":315.59,"location":2,"content":"crazy to not use some kind of"},{"from":315.59,"to":319.61,"location":2,"content":"convolution neural network that has pre-train- has been pre-trained on some kind 
of"},{"from":319.61,"to":322.52,"location":2,"content":"tasks like ImageNet when you start with your project and"},{"from":322.52,"to":325.99,"location":2,"content":"try to classify objects or do object detection and a lot of other things."},{"from":325.99,"to":329.75,"location":2,"content":"And in some ways that the whole community could get behind it very quickly,"},{"from":329.75,"to":332.46,"location":2,"content":"because I mean, you know, once it worked, uh,"},{"from":332.46,"to":334.13,"location":2,"content":"reasonably well, because there was a, sort of,"},{"from":334.13,"to":335.99,"location":2,"content":"single blocking task in computer vision."},{"from":335.99,"to":338.61,"location":2,"content":"If you can't even tell apart a dog from a cat from a house,"},{"from":338.61,"to":342.42,"location":2,"content":"it doesn't really make sense to think of even larger, uh, vision projects."},{"from":342.42,"to":345.21,"location":2,"content":"And in NLP, we've had a lot of success with word vectors,"},{"from":345.21,"to":346.65,"location":2,"content":"you know a lot of those now,"},{"from":346.65,"to":348.75,"location":2,"content":"and it started for, sort of, just a small, uh,"},{"from":348.75,"to":351.78,"location":2,"content":"window-based approach or Word2Vec and GloVe, uh,"},{"from":351.78,"to":355.02,"location":2,"content":"then we had, uh, context vectors that were trained, uh,"},{"from":355.02,"to":357.3,"location":2,"content":"on machine translation, but basically,"},{"from":357.3,"to":360.05,"location":2,"content":"instead of just having a single set of words,"},{"from":360.05,"to":364.45,"location":2,"content":"we actually pre-trained some of the NLSTMs that came on top of those word vectors,"},{"from":364.45,"to":366.93,"location":2,"content":"and, uh, the way we train that, uh,"},{"from":366.93,"to":369.05,"location":2,"content":"was also actually Bryan McCann's paper on"},{"from":369.05,"to":372.53,"location":2,"content":"contextual vectors with machine translation and then ELMo,"},{"from":372.53,"to":376.26,"location":2,"content":"kind of, replaced machine translation with, uh, language modeling,"},{"from":376.26,"to":378.57,"location":2,"content":"which of course is even better because there's even more training data,"},{"from":378.57,"to":380.34,"location":2,"content":"and it still tells you a lot, uh,"},{"from":380.34,"to":383.21,"location":2,"content":"and kind of captures in some ways a more complex version of"},{"from":383.21,"to":386.9,"location":2,"content":"distributional sort of hypotheses that we had in simpler word vectors,"},{"from":386.9,"to":389.64,"location":2,"content":"and BERT, not quite a language model but also, kind of,"},{"from":389.64,"to":391.61,"location":2,"content":"trying to predict words in their context, uh,"},{"from":391.61,"to":394.4,"location":2,"content":"but pre-training a lot more layers and a lot deeper networks."},{"from":394.4,"to":399.7,"location":2,"content":"And so we see the success of pre-training a certain set of weights."},{"from":399.7,"to":401.26,"location":2,"content":"And so the question is,"},{"from":401.26,"to":404.31,"location":2,"content":"why not try to pre-train the entire model?"},{"from":404.31,"to":406.65,"location":2,"content":"As in including your output,"},{"from":406.65,"to":410.14,"location":2,"content":"your softmax, your pointer mechanisms and everything,"},{"from":410.14,"to":414.24,"location":2,"content":"and then just taking a completely pre-trained model and trying to do 
something,"},{"from":414.24,"to":416.89,"location":2,"content":"and that is, kind of, the goal that we have."},{"from":416.89,"to":418.89,"location":2,"content":"And so, uh, we, sort of,"},{"from":418.89,"to":420.52,"location":2,"content":"ask ourselves why hasn't this happened?"},{"from":420.52,"to":421.74,"location":2,"content":"Why are we, you know,"},{"from":421.74,"to":423.43,"location":2,"content":"the first to think about, like,"},{"from":423.43,"to":425.81,"location":2,"content":"trying to pre-train the entirety of the model,"},{"from":425.81,"to":427.37,"location":2,"content":"the encoders, and decoders,"},{"from":427.37,"to":428.42,"location":2,"content":"and outputs, and everything."},{"from":428.42,"to":432.74,"location":2,"content":"Uh, and I think part of it is that NLP requires a lot of different kinds of reasoning."},{"from":432.74,"to":434.42,"location":2,"content":"You've seen many of them already."},{"from":434.42,"to":438.29,"location":2,"content":"You have some logical reasoning like 550 people in this room,"},{"from":438.29,"to":440.3,"location":2,"content":"25 leave, are there still people in the room,"},{"from":440.3,"to":442.79,"location":2,"content":"and you logically can answer that question,"},{"from":442.79,"to":445.93,"location":2,"content":"and you have lots of different kinds of linguistic and emotional reasoning,"},{"from":445.93,"to":447.47,"location":2,"content":"sentiment analysis, you know,"},{"from":447.47,"to":450.14,"location":2,"content":"this is a typical Nicolas Cage movie and then you need to know that that's a"},{"from":450.14,"to":453.59,"location":2,"content":"probably negative review unless you like Nicolas Cage movies."},{"from":453.59,"to":456.47,"location":2,"content":"Um, no judgment. And, uh,"},{"from":456.47,"to":458.18,"location":2,"content":"you know, visual types of reasoning and so on."},{"from":458.18,"to":461.45,"location":2,"content":"And so I think partly because of that complexity in the beginning to feel,"},{"from":461.45,"to":466.58,"location":2,"content":"didn't really make much progress and now and then kind of separate it."},{"from":466.58,"to":470.68,"location":2,"content":"And I think in some cases, kind of artificially separated into all these separate tasks,"},{"from":470.68,"to":472.34,"location":2,"content":"like you have named entity recognition,"},{"from":472.34,"to":475.8,"location":2,"content":"part of speech tagging, and semantic role labeling and, and so on."},{"from":475.8,"to":478.56,"location":2,"content":"And, and in some ways- and it sounds kind of snarky but,"},{"from":478.56,"to":479.99,"location":2,"content":"you know, it made a lot of sense at the time,"},{"from":479.99,"to":482.54,"location":2,"content":"and it allowed us to make a lot of progress in the community,"},{"from":482.54,"to":484.85,"location":2,"content":"but basically we started chasing these benchmarks,"},{"from":484.85,"to":486.29,"location":2,"content":"and all these different communities, kind of,"},{"from":486.29,"to":488.61,"location":2,"content":"started going off in their own ways."},{"from":488.61,"to":490.32,"location":2,"content":"And we even have some communities that say,"},{"from":490.32,"to":491.95,"location":2,"content":"\"We do general question answering,"},{"from":491.95,"to":494.99,"location":2,"content":"and there's literally workshops on general question answering, and when I asked,"},{"from":494.99,"to":498.35,"location":2,"content":"uh, the organizers, \"Can I ask your model what the sentiment is of this 
tweet?\""},{"from":498.35,"to":501.24,"location":2,"content":"They're like, \"No, that's sentiment analysis. Go to that different workshop."},{"from":501.24,"to":502.51,"location":2,"content":"It's down, down the hall.\""},{"from":502.51,"to":504.27,"location":2,"content":"But I'm like, \"That's a- that's a question."},{"from":504.27,"to":507.33,"location":2,"content":"Why can't you answer it in the general question answering workshop?\""},{"from":507.33,"to":509.94,"location":2,"content":"Um, and so a lot of people then say,"},{"from":509.94,"to":511.54,"location":2,"content":"\"Well, if you want to work on more general stuff,"},{"from":511.54,"to":513.86,"location":2,"content":"it has to be an unsupervised, kind of,"},{"from":513.86,"to":516.7,"location":2,"content":"task and the, the feature will not be supervised.\""},{"from":516.7,"to":520.49,"location":2,"content":"I don't think NLP will be completely unsupervised,"},{"from":520.49,"to":522.83,"location":2,"content":"and we won't solve it, uh, completely unsupervised,"},{"from":522.83,"to":525.41,"location":2,"content":"because in the end, language has a lot of supervision for people,"},{"from":525.41,"to":529.02,"location":2,"content":"uh, and, uh, I think for, for systems also."},{"from":529.02,"to":532.62,"location":2,"content":"Uh, and you won't, you know,"},{"from":532.62,"to":534.6,"location":2,"content":"if you have- there's a child and it's in a jungle,"},{"from":534.6,"to":537.29,"location":2,"content":"it will probably develop a pretty good visual cortex by itself,"},{"from":537.29,"to":539.37,"location":2,"content":"but it won't develop language by itself."},{"from":539.37,"to":541.23,"location":2,"content":"And then- and then also, like,"},{"from":541.23,"to":543.72,"location":2,"content":"I think if you'll just allow AI's to talk to one another,"},{"from":543.72,"to":546.2,"location":2,"content":"it makes very little sense for them to try to come up with as"},{"from":546.2,"to":549.14,"location":2,"content":"inefficient of a communication protocol as humans have with, you know,"},{"from":549.14,"to":553.97,"location":2,"content":"sequential processing of language because algorithms and computers could,"},{"from":553.97,"to":556.07,"location":2,"content":"if there's no supervision of human language,"},{"from":556.07,"to":559.46,"location":2,"content":"they could just communicate in much more efficient ways with one another."},{"from":559.46,"to":561.05,"location":2,"content":"So I think it's fairly clear,"},{"from":561.05,"to":564.49,"location":2,"content":"we need a lot of supervision, uh, in NLP."},{"from":564.49,"to":567.84,"location":2,"content":"And so basically, all of this has led us, uh,"},{"from":567.84,"to":574.34,"location":2,"content":"to trying to think about a unified multitask model for a lot of different NLP tasks."},{"from":574.34,"to":576.51,"location":2,"content":"By the way, if you have any questions, just raise your hand."},{"from":576.51,"to":579.11,"location":2,"content":"Okay, let's make this very interactive."},{"from":579.11,"to":582.55,"location":2,"content":"Um, basically, we want this unified model, uh,"},{"from":582.55,"to":585.57,"location":2,"content":"to decide how to transfer knowledge,"},{"from":585.57,"to":587.88,"location":2,"content":"uh, and not have it, sort of, be manually assigned."},{"from":587.88,"to":589.28,"location":2,"content":"Like in most cases,"},{"from":589.28,"to":590.87,"location":2,"content":"when you assign your project you say, 
\"Oh,"},{"from":590.87,"to":595.03,"location":2,"content":"well I know that named entity recognition part of speech tagging help each other."},{"from":595.03,"to":596.87,"location":2,"content":"Because once you know something is a noun,"},{"from":596.87,"to":600.73,"location":2,"content":"then it's more likely that it's also a named entity.\""},{"from":600.73,"to":605.09,"location":2,"content":"And in this case, we want to basically allow for the single unified model"},{"from":605.09,"to":609.89,"location":2,"content":"to know itself how to do domain adaptation and wha- how to share the weights,"},{"from":609.89,"to":612.65,"location":2,"content":"and that will hopefully then lead to a lot of,"},{"from":612.65,"to":615.93,"location":2,"content":"uh, transfer learning and zero shot learning capabilities."},{"from":615.93,"to":619.1,"location":2,"content":"I also think that if we get to this, sort of,"},{"from":619.1,"to":623.26,"location":2,"content":"hard goal of having a single fa- single unified multitask model,"},{"from":623.26,"to":627.14,"location":2,"content":"then we'll easy- be able to more easily adapt it to"},{"from":627.14,"to":631.09,"location":2,"content":"new tasks and we'll be also able to deploy it in production more quickly."},{"from":631.09,"to":632.4,"location":2,"content":"If nowadays you want to build"},{"from":632.4,"to":635.57,"location":2,"content":"a little squirrel detector and connect it to your sprinkler system,"},{"from":635.57,"to":637.89,"location":2,"content":"you can just download some off-the-shelf software,"},{"from":637.89,"to":640.2,"location":2,"content":"and it will basically, kind of, work."},{"from":640.2,"to":642.17,"location":2,"content":"That is not the case if you try to do"},{"from":642.17,"to":644.39,"location":2,"content":"a pretty complex language project where you"},{"from":644.39,"to":646.96,"location":2,"content":"want to translate into some completely new language or,"},{"from":646.96,"to":650.24,"location":2,"content":"you know, analyze some website and then do something else afterwards."},{"from":650.24,"to":651.89,"location":2,"content":"So, uh, you also,"},{"from":651.89,"to":656.37,"location":2,"content":"when you actually try to deploy and use these kinds of tools and companies,"},{"from":656.37,"to":659.08,"location":2,"content":"you'll realize that there are a lot of different kinds of groups."},{"from":659.08,"to":660.2,"location":2,"content":"There's the search group,"},{"from":660.2,"to":661.31,"location":2,"content":"and the chatbot team,"},{"from":661.31,"to":662.54,"location":2,"content":"and the translation team,"},{"from":662.54,"to":665.93,"location":2,"content":"and, uh, and the social sentiment analysis team,"},{"from":665.93,"to":667.1,"location":2,"content":"and they all use different models,"},{"from":667.1,"to":668.39,"location":2,"content":"and they all deploy different models,"},{"from":668.39,"to":670.85,"location":2,"content":"and they all have to build a lot of overhead into"},{"from":670.85,"to":675.15,"location":2,"content":"the core of the- or around that core of an AI model."},{"from":675.15,"to":678.24,"location":2,"content":"So basically, um, lastly,"},{"from":678.24,"to":680.43,"location":2,"content":"it was, sort of, what we had with, with this dog."},{"from":680.43,"to":682.17,"location":2,"content":"I think that once we have this unified model,"},{"from":682.17,"to":684.38,"location":2,"content":"it will also be a first step to being able to"},{"from":684.38,"to":686.87,"location":2,"content":"then 
continually learn this and just have a single model that just"},{"from":686.87,"to":688.88,"location":2,"content":"gets better and better over time and starts"},{"from":688.88,"to":692.03,"location":2,"content":"to capture more and more of the complexity of language."},{"from":692.03,"to":693.98,"location":2,"content":"All right, any questions around, sort of,"},{"from":693.98,"to":701.7,"location":2,"content":"the high-level motivation?"},{"from":701.7,"to":704.86,"location":2,"content":"All right. So then, uh,"},{"from":704.86,"to":708.37,"location":2,"content":"it's sort of the question, how do we actually make that happen?"},{"from":708.37,"to":712.13,"location":2,"content":"And then we -- I first sort of sat down and looked at, like,"},{"from":712.13,"to":716.56,"location":2,"content":"the general sort of formats of all the tasks that you may experience in"},{"from":716.56,"to":718.51,"location":2,"content":"this class and that NLP sort of has as a field in"},{"from":718.51,"to":721,"location":2,"content":"general, and I think they can broadly"},{"from":721,"to":723.1,"location":2,"content":"be classified into these three different categories."},{"from":723.1,"to":724.9,"location":2,"content":"Sequence tagging, you already know."},{"from":724.9,"to":727.84,"location":2,"content":"Things like NER or aspect-specific sentiment or in"},{"from":727.84,"to":732.25,"location":2,"content":"a specific context we want to classify if a word is positive or negative."},{"from":732.25,"to":734.38,"location":2,"content":"Uh, and then text classification,"},{"from":734.38,"to":737.29,"location":2,"content":"just a single label for the entire piece of text"},{"from":737.29,"to":740.34,"location":2,"content":"and then sequence to sequence- a lot of different, you know,"},{"from":740.34,"to":743.58,"location":2,"content":"problems fall into that and I actually personally love, uh,"},{"from":743.58,"to":747.49,"location":2,"content":"these three particular tasks: machine translation, summarization, question answering."},{"from":747.49,"to":751.45,"location":2,"content":"Because they are immediately useful, and you don't have to explain to somebody,"},{"from":751.45,"to":754.2,"location":2,"content":"\"Oh, but why do you need the semantic role labeller or parser? 
\""},{"from":754.2,"to":756.49,"location":2,"content":"If you're a layman and you, you know,"},{"from":756.49,"to":758.62,"location":2,"content":"on the Internet you understand immediately why it's"},{"from":758.62,"to":761.14,"location":2,"content":"useful to do summarization, question answering,"},{"from":761.14,"to":763.24,"location":2,"content":"or translation and an improvement in"},{"from":763.24,"to":766.84,"location":2,"content":"those tasks kind of immediately translates in- into better products,"},{"from":766.84,"to":771.43,"location":2,"content":"uh, and people being able to communicate better and more efficiently with language."},{"from":771.43,"to":777.4,"location":2,"content":"So, that, uh, kind of analysis led us to think,"},{"from":777.4,"to":781.03,"location":2,"content":"uh, about these what I call three equivalent supertasks of NLP."},{"from":781.03,"to":783.91,"location":2,"content":"Uh, and basically they are"},{"from":783.91,"to":787.78,"location":2,"content":"language modeling, question answer now- question answering and dialogue systems."},{"from":787.78,"to":791.41,"location":2,"content":"Uh, language modeling, basically trying to predin- predict the next word,"},{"from":791.41,"to":792.43,"location":2,"content":"you've already worked on that."},{"from":792.43,"to":798.77,"location":2,"content":"Uh, and usually it's only used to rescore or basically to pre-train these days."},{"from":798.77,"to":802.64,"location":2,"content":"But really if you ask me a question and then you try to predict the next couple of words,"},{"from":802.64,"to":805.43,"location":2,"content":"then that is also language modeling"},{"from":805.43,"to":808.81,"location":2,"content":"and if you're able to predict the next couple of words after a question, like,"},{"from":808.81,"to":812.35,"location":2,"content":"what were the named entities in the sentence and then you just generate, you know,"},{"from":812.35,"to":814.12,"location":2,"content":"Dresden was a location,"},{"from":814.12,"to":816.43,"location":2,"content":"Richard was a person and whatnot."},{"from":816.43,"to":821.14,"location":2,"content":"Uh, then you can kind of cast almost all of these tasks into language modeling."},{"from":821.14,"to":822.58,"location":2,"content":"Uh, similarly question answering,"},{"from":822.58,"to":824.08,"location":2,"content":"you can ask any kind of question,"},{"from":824.08,"to":825.43,"location":2,"content":"what is the translation,"},{"from":825.43,"to":828.12,"location":2,"content":"what's the summary, uh, and so on,"},{"from":828.12,"to":830.77,"location":2,"content":"and then with dialogue right now it's kind of tricky because there are"},{"from":830.77,"to":835.93,"location":2,"content":"no really good dialogue datasets out there and a lot of times you want some interaction,"},{"from":835.93,"to":840.01,"location":2,"content":"you have to run user studies and most of the existing NLP task would"},{"from":840.01,"to":844.36,"location":2,"content":"basically be pretty short one-step dialogues like what are the named entity tags,"},{"from":844.36,"to":845.56,"location":2,"content":"and you give them and that's it."},{"from":845.56,"to":849.85,"location":2,"content":"So it's a little bit overkill and because of that we basically converged,"},{"from":849.85,"to":853.52,"location":2,"content":"uh, on question answering as our main formalism."},{"from":853.52,"to":858.36,"location":2,"content":"And here is now an overview of the 10 different tasks that we 
have,"},{"from":858.36,"to":861.61,"location":2,"content":"uh, and we cast all of them as question answering."},{"from":861.61,"to":865.12,"location":2,"content":"These are literally the tr- the training,"},{"from":865.12,"to":867.7,"location":2,"content":"uh, the format of the training dataset, uh,"},{"from":867.7,"to":870.88,"location":2,"content":"and eventually also the way we formulate"},{"from":870.88,"to":875.53,"location":2,"content":"the test set and you'll see basically for every single task,"},{"from":875.53,"to":878.61,"location":2,"content":"you have a context as some kind of document."},{"from":878.61,"to":879.7,"location":2,"content":"It could be a Wikipedia article,"},{"from":879.7,"to":881.5,"location":2,"content":"it could be a tweet, it could be a longer document,"},{"from":881.5,"to":885.55,"location":2,"content":"whatever, and you ask a question about it and you want to generate an answer."},{"from":885.55,"to":889.09,"location":2,"content":"And I'm actually -- I'm curious if you can think of any task in NLP"},{"from":889.09,"to":892.79,"location":2,"content":"that couldn't be formulated in this kind of structure."},{"from":892.79,"to":895.72,"location":2,"content":"Uh, so, let's go over some of these."},{"from":895.72,"to":897.87,"location":2,"content":"Uh, the first one is sort of the standard,"},{"from":897.87,"to":900.14,"location":2,"content":"uh, task that all- you're all familiar with now."},{"from":900.14,"to":902.44,"location":2,"content":"The SQuAD, Stanford Question Answering Dataset."},{"from":902.44,"to":906.88,"location":2,"content":"Uh, where the answer is essentially a phrase somewhere in the context."},{"from":906.88,"to":912.26,"location":2,"content":"But then, uh, the second one is something that you would never see in most,"},{"from":912.26,"to":916.9,"location":2,"content":"uh, generalized, uh, question answering workshops and that is, uh,"},{"from":916.9,"to":920.56,"location":2,"content":"having a context of the single sentence asking what is the translation from"},{"from":920.56,"to":925.09,"location":2,"content":"English into German and the output is again a sequence of words but in this case,"},{"from":925.09,"to":926.5,"location":2,"content":"and we color them differently here."},{"from":926.5,"to":931.87,"location":2,"content":"Uh, this is blue because all these words are basically not in the context and not in"},{"from":931.87,"to":935.11,"location":2,"content":"the question and we will just generate them"},{"from":935.11,"to":939.28,"location":2,"content":"with a standard softmax to basically answer this question."},{"from":939.28,"to":943.39,"location":2,"content":"We can also ask what is the summary and you can see that those"},{"from":943.39,"to":947.29,"location":2,"content":"two in some ways is artificial to make them into a natural language question."},{"from":947.29,"to":951.25,"location":2,"content":"You could just say translate or summarize and this is just like"},{"from":951.25,"to":956.14,"location":2,"content":"one kind of task token in your network but actually half of these tasks."},{"from":956.14,"to":962.3,"location":2,"content":"It makes sense because the question also has ac- is different for every example."},{"from":962.3,"to":966.04,"location":2,"content":"So this one here is natural language inference, NLI, uh,"},{"from":966.04,"to":970.92,"location":2,"content":"She covered also where we want to ask whether two sentences entail each other,"},{"from":970.92,"to":974.81,"location":2,"content":"contradict each other or 
there's some neutral relationship between them."},{"from":974.81,"to":976.9,"location":2,"content":"You've seen a lot of sentiment."},{"from":976.9,"to":978.58,"location":2,"content":"And this here is kind of important."},{"from":978.58,"to":982.6,"location":2,"content":"We actually ask is this sentence positive or negative versus just what is the sentiment"},{"from":982.6,"to":987.75,"location":2,"content":"and what- why that is important is that you see here in green,"},{"from":987.75,"to":990.76,"location":2,"content":"this answer here actually comes from"},{"from":990.76,"to":994.38,"location":2,"content":"a word in the question and if we formulate it that way,"},{"from":994.38,"to":999.33,"location":2,"content":"we can eventually do zero-shot learning where we ask a new question that was"},{"from":999.33,"to":1004.15,"location":2,"content":"never asked before for a new set of labels and magically, in some cases,"},{"from":1004.15,"to":1006.18,"location":2,"content":"it still actually works and we'll, you know,"},{"from":1006.18,"to":1010.5,"location":2,"content":"ask que- we can ask questions like is this story happy or sad and it will still"},{"from":1010.5,"to":1012.12,"location":2,"content":"give us an answer even though we've never given"},{"from":1012.12,"to":1015.2,"location":2,"content":"it a training dataset of a bunch of happy and sad stories."},{"from":1015.2,"to":1019.74,"location":2,"content":"So, it's kind of zero-shot classification that you get to in"},{"from":1019.74,"to":1022.23,"location":2,"content":"some cases if you formulate your questions in a way"},{"from":1022.23,"to":1025.27,"location":2,"content":"that the answer is present as a word in the question."},{"from":1025.27,"to":1028.34,"location":2,"content":"Then we have semantic role labeling here."},{"from":1028.34,"to":1035.54,"location":2,"content":"So what has something experienced, kind of a random weird question."},{"from":1035.54,"to":1038.45,"location":2,"content":"Then we have zero-shot relation extraction- who is"},{"from":1038.45,"to":1042.26,"location":2,"content":"the illustrator of Cycle of the Werewolf,"},{"from":1042.26,"to":1044.58,"location":2,"content":"we also have some dialogue state tracking."},{"from":1044.58,"to":1048.62,"location":2,"content":"What is the current state in- in a dialogue and the context just keeps on"},{"from":1048.62,"to":1053.98,"location":2,"content":"growing with the dialogue and then we also have SQL,"},{"from":1053.98,"to":1057.69,"location":2,"content":"WikiSQL translation tasks, but not translating into"},{"from":1057.69,"to":1062.03,"location":2,"content":"another natural language- translating into a SQL database query."},{"from":1062.03,"to":1063.72,"location":2,"content":"It's actually a super-helpful task."},{"from":1063.72,"to":1067.83,"location":2,"content":"There's a, you know, a lot of data out there that is stored in databases."},{"from":1067.83,"to":1070.44,"location":2,"content":"If you can access it without having to ask"},{"from":1070.44,"to":1073.38,"location":2,"content":"somebody who knows how to program SQL, it will make"},{"from":1073.38,"to":1076.2,"location":2,"content":"that data available to a lot more people so"},{"from":1076.2,"to":1079.26,"location":2,"content":"they can analyze it, like business analytics and so on."},{"from":1079.26,"to":1082.74,"location":2,"content":"And then here, Winograd Schemas and anaphora resolution."},{"from":1082.74,"to":1086.1,"location":2,"content":"Uh, some people call this kind of common sense reasoning but 
it's kind of,"},{"from":1086.1,"to":1090.22,"location":2,"content":"you know, mostly just anaphora resolution trying to understand in this context."},{"from":1090.22,"to":1092.38,"location":2,"content":"Uh, what -- who's, you know,"},{"from":1092.38,"to":1095.55,"location":2,"content":"uh, the word like who had given help,"},{"from":1095.55,"to":1099.03,"location":2,"content":"was it Susan or Joanne, and then based on this context,"},{"from":1099.03,"to":1102.9,"location":2,"content":"you can kind of should be able to figure that out and again here,"},{"from":1102.9,"to":1106.86,"location":2,"content":"the question is different for every single example. All right, yeah?"},{"from":1106.86,"to":1109.89,"location":2,"content":"When you're testing it -- like when you ask,"},{"from":1109.89,"to":1111.8,"location":2,"content":"is this sentence positive or negative,"},{"from":1111.8,"to":1115.29,"location":2,"content":"does it sometimes, like, [inaudible]?"},{"from":1115.29,"to":1117.77,"location":2,"content":"Great question. So, the question is when I ask,"},{"from":1117.77,"to":1120.51,"location":2,"content":"is this sentence positive or negative will it sometimes eventually"},{"from":1120.51,"to":1123.91,"location":2,"content":"accidentally switch to a different one of the task and, uh,"},{"from":1123.91,"to":1127.11,"location":2,"content":"we actually have a slide on that and the answer is it's surprisingly good at"},{"from":1127.11,"to":1132.78,"location":2,"content":"knowing how to go about doing the task and where to get the answer where it's from."},{"from":1132.78,"to":1136.86,"location":2,"content":"Um, and yeah, they'll make more sense in a couple of slides once we go over the model."},{"from":1136.86,"to":1138.56,"location":2,"content":"Any other questions about,"},{"from":1138.56,"to":1140.82,"location":2,"content":"uh, the question answering formalism?"},{"from":1140.82,"to":1144.93,"location":2,"content":"Are you able to formulate text generation in the question answer format as well?"},{"from":1144.93,"to":1146.68,"location":2,"content":"Like, tell me a story."},{"from":1146.68,"to":1150.19,"location":2,"content":"Good question. 
So can we do text generation, uh,"},{"from":1150.19,"to":1151.8,"location":2,"content":"like tell me a story, uh,"},{"from":1151.8,"to":1154.59,"location":2,"content":"from a random kind of -- or in this kind of formalism."},{"from":1154.59,"to":1159.45,"location":2,"content":"Uh, we don't have that as a task because largely it's really hard to evaluate."},{"from":1159.45,"to":1162.12,"location":2,"content":"It'll tell you some random stuff and then is that a good story or not,"},{"from":1162.12,"to":1164.33,"location":2,"content":"is it grammatical, you have to come up with a lot of,"},{"from":1164.33,"to":1165.75,"location":2,"content":"uh, sort of, uh,"},{"from":1165.75,"to":1168.42,"location":2,"content":"evaluation metrics which we actually are doing for"},{"from":1168.42,"to":1171.33,"location":2,"content":"some of the dialogue systems and in case of dialogue,"},{"from":1171.33,"to":1173.28,"location":2,"content":"why does -- why are they equivalent because"},{"from":1173.28,"to":1176.16,"location":2,"content":"the context can just keep on growing and every time, uh,"},{"from":1176.16,"to":1178.39,"location":2,"content":"the user said something, uh,"},{"from":1178.39,"to":1183.53,"location":2,"content":"you basically try to then predict the next answer in that dialogue."},{"from":1183.53,"to":1188.7,"location":2,"content":"And so I think you could very easily [NOISE] use this to generate texts."},{"from":1188.7,"to":1191.22,"location":2,"content":"Uh, you basically just ask -- tell it like what is, you know,"},{"from":1191.22,"to":1194.49,"location":2,"content":"what's a good ending of the story and you maybe start the context with like"},{"from":1194.49,"to":1198.42,"location":2,"content":"two or three words and then you ask the model to generate more and more words,"},{"from":1198.42,"to":1201.97,"location":2,"content":"uh, in the form of this network I'll describe in a second. Yeah?"},{"from":1201.97,"to":1204.72,"location":2,"content":"I was wondering like, uh, when you're training"},{"from":1204.72,"to":1207.8,"location":2,"content":"it and you're trying to research like a new task."},{"from":1207.8,"to":1211.47,"location":2,"content":"Uh, does it like learn with less data?"},{"from":1211.47,"to":1214.32,"location":2,"content":"That is an amazingly thoughtful question"},{"from":1214.32,"to":1216.93,"location":2,"content":"and it's- it's so important we'll have a bunch of slides on it."},{"from":1216.93,"to":1220.98,"location":2,"content":"So maybe we'll- we'll go -- we'll continue and we'll get to that question, uh,"},{"from":1220.98,"to":1225.08,"location":2,"content":"in a lot of detail because it's sort of why we're doing it and, the short answer is yes."},{"from":1225.08,"to":1227.86,"location":2,"content":"But we'll get to more details. 
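Since every one of the ten tasks is reduced to the same three fields, the shared format being described can be pictured as plain (context, question, answer) records; a minimal sketch follows, with illustrative stand-in examples rather than items from the actual decaNLP training sets:

```python
# Sketch of the single (context, question, answer) format shared by all ten
# decaNLP tasks; these records are illustrative, not real dataset items.
examples = [
    {  # span-extraction QA: the answer is a phrase copied from the context
        "context": "Nikola Tesla was born in Smiljan in 1856.",
        "question": "Where was Tesla born?",
        "answer": "Smiljan",
    },
    {  # machine translation: answer words come from an external vocabulary
        "context": "Most of the planet is covered by water.",
        "question": "What is the translation from English to German?",
        "answer": "Der größte Teil des Planeten ist von Wasser bedeckt.",
    },
    {  # sentiment: the label itself appears as a word in the question,
       # which is what enables the zero-shot relabeling described above
        "context": "A typical Nicolas Cage movie.",
        "question": "Is this review positive or negative?",
        "answer": "negative",
    },
]

# The task identity is never supplied separately: the model has to infer it
# from the question, which acts as a natural-language task descriptor.
for ex in examples:
    print(ex["question"], "->", ex["answer"])
```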
All right."},{"from":1227.86,"to":1230.21,"location":2,"content":"So these are basically the 10 tasks."},{"from":1230.21,"to":1233.97,"location":2,"content":"Uh, and again this is the actual format for it."},{"from":1233.97,"to":1235.89,"location":2,"content":"So if you have a problem,"},{"from":1235.89,"to":1237.81,"location":2,"content":"and you can cast it in this format, uh,"},{"from":1237.81,"to":1240.63,"location":2,"content":"you can just take, uh, the open source code and run it and,"},{"from":1240.63,"to":1242.03,"location":2,"content":"uh, it'll- it'll work."},{"from":1242.03,"to":1245.01,"location":2,"content":"And so when you kind of analyze and think about what we've done here."},{"from":1245.01,"to":1247.68,"location":2,"content":"In some ways, we've taken the tasks that"},{"from":1247.68,"to":1250.95,"location":2,"content":"usually is kind of in your head but it's not given to the model."},{"from":1250.95,"to":1254.73,"location":2,"content":"The model is just given an input x and an output y in almost all of"},{"from":1254.73,"to":1260.76,"location":2,"content":"the supervised systems and instead we're actually including the task in the inputs,"},{"from":1260.76,"to":1265.95,"location":2,"content":"uh, in the set of inputs to the model. So you can kind of call this meta-supervised learning."},{"from":1265.95,"to":1268.26,"location":2,"content":"So again the question, uh,"},{"from":1268.26,"to":1271.14,"location":2,"content":"is kind of our task definition for each of these different tasks."},{"from":1271.14,"to":1273.57,"location":2,"content":"The model has to figure out itself when to ask the question"},{"from":1273.57,"to":1276.18,"location":2,"content":"that way it can also figure out itself when to"},{"from":1276.18,"to":1281.57,"location":2,"content":"transfer knowledge from these other tasks and y is again just the answer."},{"from":1281.57,"to":1285.33,"location":2,"content":"So, in some ways it's meta-supervised learning and I'm quite excited"},{"from":1285.33,"to":1289.56,"location":2,"content":"because once you allow the task to be given to the model as input,"},{"from":1289.56,"to":1292.17,"location":2,"content":"it can kind of decide itself how to go about"},{"from":1292.17,"to":1295.02,"location":2,"content":"solving that particular task and now you can learn,"},{"from":1295.02,"to":1296.84,"location":2,"content":"uh, a lot more powerful models."},{"from":1296.84,"to":1299.31,"location":2,"content":"So once we had the dataset,"},{"from":1299.31,"to":1302.27,"location":2,"content":"we thought \"Okay, how do we now solve this problem?\""},{"from":1302.27,"to":1303.96,"location":2,"content":"The simplest way is you could just say, \"Well,"},{"from":1303.96,"to":1305.01,"location":2,"content":"I have a big if statement,"},{"from":1305.01,"to":1307.26,"location":2,"content":"I have a classifier in the beginning and then I classify."},{"from":1307.26,"to":1309.22,"location":2,"content":"If this is a machine translation task,"},{"from":1309.22,"to":1311.02,"location":2,"content":"then run my machine translation model.\""},{"from":1311.02,"to":1314.3,"location":2,"content":"And in general, in Python that would still be just like one big python,"},{"from":1314.3,"to":1316.43,"location":2,"content":"uh, model with a bunch of if statements, right?"},{"from":1316.43,"to":1318.77,"location":2,"content":"And that's not the goal because then we wouldn't get to any of"},{"from":1318.77,"to":1322.19,"location":2,"content":"the transfer learning and zero-shot capabilities that 
we're hoping for."},{"from":1322.19,"to":1327.63,"location":2,"content":"So [NOISE] we want to have the model wanted"},{"from":1327.63,"to":1330.11,"location":2,"content":"to have the capability to internally adjust"},{"from":1330.11,"to":1335.36,"location":2,"content":"to these different tasks and make these decisions itself."},{"from":1335.36,"to":1338.49,"location":2,"content":"And basically, all of those considerations and all"},{"from":1338.49,"to":1340.62,"location":2,"content":"of those thoughts led us, uh, to this model."},{"from":1340.62,"to":1342.12,"location":2,"content":"So before I go, uh,"},{"from":1342.12,"to":1343.45,"location":2,"content":"into a little bit more detail."},{"from":1343.45,"to":1345.83,"location":2,"content":"I'll just like sort of give you the high-level overview."},{"from":1345.83,"to":1347.92,"location":2,"content":"Again, you start with the context."},{"from":1347.92,"to":1350.71,"location":2,"content":"Um, you start- you ask a question about, uh,"},{"from":1350.71,"to":1353.7,"location":2,"content":"that context document, and then we're going to generate,"},{"from":1353.7,"to":1358.56,"location":2,"content":"uh, the answer one word at a time by either pointing to the context,"},{"from":1358.56,"to":1360.05,"location":2,"content":"and you've had pointers already, right?"},{"from":1360.05,"to":1364.04,"location":2,"content":"Pointer networks, all that? Great. Um, pointing to a question word,"},{"from":1364.04,"to":1368.19,"location":2,"content":"or choosing a word from an external vocabulary with your standard softmax classifier."},{"from":1368.19,"to":1372.63,"location":2,"content":"Uh, and we'll have a pointer switch mechanism that will kind"},{"from":1372.63,"to":1377.41,"location":2,"content":"of choose how much to weight [NOISE] each of these three generation mechanisms."},{"from":1377.41,"to":1380.76,"location":2,"content":"So, uh, let's dig into a little bit into this model."},{"from":1380.76,"to":1384.6,"location":2,"content":"Fortunately, uh, in some ways it's kind of just taking the best, uh,"},{"from":1384.6,"to":1389.16,"location":2,"content":"of the current sort of the state of the art techniques and putting them together in a way,"},{"from":1389.16,"to":1391.56,"location":2,"content":"uh, that- that generalize well enough."},{"from":1391.56,"to":1394.14,"location":2,"content":"Uh, you can look at all the code on decanlp.com,"},{"from":1394.14,"to":1396.87,"location":2,"content":"[NOISE] it has like thousands of, uh,"},{"from":1396.87,"to":1400.4,"location":2,"content":"stars and, uh, and forks and stuff combined, uh,"},{"from":1400.4,"to":1401.8,"location":2,"content":"and you can, you know,"},{"from":1401.8,"to":1404.18,"location":2,"content":"basically run everything, uh,"},{"from":1404.18,"to":1409.76,"location":2,"content":"in this, uh, on these experiments with just one command."},{"from":1409.76,"to":1413.61,"location":2,"content":"It'll download, you get all the datasets and everything and- and run everything,"},{"from":1413.61,"to":1416.34,"location":2,"content":"you can really explore what it looks like but let's- let's"},{"from":1416.34,"to":1419.37,"location":2,"content":"dive a little bit into the details of what this model told us."},{"from":1419.37,"to":1421.07,"location":2,"content":"In some ways again, it just kind of takes"},{"from":1421.07,"to":1423.87,"location":2,"content":"all the best ingredients from deep learning [NOISE] NLP,"},{"from":1423.87,"to":1428.49,"location":2,"content":"most of which you've already learned 
about and puts them together in a reasonable way."},{"from":1428.49,"to":1430.47,"location":2,"content":"So we start with fixed GloVe embeddings."},{"from":1430.47,"to":1432.63,"location":2,"content":"Eventually, we'll- we updated, uh,"},{"from":1432.63,"to":1434.73,"location":2,"content":"the embeddings to CoVe embeddings, uh,"},{"from":1434.73,"to":1437.71,"location":2,"content":"and probably it'll work even better if you update them to BERT embeddings."},{"from":1437.71,"to":1440.82,"location":2,"content":"Uh, but at some point we kind of have to move on and do other things."},{"from":1440.82,"to":1443.46,"location":2,"content":"Uh, but basically, you have a fixed set of word vectors,"},{"from":1443.46,"to":1445.86,"location":2,"content":"and that is kind of important because in some of these,"},{"from":1445.86,"to":1448.55,"location":2,"content":"uh, data sets, they're much smaller than others."},{"from":1448.55,"to":1450.36,"location":2,"content":"Uh, and as you know from SQuAD,"},{"from":1450.36,"to":1452.58,"location":2,"content":"if you actually backpropagate into the word vectors,"},{"from":1452.58,"to":1454.68,"location":2,"content":"you just do really, really well on your training dataset,"},{"from":1454.68,"to":1458.31,"location":2,"content":"but then you won't generalize because of most of the [NOISE] text,"},{"from":1458.31,"to":1461.43,"location":2,"content":"uh, test documents will include words you've never seen before."},{"from":1461.43,"to":1464.64,"location":2,"content":"So if you change all the word vectors during training, uh,"},{"from":1464.64,"to":1468.3,"location":2,"content":"it won't- it won't work very well at test time and won't generalize to the unseen words."},{"from":1468.3,"to":1470.36,"location":2,"content":"So, uh, fixed GloVe embeddings,"},{"from":1470.36,"to":1471.99,"location":2,"content":"if you don't have word vectors, uh,"},{"from":1471.99,"to":1475.14,"location":2,"content":"for unseen words, we also have character n-gram embeddings."},{"from":1475.14,"to":1477.87,"location":2,"content":"Then we pipe them through a simple linear layer,"},{"from":1477.87,"to":1479.25,"location":2,"content":"and then we have a shared, uh,"},{"from":1479.25,"to":1482.54,"location":2,"content":"bidirectional LSTM with skip connections."},{"from":1482.54,"to":1486.26,"location":2,"content":"And so, uh, it's a deep- deep one so you skip to higher layers,"},{"from":1486.26,"to":1489.09,"location":2,"content":"and it's shared between the context and the questions."},{"from":1489.09,"to":1491.85,"location":2,"content":"So they have basically the same [NOISE] set of weights."},{"from":1491.85,"to":1496.44,"location":2,"content":"[NOISE] Then, uh, we have a co-attention layer."},{"from":1496.44,"to":1498.84,"location":2,"content":"Uh, where we basically just have outer products, uh,"},{"from":1498.84,"to":1503.4,"location":2,"content":"between all the hidden states of those two sequences,"},{"from":1503.4,"to":1506.07,"location":2,"content":"and again, have skip connections, uh,"},{"from":1506.07,"to":1508.05,"location":2,"content":"to circumvent, uh, those as well."},{"from":1508.05,"to":1511.2,"location":2,"content":"So now you have kind of context or question dependent, uh,"},{"from":1511.2,"to":1515.46,"location":2,"content":"contextual representations [NOISE] or- or representations of that context."},{"from":1515.46,"to":1518.97,"location":2,"content":"[NOISE] Uh, then we feed those into our transformer layers,"},{"from":1518.97,"to":1523.58,"location":2,"content":"uh, and 
we actually tried to use transformers for all the things,"},{"from":1523.58,"to":1525.77,"location":2,"content":"with having no LSTMs or any of that."},{"from":1525.77,"to":1528.73,"location":2,"content":"Uh, unfortunately, transformer layers were still, uh,"},{"from":1528.73,"to":1532.59,"location":2,"content":"very, uh, finicky and very hard to optimize,"},{"from":1532.59,"to":1535.02,"location":2,"content":"and there's a lot of trickery with- of the learning rates,"},{"from":1535.02,"to":1538.52,"location":2,"content":"and we could just not get them to perform really well,"},{"from":1538.52,"to":1541.76,"location":2,"content":"uh, on- on these 10 different tasks."},{"from":1541.76,"to":1545.76,"location":2,"content":"Uh, [NOISE] sometimes you had one transformer layer, one transformer network,"},{"from":1545.76,"to":1546.93,"location":2,"content":"that worked really well in one task,"},{"from":1546.93,"to":1549.33,"location":2,"content":"but the only other transformer network that worked well"},{"from":1549.33,"to":1551.89,"location":2,"content":"on the second task had like half the layers."},{"from":1551.89,"to":1555.15,"location":2,"content":"And once you tried to have one network with the same number of layers,"},{"from":1555.15,"to":1557.71,"location":2,"content":"it just wouldn't work on either of the two tasks anymore."},{"from":1557.71,"to":1560.64,"location":2,"content":"Uh, and so- so yeah, unfortunately as nice as they"},{"from":1560.64,"to":1563.58,"location":2,"content":"are because they're nicely parallelizable in GPUs,"},{"from":1563.58,"to":1565.11,"location":2,"content":"uh, they weren't yet robust enough,"},{"from":1565.11,"to":1566.82,"location":2,"content":"uh, to- to be used for this."},{"from":1566.82,"to":1569.28,"location":2,"content":"[NOISE] So we have to have these LSTMs,"},{"from":1569.28,"to":1571.2,"location":2,"content":"uh, before and after the transformer layers."},{"from":1571.2,"to":1575.3,"location":2,"content":"[NOISE] And then we essentially just have a standard sort of autoregressive, uh,"},{"from":1575.3,"to":1577.77,"location":2,"content":"decoder where given the last state,"},{"from":1577.77,"to":1579.72,"location":2,"content":"uh, we generate the next word."},{"from":1579.72,"to":1582.09,"location":2,"content":"And then we have these three pointer mechanisms."},{"from":1582.09,"to":1584.46,"location":2,"content":"Uh, they're very similar to the pointer ne- mechanisms you already know."},{"from":1584.46,"to":1588.4,"location":2,"content":"But now on top of these very contextualized representations, uh,"},{"from":1588.4,"to":1590.58,"location":2,"content":"at the end of this encoder, uh,"},{"from":1590.58,"to":1593.64,"location":2,"content":"and it basically learns to either point to question words,"},{"from":1593.64,"to":1595.77,"location":2,"content":"context words based on the hidden states,"},{"from":1595.77,"to":1598.13,"location":2,"content":"or have also a standard softmax,"},{"from":1598.13,"to":1601.39,"location":2,"content":"and then we just basically have a weighted sum,"},{"from":1601.39,"to":1605.49,"location":2,"content":"convex sum, of these three different distributions of output words."},{"from":1605.49,"to":1608.12,"location":2,"content":"[NOISE] All right."},{"from":1608.12,"to":1612.69,"location":2,"content":"So I think these are mostly standard components that you've already saw,"},{"from":1612.69,"to":1614.61,"location":2,"content":"uh, for you- already seen all their details."},
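The three-way generation just described can be sketched as a convex combination of distributions. A minimal sketch under assumed shapes and names, not the exact decaNLP implementation:

```python
# Hedged sketch of the output mixture described above: a learned switch
# weights the context-pointer, question-pointer, and vocabulary-softmax
# distributions, and the final word distribution is their convex combination.
import torch

def mix(p_context, p_question, p_vocab, switch_logits):
    # All three inputs are probability distributions over the same output
    # space (pointer scores already scattered into vocabulary positions).
    w = torch.softmax(switch_logits, dim=0)  # three weights that sum to 1
    return w[0] * p_context + w[1] * p_question + w[2] * p_vocab

V = 50  # assumed toy vocabulary size
dists = [torch.softmax(torch.randn(V), dim=0) for _ in range(3)]
p = mix(*dists, switch_logits=torch.randn(3))
print(round(float(p.sum()), 4))  # 1.0: the mixture is still a distribution
```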
details."},{"from":1614.61,"to":1615.94,"location":2,"content":"But if you have any questions,"},{"from":1615.94,"to":1618.69,"location":2,"content":"um, about how we put it together? Yeah?"},{"from":1618.69,"to":1622.92,"location":2,"content":"[NOISE] So the output- the output has to be a word."},{"from":1622.92,"to":1626.61,"location":2,"content":"That's right. The output has to be a word and it's always either a word from the context,"},{"from":1626.61,"to":1628.47,"location":2,"content":"a word from the question or a word from the softmax."},{"from":1628.47,"to":1631.05,"location":2,"content":"[NOISE]"},{"from":1631.05,"to":1635.61,"location":2,"content":"That's- the data preprocessing I guess it's different with each task."},{"from":1635.61,"to":1638.22,"location":2,"content":"So the data preprocessing is different for each task,"},{"from":1638.22,"to":1640.95,"location":2,"content":"but we basically had to normalize everything to have"},{"from":1640.95,"to":1643.71,"location":2,"content":"the same tokenization and- and all of that. [NOISE]"},{"from":1643.71,"to":1649.77,"location":2,"content":"Uh, so do the double arrows in the encoding just represent there's a bidirectional?"},{"from":1649.77,"to":1650.13,"location":2,"content":"Yeah."},{"from":1650.13,"to":1650.78,"location":2,"content":"Okay."},{"from":1650.78,"to":1652.39,"location":2,"content":"Yeah. But the double arrows,"},{"from":1652.39,"to":1654,"location":2,"content":"uh, here are just bidirectional."},{"from":1654,"to":1658.08,"location":2,"content":"So left to right and right to left for the LSTMs. All right."},{"from":1658.08,"to":1661.05,"location":2,"content":"So what datasets, uh, are we using?"},{"from":1661.05,"to":1664.13,"location":2,"content":"Uh, I mentioned that that was a big headache in the beginning."},{"from":1664.13,"to":1666.54,"location":2,"content":"Uh, we definitely wanted to include a lot of the sequence to"},{"from":1666.54,"to":1669.72,"location":2,"content":"sequence tasks that we felt like are very,"},{"from":1669.72,"to":1674.06,"location":2,"content":"um, sort of high level and I- immediately useful, uh,"},{"from":1674.06,"to":1677.95,"location":2,"content":"and in some ways what this also shows you is that"},{"from":1677.95,"to":1683.31,"location":2,"content":"nowadays you don't have to work as much on some of the intermediate representations,"},{"from":1683.31,"to":1685.28,"location":2,"content":"uh, in NLP anymore."},{"from":1685.28,"to":1689.49,"location":2,"content":"Uh, you can just directly go for the end tasks that that real users might care about,"},{"from":1689.49,"to":1692.34,"location":2,"content":"and then have these end-to-end trainable systems,"},{"from":1692.34,"to":1694.69,"location":2,"content":"uh, that really do quite well."},{"from":1694.69,"to":1697.29,"location":2,"content":"And, uh, I've myself worked a lot on parsing."},{"from":1697.29,"to":1698.41,"location":2,"content":"And so I don't wanna, you know,"},{"from":1698.41,"to":1699.54,"location":2,"content":"say we- we don't need it."},{"from":1699.54,"to":1701.58,"location":2,"content":"There's certainly still tasks that you do need it for,"},{"from":1701.58,"to":1706.1,"location":2,"content":"but it's kind of surprising that you can just go directly to translation or summarization"},{"from":1706.1,"to":1708.87,"location":2,"content":"without having intermediate representations that"},{"from":1708.87,"to":1712.04,"location":2,"content":"were sort of very specifically 
hand-designed."},{"from":1712.04,"to":1716.31,"location":2,"content":"Um, so we had those three really interesting, uh, and hard tasks."},{"from":1716.31,"to":1718.38,"location":2,"content":"Question answering, machine translation, summarization."},{"from":1718.38,"to":1721.26,"location":2,"content":"They actually also have the three biggest datasets,"},{"from":1721.26,"to":1722.82,"location":2,"content":"uh, of all of these."},{"from":1722.82,"to":1726.96,"location":2,"content":"Uh, then we had NLI, and basically, um,"},{"from":1726.96,"to":1732.19,"location":2,"content":"all of these, uh, 10 datasets [NOISE] were, uh,"},{"from":1732.19,"to":1736.88,"location":2,"content":"publicly available, uh, and in several cases especially for translation,"},{"from":1736.88,"to":1741.03,"location":2,"content":"you could actually find much larger, uh, translation datasets,"},{"from":1741.03,"to":1743.79,"location":2,"content":"but we also tried to keep it, uh,"},{"from":1743.79,"to":1748.53,"location":2,"content":"to a- to a size where normal people that don't work in gigantic companies with huge, uh,"},{"from":1748.53,"to":1753.54,"location":2,"content":"GPU infrastructures could still run experiments, [NOISE] uh, themselves."},{"from":1753.54,"to":1756.63,"location":2,"content":"So universities and folks, uh, can still run it on."},{"from":1756.63,"to":1758.98,"location":2,"content":"Basically if you have just a single GPU,"},{"from":1758.98,"to":1761.38,"location":2,"content":"it'll probably take about a week or so, uh,"},{"from":1761.38,"to":1763.68,"location":2,"content":"to run an experiment."},{"from":1763.68,"to":1766.63,"location":2,"content":"If you have multiple GPUs on one large AWS machine,"},{"from":1766.63,"to":1769.56,"location":2,"content":"you can kind of run an experiment in a day or two."},{"from":1769.56,"to":1771.75,"location":2,"content":"And so especially for translation, right,"},{"from":1771.75,"to":1775.61,"location":2,"content":"you could get a lot more data, uh, than IWSLT."},{"from":1775.61,"to":1778.47,"location":2,"content":"And each of these, uh,"},{"from":1778.47,"to":1782.1,"location":2,"content":"communities and datasets and- and tasks has their own metric."},{"from":1782.1,"to":1784.05,"location":2,"content":"We actually tried to, in the beginning,"},{"from":1784.05,"to":1786.33,"location":2,"content":"we had a lot of discussion about how we should"},{"from":1786.33,"to":1789.87,"location":2,"content":"define the measure of success for this project."},{"from":1789.87,"to":1791.57,"location":2,"content":"Uh, it doesn't make sense, uh,"},{"from":1791.57,"to":1795.3,"location":2,"content":"to have a normalized F1 score for basically all the different tasks,"},{"from":1795.3,"to":1797.31,"location":2,"content":"but then we basically realized that"},{"from":1797.31,"to":1800.25,"location":2,"content":"these different communities have different metrics for a reason."},{"from":1800.25,"to":1805.01,"location":2,"content":"Uh, fortunately at least all of these metrics are from 0-100 in theory."},{"from":1805.01,"to":1807.4,"location":2,"content":"Of course, in practice, you rarely ever see, uh,"},{"from":1807.4,"to":1810.27,"location":2,"content":"a translation system of a 100, uh,"},{"from":1810.27,"to":1812.28,"location":2,"content":"or even high 90s of a BLEU score,"},{"from":1812.28,"to":1814.93,"location":2,"content":"uh, or these really, really high ROUGE scores."},{"from":1814.93,"to":1818.55,"location":2,"content":"But, you know, in theory they go from 0-100, and 
so, uh,"},{"from":1818.55,"to":1824.04,"location":2,"content":"we kept basically intact the different evaluation metrics for each of these communities,"},{"from":1824.04,"to":1826.44,"location":2,"content":"and we just said we're going to sum them up."},{"from":1826.44,"to":1829.38,"location":2,"content":"And, uh, when we first talked about this,"},{"from":1829.38,"to":1831.15,"location":2,"content":"we have- had a lot of discussion,"},{"from":1831.15,"to":1832.89,"location":2,"content":"uh, with- with others also like, oh,"},{"from":1832.89,"to":1835.53,"location":2,"content":"but translation is so much more important because it's much"},{"from":1835.53,"to":1838.24,"location":2,"content":"bigger and it's a much more useful task than you still,"},{"from":1838.24,"to":1840.63,"location":2,"content":"you know, silly like pronoun resolution Winograd Schemas"},{"from":1840.63,"to":1843.15,"location":2,"content":"which only have a couple hundred training samples."},{"from":1843.15,"to":1845.73,"location":2,"content":"And so you should have weighted translation more and"},{"from":1845.73,"to":1848.31,"location":2,"content":"then literally five questions later somebody's like,"},{"from":1848.31,"to":1850.14,"location":2,"content":"\"Why didn't you weight pronoun resolution more?"},{"from":1850.14,"to":1854.37,"location":2,"content":"That is a really hard task that captures sort of common sense reasoning and, you know,"},{"from":1854.37,"to":1856.59,"location":2,"content":"the complexity of language and semantics,"},{"from":1856.59,"to":1860.34,"location":2,"content":"and unlike all this, like, statistical pattern matching [NOISE] that you do in translation.\""},{"from":1860.34,"to":1863.19,"location":2,"content":"And I was like, I used to talk to that guy [LAUGHTER] and like,"},{"from":1863.19,"to":1864.51,"location":2,"content":"uh, hopefully in the end,"},{"from":1864.51,"to":1868.05,"location":2,"content":"we'll just all agree that like it's reasonable to sum them up, uh,"},{"from":1868.05,"to":1873.64,"location":2,"content":"and of course, you also have to tackle when you run experiments in this."},{"from":1873.64,"to":1877.85,"location":2,"content":"Uh, a lot of the complexity that you have in machine learning and,"},{"from":1877.85,"to":1881.63,"location":2,"content":"you know, stuff that very few people talk about like having very skewed distributions."},{"from":1881.63,"to":1884.61,"location":2,"content":"So you have translation which has, uh,"},{"from":1884.61,"to":1886.62,"location":2,"content":"millions or hundreds of thousands of examples,"},{"from":1886.62,"to":1887.73,"location":2,"content":"and you have Winograd Schemas,"},{"from":1887.73,"to":1889.92,"location":2,"content":"uh, that only have a couple hundred."},{"from":1889.92,"to":1894.75,"location":2,"content":"How do you train that such that you don't just completely ignore the smaller dataset."},{"from":1894.75,"to":1898.35,"location":2,"content":"Uh, so we'll get to some of the optimization trickery,"},{"from":1898.35,"to":1902.01,"location":2,"content":"uh, that Nitish spent several months on in a bit."},{"from":1902.01,"to":1905.31,"location":2,"content":"But I first wanna sort of give you the first set of experiments."},{"from":1905.31,"to":1906.96,"location":2,"content":"So as you can see from all the numbers,"},{"from":1906.96,"to":1908.57,"location":2,"content":"there's a lot of experiments, uh,"},{"from":1908.57,"to":1910.69,"location":2,"content":"that we ran to even get to 
this,"},{"from":1910.69,"to":1912.96,"location":2,"content":"and so we'll walk through this, uh, quite carefully."},{"from":1912.96,"to":1916.11,"location":2,"content":"I think hopefully you'll get some ideas also for- for ablations,"},{"from":1916.11,"to":1919.8,"location":2,"content":"or experiments that you might wanna run in your, um,"},{"from":1919.8,"to":1921.21,"location":2,"content":"in your experiments and in your,"},{"from":1921.21,"to":1923.67,"location":2,"content":"uh, problem- final- final projects."},{"from":1923.67,"to":1925.29,"location":2,"content":"So what are we looking at here?"},{"from":1925.29,"to":1927.4,"location":2,"content":"So basically, uh, on the left side,"},{"from":1927.4,"to":1928.77,"location":2,"content":"we have single task performance."},{"from":1928.77,"to":1933.47,"location":2,"content":"So here, each number comes from its different model that was trained,"},{"from":1933.47,"to":1936.33,"location":2,"content":"um, separately on just one task."},{"from":1936.33,"to":1942.54,"location":2,"content":"Uh, each row- each column here is the same architecture, uh,"},{"from":1942.54,"to":1943.93,"location":2,"content":"and [NOISE] on the right side here,"},{"from":1943.93,"to":1945.43,"location":2,"content":"we basically have, uh,"},{"from":1945.43,"to":1951.16,"location":2,"content":"for each column is basically the same architecture and the same exact model."},{"from":1951.16,"to":1954.67,"location":2,"content":"So here, we have four different models and here, uh,"},{"from":1954.67,"to":1957.16,"location":2,"content":"we have 40 different models,"},{"from":1957.16,"to":1960.11,"location":2,"content":"and each column again is the same architecture."},{"from":1960.11,"to":1961.72,"location":2,"content":"And so the simplest, uh,"},{"from":1961.72,"to":1964.62,"location":2,"content":"first column here is just a standard sequence to sequence"},{"from":1964.62,"to":1968.28,"location":2,"content":"model with very few bells and whistles and some pointers,"},{"from":1968.28,"to":1969.96,"location":2,"content":"but nothing sort of major."},{"from":1969.96,"to":1971.27,"location":2,"content":"It's pretty deep, you know,"},{"from":1971.27,"to":1973.55,"location":2,"content":"stack bidirectional LSTM skip connections,"},{"from":1973.55,"to":1977.78,"location":2,"content":"all the standard good well-tuned stuff for sequence to sequence models."},{"from":1977.78,"to":1980.94,"location":2,"content":"And, uh, then we added self-attention."},{"from":1980.94,"to":1983.4,"location":2,"content":"Um, this- this sort of, uh,"},{"from":1983.4,"to":1986.31,"location":2,"content":"basically, uh, transformer layers."},{"from":1986.31,"to":1988.11,"location":2,"content":"[NOISE] Then we have this co-attention layer of"},{"from":1988.11,"to":1990.22,"location":2,"content":"the outer products that we mentioned in the beginning,"},{"from":1990.22,"to":1992.71,"location":2,"content":"and then we also added the question pointer."},{"from":1992.71,"to":1998.33,"location":2,"content":"So having the ability to point to a word in a question."},{"from":1998.33,"to":2001.67,"location":2,"content":"All right. Any questions about this table?"},{"from":2001.67,"to":2003.32,"location":2,"content":"We'll dig into some of the details."},{"from":2003.32,"to":2005.09,"location":2,"content":"Uh, okay. 
Well, we'll dig into"},{"from":2005.09,"to":2007.76,"location":2,"content":"the details first and then maybe you can think of some questions."},{"from":2007.76,"to":2009.83,"location":2,"content":"So let's analyze, uh,"},{"from":2009.83,"to":2012.74,"location":2,"content":"what's going on in this table because there are a lot of numbers, uh,"},{"from":2012.74,"to":2016.51,"location":2,"content":"and you really want to carefully analyze and sort of distinguish."},{"from":2016.51,"to":2017.89,"location":2,"content":"I think my first, uh,"},{"from":2017.89,"to":2020.59,"location":2,"content":"observation was, wow, we can have a single architecture."},{"from":2020.59,"to":2023.17,"location":2,"content":"Like, even, even this is not quite what we want, right?"},{"from":2023.17,"to":2024.54,"location":2,"content":"We want a single model."},{"from":2024.54,"to":2026.14,"location":2,"content":"But even this kind of showed us, wow,"},{"from":2026.14,"to":2031.43,"location":2,"content":"you can have a single architecture that actually does really well and somewhat randomly,"},{"from":2031.43,"to":2033.92,"location":2,"content":"in some cases, it actually had gotten state-of-the-art results."},{"from":2033.92,"to":2036.02,"location":2,"content":"So Wiki SQL, for instance,"},{"from":2036.02,"to":2039.2,"location":2,"content":"this architecture had the best model"},{"from":2039.2,"to":2042.24,"location":2,"content":"to translate natural language English questions into SQL queries,"},{"from":2042.24,"to":2045.53,"location":2,"content":"which was a surprise to us because it is the ninth dataset."},{"from":2045.53,"to":2048.95,"location":2,"content":"It was really not like a priority for us and when we designed"},{"from":2048.95,"to":2052.97,"location":2,"content":"the model and thought about how to generate words and pointer mechanisms and so on."},{"from":2052.97,"to":2056.39,"location":2,"content":"We just kind of had the standard context of SQL words"},{"from":2056.39,"to":2059.99,"location":2,"content":"and we asked the question what's the translation to SQL, and then, uh,"},{"from":2059.99,"to":2064.79,"location":2,"content":"somewhat surprisingly to us this particular architecture had the state-of-the-art, uh,"},{"from":2064.79,"to":2067.82,"location":2,"content":"on SQL generation and bunch of folks in that community kind"},{"from":2067.82,"to":2070.86,"location":2,"content":"of picked it up more quickly because it had state-of-the-art."},{"from":2070.86,"to":2072.59,"location":2,"content":"And that's- uh, unfortunately,"},{"from":2072.59,"to":2074.91,"location":2,"content":"it doesn't have that many other state-of-the-art numbers, uh,"},{"from":2074.91,"to":2076.4,"location":2,"content":"which is why it's harder, uh,"},{"from":2076.4,"to":2077.75,"location":2,"content":"it's actually a much harder task."},{"from":2077.75,"to":2080.2,"location":2,"content":"And what you also observe is that,"},{"from":2080.2,"to":2082.32,"location":2,"content":"uh, in several of the cases, uh,"},{"from":2082.32,"to":2084.08,"location":2,"content":"using the multitask model,"},{"from":2084.08,"to":2086.64,"location":2,"content":"so having a single model for all the 10 tasks,"},{"from":2086.64,"to":2088.88,"location":2,"content":"uh, actually hurts performance at first."},{"from":2088.88,"to":2092.12,"location":2,"content":"And this is also something you rarely read in papers because papers"},{"from":2092.12,"to":2095.21,"location":2,"content":"have a strong selection bias to only publish positive 
results."},{"from":2095.21,"to":2100.31,"location":2,"content":"Uh, and when you look at most transfer learning and multitask learning papers,"},{"from":2100.31,"to":2104.66,"location":2,"content":"they're sort of an outside of the actual model consideration of like,"},{"from":2104.66,"to":2109.1,"location":2,"content":"well, let's only combine tasks that we know will work well with one another."},{"from":2109.1,"to":2111.05,"location":2,"content":"And if they don't work and hurt performance,"},{"from":2111.05,"to":2113.28,"location":2,"content":"then we'd just exclude them from our experiments."},{"from":2113.28,"to":2116.61,"location":2,"content":"And so you don't see many negative task results, uh,"},{"from":2116.61,"to":2120.22,"location":2,"content":"in the literature and there are a few papers here and there that, uh,"},{"from":2120.22,"to":2124.91,"location":2,"content":"study basically the opposite side of transfer learning and that is,"},{"from":2124.91,"to":2128.32,"location":2,"content":"uh, catastrophic interference and catastrophic forgetting."},{"from":2128.32,"to":2132.11,"location":2,"content":"So interference is when you train two different tasks in the same model,"},{"from":2132.11,"to":2135.16,"location":2,"content":"and to interfere with one another next, you hurt each other's performance."},{"from":2135.16,"to":2137.96,"location":2,"content":"And catastrophic forgetting is if you train continually"},{"from":2137.96,"to":2141.3,"location":2,"content":"your first train in one task then you train on a second task,"},{"from":2141.3,"to":2142.89,"location":2,"content":"people used to think,"},{"from":2142.89,"to":2144.08,"location":2,"content":"\"Oh, well, you know,"},{"from":2144.08,"to":2145.79,"location":2,"content":"basically the first task will be completely"},{"from":2145.79,"to":2148.97,"location":2,"content":"forgotten,\" and you just work well on the second task."},{"from":2148.97,"to":2152.75,"location":2,"content":"If you train neural networks sort of in a sequential way one task and then"},{"from":2152.75,"to":2156.85,"location":2,"content":"another and somewhat surprisingly, uh,"},{"from":2156.85,"to":2159.16,"location":2,"content":"we- we found that things aren't actually"},{"from":2159.16,"to":2161.93,"location":2,"content":"catastrophically being forgotten in these models,"},{"from":2161.93,"to":2164.41,"location":2,"content":"turns out that if you train them sequentially and"},{"from":2164.41,"to":2167.07,"location":2,"content":"you add a little bit of the original to the first task,"},{"from":2167.07,"to":2168.76,"location":2,"content":"it comes back very, very quickly."},{"from":2168.76,"to":2170.66,"location":2,"content":"So while the performance is really bad,"},{"from":2170.66,"to":2172.91,"location":2,"content":"you can get to the really good performance very,"},{"from":2172.91,"to":2174.47,"location":2,"content":"very quickly in very few iterations."},{"from":2174.47,"to":2178.11,"location":2,"content":"So but it's one of the many interesting sort of tidbits that we found,"},{"from":2178.11,"to":2180.91,"location":2,"content":"uh, in the course of this that we haven't even published yet. 
All right."},{"from":2180.91,"to":2184.05,"location":2,"content":"So, uh, focusing on, uh,"},{"from":2184.05,"to":2186.56,"location":2,"content":"the transformer layers here we basically find transformers"},{"from":2186.56,"to":2189.28,"location":2,"content":"do help the original sequence to sequence model a lot."},{"from":2189.28,"to":2193.41,"location":2,"content":"So if you tune them carefully and you combine them with, uh,"},{"from":2193.41,"to":2196.24,"location":2,"content":"some bidirectional LSTMs and so on, uh,"},{"from":2196.24,"to":2198.41,"location":2,"content":"they were very helpful and improved, uh,"},{"from":2198.41,"to":2201.8,"location":2,"content":"across a bunch of different datasets, in some cases quite significantly."},{"from":2201.8,"to":2206.39,"location":2,"content":"Another observation is question-answering and semantic role labeling,"},{"from":2206.39,"to":2209.66,"location":2,"content":"uh, actually can predict each other's performance quite well."},{"from":2209.66,"to":2211.67,"location":2,"content":"If one works well, the other works well,"},{"from":2211.67,"to":2213.14,"location":2,"content":"uh, and- and vice-versa."},{"from":2213.14,"to":2214.4,"location":2,"content":"If they don't work well,"},{"from":2214.4,"to":2216.59,"location":2,"content":"uh, both of them don't work very well."},{"from":2216.59,"to":2220.85,"location":2,"content":"Um, and it's also interesting because both of those tasks have different questions for,"},{"from":2220.85,"to":2224.07,"location":2,"content":"uh, every training example."},{"from":2224.07,"to":2227.78,"location":2,"content":"Pointing. Uh, so the question pointing,"},{"from":2227.78,"to":2229.52,"location":2,"content":"uh, is super important."},{"from":2229.52,"to":2231.7,"location":2,"content":"Uh, we actually have in some cases, uh,"},{"from":2231.7,"to":2233.91,"location":2,"content":"twice the performance even for,"},{"from":2233.91,"to":2235.57,"location":2,"content":"and this is kind of surprising to us,"},{"from":2235.57,"to":2238.7,"location":2,"content":"a simple classification task where you could just have a standard Softmax."},{"from":2238.7,"to":2242.64,"location":2,"content":"But instead of saying you have a Softmax of entailment, contradiction, and so on,"},{"from":2242.64,"to":2245.01,"location":2,"content":"you just basically, uh,"},{"from":2245.01,"to":2248.01,"location":2,"content":"point to the word entailment in the question."},{"from":2248.01,"to":2252.05,"location":2,"content":"And that was also the case for Winograd Schemas that also benefited a lot,"},{"from":2252.05,"to":2254,"location":2,"content":"uh, from this pointer mechanism."},{"from":2254,"to":2256.19,"location":2,"content":"[NOISE]"},{"from":2256.19,"to":2256.88,"location":2,"content":"Can you explain that?"},{"from":2256.88,"to":2259.49,"location":2,"content":"Sure. Um, can we explain it? 
Why-"},{"from":2259.49,"to":2261.47,"location":2,"content":"[inaudible]"},{"from":2261.47,"to":2262.76,"location":2,"content":"Why does it help so much?"},{"from":2262.76,"to":2264.98,"location":2,"content":"Um, in some ways,"},{"from":2264.98,"to":2267.86,"location":2,"content":"I think partly is the whole architecture"},{"from":2267.86,"to":2271.16,"location":2,"content":"has been gotten- has gotten better and better at pointing."},{"from":2271.16,"to":2273.32,"location":2,"content":"And part of the reason we actually do very,"},{"from":2273.32,"to":2274.73,"location":2,"content":"very poorly in translation,"},{"from":2274.73,"to":2279.02,"location":2,"content":"which is the only task that hurt in the- our first experiments a lot, uh,"},{"from":2279.02,"to":2282.5,"location":2,"content":"in the multitask setting is that that is the only task that now has to generate,"},{"from":2282.5,"to":2285.44,"location":2,"content":"uh, results from a completely separate Softmax,"},{"from":2285.44,"to":2287.66,"location":2,"content":"whereas the rest of the architecture got really,"},{"from":2287.66,"to":2292.53,"location":2,"content":"really good at pointing to things to answer questions, any kind of question."},{"from":2292.53,"to":2295.55,"location":2,"content":"Uh, and so but in some ways,"},{"from":2295.55,"to":2297.56,"location":2,"content":"I think that is one explanation,"},{"from":2297.56,"to":2299.72,"location":2,"content":"but I- I don't think it's- it's all of it."},{"from":2299.72,"to":2309.01,"location":2,"content":"I think we still need to figure out more why this happens. All right."},{"from":2309.01,"to":2312.2,"location":2,"content":"Now, multitask learning is the most"},{"from":2312.2,"to":2315.47,"location":2,"content":"helpful when it comes to zero-shot and I'm actually very excited about that."},{"from":2315.47,"to":2319.84,"location":2,"content":"So this is a zero-shot relation extraction where you have different kinds of, uh,"},{"from":2319.84,"to":2322.43,"location":2,"content":"relations that you might wanna extract and you might have never"},{"from":2322.43,"to":2325.55,"location":2,"content":"seen like the student-teacher relationship that you're trying"},{"from":2325.55,"to":2327.86,"location":2,"content":"to identify in a certain context or"},{"from":2327.86,"to":2331.74,"location":2,"content":"a product company relationship or something like that."},{"from":2331.74,"to":2335.48,"location":2,"content":"And so, uh, that one actually, uh,"},{"from":2335.48,"to":2338.18,"location":2,"content":"benefited a lot and almost got twice, uh,"},{"from":2338.18,"to":2340.28,"location":2,"content":"as high in terms of the accuracy, uh,"},{"from":2340.28,"to":2342.38,"location":2,"content":"when you learned it with everything else."},{"from":2342.38,"to":2344.36,"location":2,"content":"So these were questions, it's never seen before,"},{"from":2344.36,"to":2346.26,"location":2,"content":"relations that it's never seen before,"},{"from":2346.26,"to":2348.72,"location":2,"content":"and it got twice as good, uh,"},{"from":2348.72,"to":2353.21,"location":2,"content":"and benefited a lot especially from having seen other kinds of questions."},{"from":2353.21,"to":2356.87,"location":2,"content":"And in some ways, we have to give a lot of credit to SQuAD too,"},{"from":2356.87,"to":2358.89,"location":2,"content":"uh, because SQuAD as a dataset,"},{"from":2358.89,"to":2364.76,"location":2,"content":"uh, kind of pushed people into thinking about pointers as a mechanism to generate 
answers."},{"from":2364.76,"to":2368.75,"location":2,"content":"And pointers, we kind of see them like as a given and they don't get that much credit,"},{"from":2368.75,"to":2373.53,"location":2,"content":"but they allow you to predict answers that you've never seen before at training time."},{"from":2373.53,"to":2376.04,"location":2,"content":"To generate words, you've never seen before at training time,"},{"from":2376.04,"to":2379.85,"location":2,"content":"which is actually quite- quite amazing. All right."},{"from":2379.85,"to":2383.09,"location":2,"content":"Now, the main observation though"},{"from":2383.09,"to":2386.81,"location":2,"content":"here is that you still if you had an Oracle that would tell you"},{"from":2386.81,"to":2390.28,"location":2,"content":"exactly which task you're currently in"},{"from":2390.28,"to":2394.68,"location":2,"content":"and you would be perfectly kind of separating these into 10 different models,"},{"from":2394.68,"to":2398.95,"location":2,"content":"maybe they're all the same architecture but there's still 10 different models, then, uh,"},{"from":2398.95,"to":2402.41,"location":2,"content":"you would actually still do slightly better,"},{"from":2402.41,"to":2406.53,"location":2,"content":"uh, than the first version of this multitask learning model."},{"from":2406.53,"to":2409.07,"location":2,"content":"And that is largely because we"},{"from":2409.07,"to":2412.43,"location":2,"content":"chose to include a bunch of different tasks that have nothing to do"},{"from":2412.43,"to":2415.13,"location":2,"content":"with one another and we wanted the community to start"},{"from":2415.13,"to":2418.31,"location":2,"content":"thinking about tackling catastrophic interference, right?"},{"from":2418.31,"to":2421.68,"location":2,"content":"If you learn like a new language or, you know,"},{"from":2421.68,"to":2424.67,"location":2,"content":"you learn how to understand social media on Twitter,"},{"from":2424.67,"to":2426.86,"location":2,"content":"you don't replace all your language,"},{"from":2426.86,"to":2428.82,"location":2,"content":"uh, you know, in- in your brain."},{"from":2428.82,"to":2430.82,"location":2,"content":"You have one brain, it keeps getting smarter,"},{"from":2430.82,"to":2432.07,"location":2,"content":"you keep learning new skills,"},{"from":2432.07,"to":2435.14,"location":2,"content":"even when that skills that are new to you are very,"},{"from":2435.14,"to":2436.52,"location":2,"content":"very different from old skills."},{"from":2436.52,"to":2440.42,"location":2,"content":"So in some ways we may have made our lives too hard,"},{"from":2440.42,"to":2441.77,"location":2,"content":"and now we're actually thinking, okay,"},{"from":2441.77,"to":2444.62,"location":2,"content":"maybe if you wanna publish a nicer paper on multitask learning,"},{"from":2444.62,"to":2446.81,"location":2,"content":"we'll just look at all the tasks that do help each other,"},{"from":2446.81,"to":2448.88,"location":2,"content":"and then we'll just, you know, have groups of tasks,"},{"from":2448.88,"to":2451.45,"location":2,"content":"and then I can very quickly publish,"},{"from":2451.45,"to":2454.01,"location":2,"content":"uh, some, some nice state-of-the-art papers."},{"from":2454.01,"to":2457.37,"location":2,"content":"But basically here, uh, we're still, uh,"},{"from":2457.37,"to":2463.91,"location":2,"content":"quite significantly away in the decaScore between 10 different models and a single model."},{"from":2463.91,"to":2466.28,"location":2,"content":"Now, this of 
course is kind of an oracle score,"},{"from":2466.28,"to":2469.8,"location":2,"content":"that's why we put it in parentheses because you don't actually have this oracle."},{"from":2469.8,"to":2471.26,"location":2,"content":"And in some cases,"},{"from":2471.26,"to":2473.78,"location":2,"content":"it's quite easy to build an almost perfect classifier."},{"from":2473.78,"to":2476.61,"location":2,"content":"So, you know, separating what is the summary"},{"from":2476.61,"to":2479.81,"location":2,"content":"based on that question and what is the translation from English to German,"},{"from":2479.81,"to":2481.61,"location":2,"content":"you can do with almost 100 percent accuracy."},{"from":2481.61,"to":2485.09,"location":2,"content":"Uh, but, uh, SQuAD, question-answering,"},{"from":2485.09,"to":2486.66,"location":2,"content":"and zero-shot relation extraction,"},{"from":2486.66,"to":2489.57,"location":2,"content":"and question-answering as a semantic role labeling,"},{"from":2489.57,"to":2493.22,"location":2,"content":"those are actually easily confused in terms of how"},{"from":2493.22,"to":2497.33,"location":2,"content":"to generate the answers and you wouldn't quite know,"},{"from":2497.33,"to":2500.87,"location":2,"content":"uh, which into which model to route, uh, this."},{"from":2500.87,"to":2504.93,"location":2,"content":"So in some sense, this is kind of theoretical. All right."},{"from":2504.93,"to":2507.71,"location":2,"content":"Now, I mentioned that we have this prob- this"},{"from":2507.71,"to":2511.73,"location":2,"content":"complexity in the optimization strategy and this is one of the many,"},{"from":2511.73,"to":2515.8,"location":2,"content":"um, sort of problems that don't get that much, uh, coverage."},{"from":2515.8,"to":2517.53,"location":2,"content":"But when you have a very,"},{"from":2517.53,"to":2519.78,"location":2,"content":"uh, imbalanced or skewed dataset,"},{"from":2519.78,"to":2525.01,"location":2,"content":"it's easy to lose track and basically overpower the smaller dataset tasks."},{"from":2525.01,"to":2527.51,"location":2,"content":"And so, uh, the first, uh,"},{"from":2527.51,"to":2530.78,"location":2,"content":"simplest training- we actually tried a ton of different training strategies,"},{"from":2530.78,"to":2533.6,"location":2,"content":"but in the end, this fully joint one worked quite well."},{"from":2533.6,"to":2538.16,"location":2,"content":"But actually promised to ask go wait for questions, uh, on this table."},{"from":2538.16,"to":2540.68,"location":2,"content":"So any questions on all these results so far? Yeah?"},{"from":2540.68,"to":2544.55,"location":2,"content":"So, uh, [NOISE] since you mentioned that if you had"},{"from":2544.55,"to":2546.74,"location":2,"content":"an oracle that will tell you which task it is and"},{"from":2546.74,"to":2549.22,"location":2,"content":"you have two better ways having 10 different ones."},{"from":2549.22,"to":2552.44,"location":2,"content":"So really try training a model on"},{"from":2552.44,"to":2555.71,"location":2,"content":"like data meaning what task is interested in this particular version?"},{"from":2555.71,"to":2558.31,"location":2,"content":"We did. 
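A toy illustration of this routing difficulty, with a made-up keyword rule: tasks with distinctive questions separate easily, but the question-answering-style tasks look alike from the question alone:

```python
# Made-up keyword router illustrating the point above: translation and
# summarization questions separate almost perfectly, while SQuAD-style QA,
# QA-as-semantic-role-labeling, and zero-shot relation extraction all
# present as plain questions.
def route(question):
    q = question.lower()
    if "translation" in q:
        return "translation model"
    if "summary" in q:
        return "summarization model"
    return "one of several QA-style models?"  # ambiguous from the question alone

print(route("What is the translation from English to German?"))
print(route("Who gave the speech?"))  # SQuAD? QA-SRL? relation extraction?
```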
And so it- it confused, you know,"},{"from":2558.31,"to":2562.24,"location":2,"content":"SQuAD and- and those too the quest- the other- basically the other,"},{"from":2562.24,"to":2567.26,"location":2,"content":"uh, two types of problems that were also cast as question answering."},{"from":2567.26,"to":2569.35,"location":2,"content":"So it confused those."},{"from":2569.35,"to":2573.49,"location":2,"content":"Um, but then a lot of the others, it was able to like, very perfectly do it."},{"from":2573.49,"to":2576.19,"location":2,"content":"But then you basically, as soon as you,"},{"from":2576.19,"to":2581.11,"location":2,"content":"uh, were to try to then build a whole model and get a decaScore,"},{"from":2581.11,"to":2585.39,"location":2,"content":"if your- if your classifier is even like 90 percent accurate,"},{"from":2585.39,"to":2588.53,"location":2,"content":"you basically multiply this by 0.9 and"},{"from":2588.53,"to":2591.68,"location":2,"content":"you get dinged so hard that it- it's not competitive anymore."},{"from":2591.68,"to":2594.35,"location":2,"content":"So it is actually hard if you try to just build"},{"from":2594.35,"to":2597.08,"location":2,"content":"that whole system and keep adding sort of if-then-else statements,"},{"from":2597.08,"to":2598.88,"location":2,"content":"uh, to make that, uh,"},{"from":2598.88,"to":2600.89,"location":2,"content":"into sort of a single system. Yeah?"},{"from":2600.89,"to":2604.09,"location":2,"content":"Have you tried telling the model what kind of task it's doing,"},{"from":2604.09,"to":2607.33,"location":2,"content":"just giving that indicator of the kind of task quickly?"},{"from":2607.33,"to":2609.01,"location":2,"content":"I mean, in some ways,"},{"from":2609.01,"to":2610.12,"location":2,"content":"we did in this case,"},{"from":2610.12,"to":2613.36,"location":2,"content":"because we only trained each model separately on it."},{"from":2613.36,"to":2614.28,"location":2,"content":"[inaudible]"},{"from":2614.28,"to":2616.91,"location":2,"content":"Um, only through the question."},{"from":2616.91,"to":2619.18,"location":2,"content":"Yeah. 
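The "multiply by 0.9" argument in back-of-the-envelope form, with hypothetical per-task scores:

```python
# Back-of-the-envelope version of the argument above, with made-up numbers:
# if a misrouted example scores ~0, the expected decaScore of 10 single-task
# models behind a router scales with the router's accuracy.
per_task = [74.0, 25.5, 24.0, 72.0, 86.0, 75.0, 80.0, 84.0, 62.0, 40.0]  # hypothetical
oracle_deca = sum(per_task)        # perfect routing: 622.5
routed_deca = 0.9 * oracle_deca    # 90%-accurate router: 560.25
print(oracle_deca, routed_deca)    # the routing penalty dwarfs typical model gaps
```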
Because I was thinking the"},{"from":2619.18,"to":2622.76,"location":2,"content":"um, maybe it's not that important that the model figure out what we want it to"},{"from":2622.76,"to":2624.97,"location":2,"content":"do in- in a practical [NOISE] application"},{"from":2624.97,"to":2627.56,"location":2,"content":"if we could just tell it what we want it to do right now?"},{"from":2627.56,"to":2629.42,"location":2,"content":"In some cases, you could tell."},{"from":2629.42,"to":2631.43,"location":2,"content":"Uh, so the question is sort of,"},{"from":2631.43,"to":2633.26,"location":2,"content":"uh, and even in the multitask setting,"},{"from":2633.26,"to":2636.09,"location":2,"content":"you could have like an extra kind of token to say,"},{"from":2636.09,"to":2638.15,"location":2,"content":"\"Now, you're doing summarization."},{"from":2638.15,"to":2639.95,"location":2,"content":"So, and that's another input.\""},{"from":2639.95,"to":2641.26,"location":2,"content":"Uh, in some ways,"},{"from":2641.26,"to":2643.61,"location":2,"content":"whether you have a summarization token,"},{"from":2643.61,"to":2645.65,"location":2,"content":"uh, or you ask what is the summary?"},{"from":2645.65,"to":2648.13,"location":2,"content":"It actually I don't think makes that big of a difference."},{"from":2648.13,"to":2651.19,"location":2,"content":"It's just now you can query this model in"},{"from":2651.19,"to":2653.14,"location":2,"content":"very natural language rather than having to know"},{"from":2653.14,"to":2655.6,"location":2,"content":"kind of a special token to, to query the model."},{"from":2655.6,"to":2659.71,"location":2,"content":"Uh, and we'll see actually in a couple of slides that the model is not confused,"},{"from":2659.71,"to":2662.86,"location":2,"content":"uh, when it comes to how to generate the answers."},{"from":2662.86,"to":2664.71,"location":2,"content":"So, for every of the task,"},{"from":2664.71,"to":2668.66,"location":2,"content":"it knows very clearly how to generate the words to get to the right,"},{"from":2668.66,"to":2670.7,"location":2,"content":"to get to, you know, a reasonably accurate answer."},{"from":2670.7,"to":2676.52,"location":2,"content":"[NOISE] Um, in the- [inaudible] does the model"},{"from":2676.52,"to":2682.58,"location":2,"content":"see all of the data and then [inaudible] that class or does it only include a [inaudible]?"},{"from":2682.58,"to":2685.4,"location":2,"content":"Oh, great question. 
So, how do we train, uh, the single task models?"},{"from":2685.4,"to":2687.98,"location":2,"content":"They're only trained on that dataset."},{"from":2687.98,"to":2691.7,"location":2,"content":"So, the SQuAD number here is just a single model that has only seen SQuAD training."},{"from":2691.7,"to":2697.25,"location":2,"content":"[NOISE] So, your point about the,"},{"from":2697.25,"to":2699.05,"location":2,"content":"um, the pointer exception for the, uh,"},{"from":2699.05,"to":2702.31,"location":2,"content":"[inaudible] generally more helpful than [inaudible]?"},{"from":2702.31,"to":2704.83,"location":2,"content":"Somewhat surprisingly, even, ah,"},{"from":2704.83,"to":2706.32,"location":2,"content":"in the case here, uh,"},{"from":2706.32,"to":2709.07,"location":2,"content":"where we had, um, this is MultiNLI,"},{"from":2709.07,"to":2710.69,"location":2,"content":"this particular model, I mean,"},{"from":2710.69,"to":2712.55,"location":2,"content":"if you just have the standard sequence to sequence,"},{"from":2712.55,"to":2714.03,"location":2,"content":"it just generates, you know,"},{"from":2714.03,"to":2716.66,"location":2,"content":"also with a softmax, uh, that label."},{"from":2716.66,"to":2718.64,"location":2,"content":"So in that sense, it's quite similar."},{"from":2718.64,"to":2723.65,"location":2,"content":"Uh, but yeah, it was actually better able to just point, which actually led us, uh,"},{"from":2723.65,"to":2727.73,"location":2,"content":"for a while into thinking about maybe we should have a project where we just say point to"},{"from":2727.73,"to":2732.13,"location":2,"content":"all the things and just get rid of softmax classifiers forever."},{"from":2732.13,"to":2735.89,"location":2,"content":"Um, the problem is when you then try to do translation also,"},{"from":2735.89,"to":2737.21,"location":2,"content":"it's like okay wow,"},{"from":2737.21,"to":2738.39,"location":2,"content":"what do you point to,"},{"from":2738.39,"to":2740.42,"location":2,"content":"and then you kind of pre-train it and do"},{"from":2740.42,"to":2743.75,"location":2,"content":"some alignment and it gets kinda very large and you point to a lot of different like,"},{"from":2743.75,"to":2746.36,"location":2,"content":"you may have like- like tens of thousands of potential candidates."},{"from":2746.36,"to":2749.54,"location":2,"content":"So we kinda discarded it as like a single unifying model for all the things,"},{"from":2749.54,"to":2751.89,"location":2,"content":"but you could point to a lot of different,"},{"from":2751.89,"to":2752.99,"location":2,"content":"like a lot of these tasks,"},{"from":2752.99,"to":2754.28,"location":2,"content":"you could actually point to and"},{"from":2754.28,"to":2761.44,"location":2,"content":"I think it's another interesting side project that could spawn from this, yeah."},{"from":2761.44,"to":2763.74,"location":2,"content":"Just a quick question to how,"},{"from":2763.74,"to":2766.91,"location":2,"content":"how sensitive [inaudible] how sensitive, uh,"},{"from":2766.91,"to":2769.85,"location":2,"content":"the individual components [inaudible] was when you"},{"from":2769.85,"to":2773.24,"location":2,"content":"slightly perturb the relative weights of them in the loss function?"},{"from":2773.24,"to":2776.86,"location":2,"content":"So, we -- the question is, uh, how, um,"},{"from":2776.86,"to":2779.8,"location":2,"content":"sensitive were the tasks if we were to,"},{"from":2779.8,"to":2782.82,"location":2,"content":"um, add weights to the different 
tasks?"},{"from":2782.82,"to":2787.49,"location":2,"content":"We [NOISE] did in the optimization kind of did a lot of trickery on"},{"from":2787.49,"to":2792.08,"location":2,"content":"how to train it but we never said this task only matters like 0.5 or something."},{"from":2792.08,"to":2794.93,"location":2,"content":"So, we didn't do that analysis. Yeah?"},{"from":2794.93,"to":2797.99,"location":2,"content":"Co-attention seems to be a burden a little bit."},{"from":2797.99,"to":2799.07,"location":2,"content":"In some cases, yeah."},{"from":2799.07,"to":2804.43,"location":2,"content":"Is it the [inaudible] co-attention and order but no co-attention or is that kind of like,"},{"from":2804.43,"to":2807.32,"location":2,"content":"\"Oh, you already saw the test data so, like, you can't use these.\""},{"from":2807.32,"to":2809.05,"location":2,"content":"I mean, these are all dep sets."},{"from":2809.05,"to":2813.56,"location":2,"content":"Um, but it's, you could definitely do even more architecture engineering."},{"from":2813.56,"to":2815.9,"location":2,"content":"In fact, there's this whole field which I don't think"},{"from":2815.9,"to":2818.69,"location":2,"content":"you gotten to, right, neural architecture search?"},{"from":2818.69,"to":2822.51,"location":2,"content":"Yeah. So like you can actually combine your reinforcement learning, um,"},{"from":2822.51,"to":2825.7,"location":2,"content":"and you say the action space for the reinforcement learning agent"},{"from":2825.7,"to":2827.36,"location":2,"content":"are trying to have a couple of"},{"from":2827.36,"to":2829.58,"location":2,"content":"different modules of neural nets like maybe you want to have"},{"from":2829.58,"to":2831.18,"location":2,"content":"like a CNN layer and then like"},{"from":2831.18,"to":2834.32,"location":2,"content":"a memory layer and then an LSTM layer and maybe it's bidirectional and you"},{"from":2834.32,"to":2839.47,"location":2,"content":"basically let a reinforcement learning agent figure out all of these decisions."},{"from":2839.47,"to":2842.86,"location":2,"content":"Uh, so I think it would be phenomenal to try to apply"},{"from":2842.86,"to":2845.21,"location":2,"content":"neural architecture search not to what's"},{"from":2845.21,"to":2847.79,"location":2,"content":"usually being done which is we already know how to do image classification,"},{"from":2847.79,"to":2850.72,"location":2,"content":"we'll just do it slightly better with NAS, neural architecture search."},{"from":2850.72,"to":2851.93,"location":2,"content":"But we actually try to find"},{"from":2851.93,"to":2854.81,"location":2,"content":"a single architecture for multi-task learning which we don't know."},{"from":2854.81,"to":2858.62,"location":2,"content":"The problem of course is that already getting to these."},{"from":2858.62,"to":2861.47,"location":2,"content":"All these numbers took a lot of compute time and a lot of"},{"from":2861.47,"to":2864.88,"location":2,"content":"fiddling around with stuff and it is, I can,"},{"from":2864.88,"to":2868.99,"location":2,"content":"I can only give you sort of an idea of like how often we'd say,"},{"from":2868.99,"to":2870.89,"location":2,"content":"\"Oh man, we got like this really amazing result"},{"from":2870.89,"to":2873.11,"location":2,"content":"in this task but it needed this learning rate.\""},{"from":2873.11,"to":2875,"location":2,"content":"And it turns out the same model,"},{"from":2875,"to":2877.1,"location":2,"content":"same set of hyperparameters 
everything,"},{"from":2877.1,"to":2881.55,"location":2,"content":"but this other task to get to good performance needed a much higher learning rate."},{"from":2881.55,"to":2885.65,"location":2,"content":"And now, you try to combine those two tasks only together and you're like,"},{"from":2885.65,"to":2887.34,"location":2,"content":"\"Okay, how do you choose your learning rate now?\""},{"from":2887.34,"to":2889.07,"location":2,"content":"You choose the, you know,"},{"from":2889.07,"to":2891.65,"location":2,"content":"if you choose the task, the learning rate from the task that is, you know,"},{"from":2891.65,"to":2893.78,"location":2,"content":"bigger than the smaller tasks just doesn't work"},{"from":2893.78,"to":2895.97,"location":2,"content":"well at all because it needed this higher learning rate."},{"from":2895.97,"to":2899.41,"location":2,"content":"If you'd use the higher learning rate that the smaller task and the smaller dataset,"},{"from":2899.41,"to":2903.99,"location":2,"content":"uh, did really well on then the large one just overfits and doesn't work well either."},{"from":2903.99,"to":2905.96,"location":2,"content":"If you try to do the average, neither of the two work."},{"from":2905.96,"to":2909.56,"location":2,"content":"Like there's a lot of complexity in trying to do multitask learning."},{"from":2909.56,"to":2915.1,"location":2,"content":"That's why, that's why it's such an interesting I think, uh, research challenge."},{"from":2915.1,"to":2918.41,"location":2,"content":"All right, any more questions about this first set of results?"},{"from":2918.41,"to":2919.78,"location":2,"content":"They get, they will get better."},{"from":2919.78,"to":2922.27,"location":2,"content":"We, we have, we have had some ideas already,"},{"from":2922.27,"to":2927.25,"location":2,"content":"uh, on, on how to improve them."},{"from":2927.25,"to":2929.78,"location":2,"content":"All right. 
So, uh,"},{"from":2929.78,"to":2931.78,"location":2,"content":"how did we actually train this whole thing?"},{"from":2931.78,"to":2934.89,"location":2,"content":"Um, we had tried a lot of different things but in the end, uh,"},{"from":2934.89,"to":2938.99,"location":2,"content":"this very simple fully joint training strategy actually worked the best."},{"from":2938.99,"to":2942.8,"location":2,"content":"Uh, and that is you basically take a mini batch from each of"},{"from":2942.8,"to":2947.54,"location":2,"content":"the different tasks and you just train on that mini batch from that task."},{"from":2947.54,"to":2951.47,"location":2,"content":"So basically just going through all the 10 tasks and then round robin,"},{"from":2951.47,"to":2953.69,"location":2,"content":"uh, go through them."},{"from":2953.69,"to":2956.82,"location":2,"content":"Um, now it turns out, ah,"},{"from":2956.82,"to":2959.09,"location":2,"content":"that that does not work,"},{"from":2959.09,"to":2961.46,"location":2,"content":"uh, quite as well, uh,"},{"from":2961.46,"to":2966.05,"location":2,"content":"as another training strategy and if you look into optimization,"},{"from":2966.05,"to":2967.68,"location":2,"content":"uh, strategies in neural nets, uh,"},{"from":2967.68,"to":2969.17,"location":2,"content":"there are actually a couple of papers on"},{"from":2969.17,"to":2971.72,"location":2,"content":"so-called curriculum learning, where the idea is,"},{"from":2971.72,"to":2976.43,"location":2,"content":"you start with training your model with simple pro- simple instances of your problems."},{"from":2976.43,"to":2978.83,"location":2,"content":"So, in translation, for instance you start training with"},{"from":2978.83,"to":2981.99,"location":2,"content":"very short sentences and then you go to larger and larger,"},{"from":2981.99,"to":2984.56,"location":2,"content":"uh, sentences, uh, or longer and longer sentences."},{"from":2984.56,"to":2987.55,"location":2,"content":"Uh, now it turns out for multi-task learning,"},{"from":2987.55,"to":2989.28,"location":2,"content":"you actually want to do the opposite."},{"from":2989.28,"to":2992.05,"location":2,"content":"You wanna do anti-curriculum learning."},{"from":2992.05,"to":2995.33,"location":2,"content":"Uh, and that is you start with the hardest tasks and you iterate on"},{"from":2995.33,"to":2998.93,"location":2,"content":"those for a while and then you add the simple tasks later on."},{"from":2998.93,"to":3002.05,"location":2,"content":"And to some degree, I think this is intuitive because when"},{"from":3002.05,"to":3007.78,"location":2,"content":"you train this very gigantic and powerful model,"},{"from":3007.78,"to":3011.02,"location":2,"content":"uh, on a very simple task like"},{"from":3011.02,"to":3014.51,"location":2,"content":"sentiment and you just need to classify everything to be positive or negative."},{"from":3014.51,"to":3018.22,"location":2,"content":"You train all of these weights and you arrive at sort of, uh,"},{"from":3018.22,"to":3020.71,"location":2,"content":"local optima that are quite deep and very"},{"from":3020.71,"to":3024.37,"location":2,"content":"specific to just generating these two words and if you then try to get out of that,"},{"from":3024.37,"to":3027.43,"location":2,"content":"out of this local optimum for that very simple task"},{"from":3027.43,"to":3030.66,"location":2,"content":"and then try to generate all these other kinds of words and point to different,"},{"from":3030.66,"to":3033.93,"location":2,"content":"you know, words 
it's never seen before then SQuAD,"},{"from":3033.93,"to":3036.94,"location":2,"content":"it's very very hard to come out of that local optimum."},{"from":3036.94,"to":3040.97,"location":2,"content":"And that is sort of my intuition of why it actually makes more sense to say,"},{"from":3040.97,"to":3044.93,"location":2,"content":"\"Let's start with SQuAD and machine translation and a couple of these harder tasks."},{"from":3044.93,"to":3047.02,"location":2,"content":"We'll make the model very general purpose."},{"from":3047.02,"to":3048.91,"location":2,"content":"It has to generate a lot of different things,"},{"from":3048.91,"to":3052.24,"location":2,"content":"create a softmax, German words,"},{"from":3052.24,"to":3054.46,"location":2,"content":"it has to point to all kinds of"},{"from":3054.46,"to":3057.89,"location":2,"content":"different words and be able to parse all kinds of different Wikipedia paragraphs.\""},{"from":3057.89,"to":3061.32,"location":2,"content":"And you do that a couple of times and then once you've finished,"},{"from":3061.32,"to":3063.19,"location":2,"content":"uh, this sort of pre-training, uh,"},{"from":3063.19,"to":3069.22,"location":2,"content":"stage or anti-curriculum, then you move on and add sort of the simpler smaller tasks."},{"from":3069.22,"to":3071.59,"location":2,"content":"So [NOISE] with that, uh,"},{"from":3071.59,"to":3075.09,"location":2,"content":"relatively simple change that did take us,"},{"from":3075.09,"to":3077.45,"location":2,"content":"uh, a lot of different experiments to get to."},{"from":3077.45,"to":3080.2,"location":2,"content":"Um, we actually, uh,"},{"from":3080.2,"to":3082.05,"location":2,"content":"closed or, uh, um,"},{"from":3082.05,"to":3085.57,"location":2,"content":"went closer to closing that gap and now, um,"},{"from":3085.57,"to":3090.33,"location":2,"content":"we're only sort of, um, 14, uh, away."},{"from":3090.33,"to":3092.78,"location":2,"content":"Right, yeah, uh, 14 or so."},{"from":3092.78,"to":3095.18,"location":2,"content":"Uh, but there's still, uh,"},{"from":3095.18,"to":3097.7,"location":2,"content":"a big gap and the biggest, uh,"},{"from":3097.7,"to":3100.88,"location":2,"content":"nuisance and issue that we had was with a translation."},{"from":3100.88,"to":3102.84,"location":2,"content":"Basically, if you look at all of these,"},{"from":3102.84,"to":3104.91,"location":2,"content":"most things are kind of similar,"},{"from":3104.91,"to":3109.16,"location":2,"content":"get slightly better, um and it's sort of a toss up but then and,"},{"from":3109.16,"to":3112.13,"location":2,"content":"and roughly similar, but translation was really bad."},{"from":3112.13,"to":3113.45,"location":2,"content":"It's almost only half, uh,"},{"from":3113.45,"to":3116.42,"location":2,"content":"the performance in the multitask learning setup,"},{"from":3116.42,"to":3120.11,"location":2,"content":"and part of that is because translation was the only task that had"},{"from":3120.11,"to":3125.96,"location":2,"content":"a very large Softmax vocabulary of words that were in no other task."},{"from":3125.96,"to":3128.07,"location":2,"content":"And most of the other tasks,"},{"from":3128.07,"to":3130.43,"location":2,"content":"actually were doing really well with pointing."},{"from":3130.43,"to":3134.57,"location":2,"content":"And so, uh, my interpretation of this was that the intermediate layers,"},{"from":3134.57,"to":3136.55,"location":2,"content":"all these representations that we learned 
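Anti-curriculum training, as just described, only changes the schedule: iterate on the hardest tasks for a while, then fall back to the fully joint round robin over everything. A sketch reusing the hypothetical `fully_joint_training` helper above; the task names and step counts here are illustrative assumptions, not the actual settings.

```python
def anti_curriculum_training(model, optimizer, task_loaders,
                             hard_tasks=("squad", "mt_en_de"),
                             warmup_steps=50_000, joint_steps=200_000):
    # Phase 1: pre-train on the hard, generation-heavy tasks only.
    hard = {k: v for k, v in task_loaders.items() if k in hard_tasks}
    fully_joint_training(model, optimizer, hard, warmup_steps)
    # Phase 2: add the simpler, smaller tasks and train on everything.
    fully_joint_training(model, optimizer, task_loaders, joint_steps)
```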
with"},{"from":3136.55,"to":3139.52,"location":2,"content":"bi-directional LSTMs and transformers, they got really,"},{"from":3139.52,"to":3141.88,"location":2,"content":"really good at being pointed to,"},{"from":3141.88,"to":3147.56,"location":2,"content":"like creating hidden representations that the answer module can point to very accurately."},{"from":3147.56,"to":3149.47,"location":2,"content":"And then you have this one task that is like,"},{"from":3149.47,"to":3151.09,"location":2,"content":"I don't point to almost anything,"},{"from":3151.09,"to":3154.24,"location":2,"content":"I basically just generate other words and then different vocabulary."},{"from":3154.24,"to":3157.61,"location":2,"content":"And so those hidden representations became less useful for that task."},{"from":3157.61,"to":3161.36,"location":2,"content":"And so, that was one of the insights and that led"},{"from":3161.36,"to":3165.02,"location":2,"content":"to one of the ways of trying to improve this."},{"from":3165.02,"to":3167.61,"location":2,"content":"Now, one of the interesting issues that we had is,"},{"from":3167.61,"to":3169.04,"location":2,"content":"when we improved the model,"},{"from":3169.04,"to":3171.5,"location":2,"content":"the multi-single model for all 10 tasks,"},{"from":3171.5,"to":3173.09,"location":2,"content":"a lot of times we said, well,"},{"from":3173.09,"to":3175.28,"location":2,"content":"but now we also have to go back and run"},{"from":3175.28,"to":3179.06,"location":2,"content":"10 more experiments on all the single tasks to have a proper comparison, right?"},{"from":3179.06,"to":3181.28,"location":2,"content":"Because if you tune the thing you care about,"},{"from":3181.28,"to":3184.79,"location":2,"content":"and you stop tuning the thing you wanna show you can do better than,"},{"from":3184.79,"to":3186.28,"location":2,"content":"then that's not fair."},{"from":3186.28,"to":3189.47,"location":2,"content":"Uh, so you always wanna give as much, uh,"},{"from":3189.47,"to":3193.66,"location":2,"content":"TLC and focus and experiment time to your baselines."},{"from":3193.66,"to":3198.67,"location":2,"content":"And so, uh, in some cases we actually,"},{"from":3198.67,"to":3202.41,"location":2,"content":"uh, improved some- improved something."},{"from":3202.41,"to":3206.49,"location":2,"content":"But then, we improve both the 10 separate models and our model,"},{"from":3206.49,"to":3209.09,"location":2,"content":"and some cases like the 10 separate models improved, even more."},{"from":3209.09,"to":3210.49,"location":2,"content":"So the gap got even larger."},{"from":3210.49,"to":3212.72,"location":2,"content":"It's kind of the opposite of what we wanted to show, but in general,"},{"from":3212.72,"to":3214.22,"location":2,"content":"it's better for both tests,"},{"from":3214.22,"to":3216.53,"location":2,"content":"uh, for the architecture overall."},{"from":3216.53,"to":3217.97,"location":2,"content":"So basically, we started, uh,"},{"from":3217.97,"to":3220.22,"location":2,"content":"with this fully joint training and we have"},{"from":3220.22,"to":3222.51,"location":2,"content":"this sort of set of single models that we could,"},{"from":3222.51,"to":3224.15,"location":2,"content":"in theory with some oracle,"},{"from":3224.15,"to":3225.34,"location":2,"content":"kind of just sum up, uh,"},{"from":3225.34,"to":3227.01,"location":2,"content":"in their scores, to get a decaScore."},{"from":3227.01,"to":3229.11,"location":2,"content":"So the gap started at 
23."},{"from":3229.11,"to":3233.03,"location":2,"content":"And then, uh, we basically did this anti-curriculum training,"},{"from":3233.03,"to":3235.79,"location":2,"content":"uh, which, uh, lowered the gap to 15."},{"from":3235.79,"to":3237.38,"location":2,"content":"So we're kind of excited,"},{"from":3237.38,"to":3238.76,"location":2,"content":"uh, making good progress."},{"from":3238.76,"to":3239.93,"location":2,"content":"Then we switched, uh,"},{"from":3239.93,"to":3241.88,"location":2,"content":"from GloVe and use CoVe."},{"from":3241.88,"to":3244.05,"location":2,"content":"So contextual vectors, um,"},{"from":3244.05,"to":3246.32,"location":2,"content":"which actually increased the gap a lot again."},{"from":3246.32,"to":3249.32,"location":2,"content":"So everything got better, but the 10 separate models got"},{"from":3249.32,"to":3253,"location":2,"content":"even better than the one single model that does the 10 tasks."},{"from":3253,"to":3254.65,"location":2,"content":"Um, so the gap got bigger,"},{"from":3254.65,"to":3257.14,"location":2,"content":"but everybody's performance increased."},{"from":3257.14,"to":3259.51,"location":2,"content":"So it was still overall a good thing."},{"from":3259.51,"to":3262.78,"location":2,"content":"Uh, and then, uh, we basically figured,"},{"from":3262.78,"to":3264.61,"location":2,"content":"especially with this machine translation issue,"},{"from":3264.61,"to":3266.47,"location":2,"content":"we shouldn't just pre-train on SQuAD,"},{"from":3266.47,"to":3270.1,"location":2,"content":"but we also should include machine translation in"},{"from":3270.1,"to":3274.84,"location":2,"content":"this pre-training in the beginning so the model doesn't just start learning to point."},{"from":3274.84,"to":3277.63,"location":2,"content":"Um, and that helped us, uh,"},{"from":3277.63,"to":3280.16,"location":2,"content":"to reduce the gap between the 10 separate models,"},{"from":3280.16,"to":3283.09,"location":2,"content":"Oracle, and the single model to about five points."},{"from":3283.09,"to":3284.69,"location":2,"content":"And then, uh, we basically said,"},{"from":3284.69,"to":3286.64,"location":2,"content":"okay, translation is still not that good."},{"from":3286.64,"to":3287.78,"location":2,"content":"We just keep oversampling."},{"from":3287.78,"to":3292.76,"location":2,"content":"So, every time we go through one of these round robin mini-batch sets,"},{"from":3292.76,"to":3294.74,"location":2,"content":"we just always include machine translation."},{"from":3294.74,"to":3299.27,"location":2,"content":"And that basically allowed us to then reduce the gap,"},{"from":3299.27,"to":3301.03,"location":2,"content":"uh, to just a single point."},{"from":3301.03,"to":3303.59,"location":2,"content":"So now, uh, we started, uh,"},{"from":3303.59,"to":3306.65,"location":2,"content":"couple of, several months ago, uh, at 586."},{"from":3306.65,"to":3308.96,"location":2,"content":"And now the single, uh,"},{"from":3308.96,"to":3311.33,"location":2,"content":"oracle with 10 different models,"},{"from":3311.33,"to":3312.56,"location":2,"content":"if you were to sum them up,"},{"from":3312.56,"to":3316.1,"location":2,"content":"get 618, uh, and the, you know,"},{"from":3316.1,"to":3319.99,"location":2,"content":"better contextual vectors and tuning and adding a lot more translation,"},{"from":3319.99,"to":3323.21,"location":2,"content":"and translation is still not as good as we would like it to be, uh,"},{"from":3323.21,"to":3326.53,"location":2,"content":"but now, 
several of the other tasks benefited a bunch."},{"from":3326.53,"to":3330.14,"location":2,"content":"And now we're basically one decaScore away from"},{"from":3330.14,"to":3333.74,"location":2,"content":"having a single model that does as well as 10 different ones."},{"from":3333.74,"to":3336.39,"location":2,"content":"And you can basically,"},{"from":3336.39,"to":3338.53,"location":2,"content":"you could run even more experiments,"},{"from":3338.53,"to":3341.93,"location":2,"content":"in some ways you could burn millions of dollars on AWS cost here,"},{"from":3341.93,"to":3347.18,"location":2,"content":"because most of the time we kept the hyperparameters of these different models the same."},{"from":3347.18,"to":3349.39,"location":2,"content":"Like each of these, you could also say, well,"},{"from":3349.39,"to":3352.01,"location":2,"content":"maybe this multitask model needs to have 50 more layers,"},{"from":3352.01,"to":3353.72,"location":2,"content":"or maybe 19 more layers,"},{"from":3353.72,"to":3356.22,"location":2,"content":"or maybe five more layers and maybe they should be 1000,"},{"from":3356.22,"to":3357.86,"location":2,"content":"you know, wider in their hidden dimensions."},{"from":3357.86,"to":3361.31,"location":2,"content":"And you could basically run a lot more experiments."},{"from":3361.31,"to":3363.83,"location":2,"content":"Maybe hopefully, eventually, the community jointly does that,"},{"from":3363.83,"to":3366.17,"location":2,"content":"and then we can kind of move, move towards that."},{"from":3366.17,"to":3368.48,"location":2,"content":"But we figured, okay, we're pretty close,"},{"from":3368.48,"to":3373.85,"location":2,"content":"so we moved on to some other things which maybe I'll tell you about next year."},{"from":3373.85,"to":3376.72,"location":2,"content":"[LAUGHTER] But basically, um,"},{"from":3376.72,"to":3378.98,"location":2,"content":"let's do some analysis of what happened in this project."},{"from":3378.98,"to":3382.24,"location":2,"content":"And this is kind of, I think something that I would encourage you all to do as well."},{"from":3382.24,"to":3385.46,"location":2,"content":"Like you, you can chase the numbers for a while and in some ways,"},{"from":3385.46,"to":3388.39,"location":2,"content":"you should always be skeptical about your evaluations."},{"from":3388.39,"to":3389.78,"location":2,"content":"And in some cases,"},{"from":3389.78,"to":3393.23,"location":2,"content":"you've seen- we've seen in the NLP community people"},{"from":3393.23,"to":3396.93,"location":2,"content":"like basically just optimize BLEU scores for translation for years."},{"from":3396.93,"to":3398.69,"location":2,"content":"And then somebody came out with a paper and said, well,"},{"from":3398.69,"to":3404.51,"location":2,"content":"it turns out BLEU metrics and human evaluations on how good of a translation is this,"},{"from":3404.51,"to":3406.18,"location":2,"content":"aren't actually that correlated."},{"from":3406.18,"to":3408.32,"location":2,"content":"And you're like, ah, that that sucks,"},{"from":3408.32,"to":3413,"location":2,"content":"we just spent years of our lives tuning that metric and publishing a bunch of papers."},{"from":3413,"to":3417.29,"location":2,"content":"Um, and so in some ways all of these metrics have flaws, uh, you know,"},{"from":3417.29,"to":3420.14,"location":2,"content":"ROUGE scores; summarization is a super,"},{"from":3420.14,"to":3423.38,"location":2,"content":"uh, subjective kind of a 
task."},{"from":3423.38,"to":3425.47,"location":2,"content":"And summarization, for instance,"},{"from":3425.47,"to":3427.73,"location":2,"content":"when you analyze the errors, uh,"},{"from":3427.73,"to":3430.59,"location":2,"content":"you often realize that word vectors have problems too."},{"from":3430.59,"to":3432.92,"location":2,"content":"So, for instance, the word vector for Jason, John,"},{"from":3432.92,"to":3435.29,"location":2,"content":"and Jeremy are all kind of the same, right?"},{"from":3435.29,"to":3436.94,"location":2,"content":"They all have similar, uh,"},{"from":3436.94,"to":3440.05,"location":2,"content":"distributions, similar contexts, windows, and so on."},{"from":3440.05,"to":3442.61,"location":2,"content":"And so word vectors of names are very similar."},{"from":3442.61,"to":3445.84,"location":2,"content":"And so in summarization errors, you realize, oh,"},{"from":3445.84,"to":3449.3,"location":2,"content":"well, you know, this article, news article talked about Jeremy being kidnapped."},{"from":3449.3,"to":3451.16,"location":2,"content":"But the summary said that Jason was kidnapped."},{"from":3451.16,"to":3453.65,"location":2,"content":"And you like, well, you know, in the evaluation metric"},{"from":3453.65,"to":3456.32,"location":2,"content":"that's just one word is off and like, all the rest is correct,"},{"from":3456.32,"to":3458,"location":2,"content":"but it's a pretty important word."},{"from":3458,"to":3460.97,"location":2,"content":"And so, word vectors have like issues"},{"from":3460.97,"to":3464.07,"location":2,"content":"for summarization that are pretty fundamental and I don't think,"},{"from":3464.07,"to":3466.84,"location":2,"content":"uh, anybody's tackling really well right now."},{"from":3466.84,"to":3468.88,"location":2,"content":"Uh, and so all of these metrics have issues."},{"from":3468.88,"to":3471.62,"location":2,"content":"I would argue though that combining the 10 actually"},{"from":3471.62,"to":3474.44,"location":2,"content":"makes it less problematic and more meaningful,"},{"from":3474.44,"to":3476.63,"location":2,"content":"than looking at each one separately."},{"from":3476.63,"to":3480.72,"location":2,"content":"Uh, because now you can't use the idiosyncrasies of"},{"from":3480.72,"to":3484.97,"location":2,"content":"one particular evaluation metric to just get like your score a little bit higher."},{"from":3484.97,"to":3489.74,"location":2,"content":"Um, because then, if you just tune with that particular thing in mind,"},{"from":3489.74,"to":3493.37,"location":2,"content":"it will hurt some of the other tasks and you won't get to the sort of general,"},{"from":3493.37,"to":3495.95,"location":2,"content":"uh, NLP model that much more easily."},{"from":3495.95,"to":3498.61,"location":2,"content":"All right. 
So now, let's do some analysis uh,"},{"from":3498.61,"to":3500.64,"location":2,"content":"of this model and, uh,"},{"from":3500.64,"to":3504.14,"location":2,"content":"look at, and this is the kinda thing that comes to one of the questions that was asked."},{"from":3504.14,"to":3508.3,"location":2,"content":"Uh, is this model able to kind of generate the right words for the right tasks?"},{"from":3508.3,"to":3511.78,"location":2,"content":"And here, we basically looked at the distributions of how often, uh,"},{"from":3511.78,"to":3517.1,"location":2,"content":"the model generated words in these differen- with these three different mechanisms,"},{"from":3517.1,"to":3520.37,"location":2,"content":"Softmax vocabulary, context pointers, or question pointers."},{"from":3520.37,"to":3522.51,"location":2,"content":"And, uh, as you can see,"},{"from":3522.51,"to":3525.5,"location":2,"content":"in the majority of cases it knows exactly how to generate."},{"from":3525.5,"to":3527.91,"location":2,"content":"So, uh, for, uh,"},{"from":3527.91,"to":3531.11,"location":2,"content":"question answering, and semantic role labeling,"},{"from":3531.11,"to":3535.36,"location":2,"content":"and SQuAD and WikiSQL and,"},{"from":3535.36,"to":3539.15,"location":2,"content":"um, summarization, it basically uses the context pointer."},{"from":3539.15,"to":3541.57,"location":2,"content":"So it just points into the context document."},{"from":3541.57,"to":3542.8,"location":2,"content":"And we know for SQuAD,"},{"from":3542.8,"to":3545.99,"location":2,"content":"that is basically [NOISE] how the data set was generated."},{"from":3545.99,"to":3548.6,"location":2,"content":"So that's the only thing that really makes a lot of sense."},{"from":3548.6,"to":3551.93,"location":2,"content":"Uh, what's kind of cool is that in some cases like summarization,"},{"from":3551.93,"to":3554.24,"location":2,"content":"it sometimes creates new words, you know,"},{"from":3554.24,"to":3557.33,"location":2,"content":"that weren't in the context document it was pointing to."},{"from":3557.33,"to":3559.91,"location":2,"content":"Uh, and for zero-shot relation extraction,"},{"from":3559.91,"to":3561.45,"location":2,"content":"also sometimes uses, uh,"},{"from":3561.45,"to":3564.05,"location":2,"content":"this external vocabulary and in some cases the context pointer."},{"from":3564.05,"to":3566.21,"location":2,"content":"So for the most part, uh,"},{"from":3566.21,"to":3571.97,"location":2,"content":"this model doesn't- is not confused how to execute on a task given, uh,"},{"from":3571.97,"to":3575.18,"location":2,"content":"this question formalism rather than, uh, the,"},{"from":3575.18,"to":3577.37,"location":2,"content":"uh, format of sort of this is the task,"},{"from":3577.37,"to":3581.2,"location":2,"content":"just do this particular task."},{"from":3581.2,"to":3584.03,"location":2,"content":"Now, um, you might argue,"},{"from":3584.03,"to":3585.83,"location":2,"content":"okay, I'm not that impressed by, you know,"},{"from":3585.83,"to":3588.5,"location":2,"content":"having the performance be roughly the same with one model versus"},{"from":3588.5,"to":3591.59,"location":2,"content":"10 separate models even though it's nice if you wanna deploy it right,"},{"from":3591.59,"to":3593.26,"location":2,"content":"like, uses less RAM and all of that,"},{"from":3593.26,"to":3594.97,"location":2,"content":"assuming they're the same size,"},{"from":3594.97,"to":3597.08,"location":2,"content":"uh, while, you know, one-tenth the 
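The three generation mechanisms being compared here, the softmax vocabulary, the context pointer, and the question pointer, are typically combined as a learned mixture over one shared output distribution. A minimal PyTorch sketch of that idea, with made-up tensor names; this illustrates the general pointer-mixture pattern, not the exact decoder from the paper.

```python
import torch
import torch.nn.functional as F

def output_distribution(vocab_logits, ctx_attn, q_attn, mix_logits,
                        ctx_token_ids, q_token_ids):
    """vocab_logits: (V,) scores over the softmax vocabulary.
    ctx_attn: (Lc,) attention over context tokens, sums to 1.
    q_attn: (Lq,) attention over question tokens, sums to 1.
    mix_logits: (3,) learned switch between the three mechanisms.
    ctx_token_ids / q_token_ids: (Lc,), (Lq,) long vocabulary ids."""
    gamma = F.softmax(mix_logits, dim=-1)           # mechanism weights
    p = gamma[0] * F.softmax(vocab_logits, dim=-1)
    # Scatter the pointer probabilities onto the same output space.
    p = p.index_add(0, ctx_token_ids, gamma[1] * ctx_attn)
    p = p.index_add(0, q_token_ids, gamma[2] * q_attn)
    return p  # (V,) distribution the decoder samples or argmaxes from
```

Logging `gamma` per decoding step recovers exactly the kind of per-task mechanism distribution being discussed here.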
size."},{"from":3597.08,"to":3600.71,"location":2,"content":"But what I'm excited about is more like the next couple of results."},{"from":3600.71,"to":3602.75,"location":2,"content":"And namely, sort of this transfer learning,"},{"from":3602.75,"to":3604.55,"location":2,"content":"domain adaptation, and zero-shot,"},{"from":3604.55,"to":3606.02,"location":2,"content":"uh, these kinds of capabilities."},{"from":3606.02,"to":3611.63,"location":2,"content":"So here, uh, we chose two data sets that weren't included in the original 10."},{"from":3611.63,"to":3617.8,"location":2,"content":"And we basically trained a pre-trained model on this versus a random model."},{"from":3617.8,"to":3620.51,"location":2,"content":"And, uh, randomly here again,"},{"from":3620.51,"to":3621.86,"location":2,"content":"they're the same architecture,"},{"from":3621.86,"to":3625.3,"location":2,"content":"and pre-trained means the entirety of the model was pre-trained."},{"from":3625.3,"to":3626.95,"location":2,"content":"All the, you know,"},{"from":3626.95,"to":3631.32,"location":2,"content":"encoders including the decoder in the Softmax and everything, uh,"},{"from":3631.32,"to":3636.14,"location":2,"content":"and to two other tasks where another IWSLT language pair namely,"},{"from":3636.14,"to":3637.68,"location":2,"content":"translating from English to Czech, uh,"},{"from":3637.68,"to":3640.88,"location":2,"content":"and named entity recognition tasks that you all know very well."},{"from":3640.88,"to":3643.46,"location":2,"content":"So basically what we found is that,"},{"from":3643.46,"to":3645.93,"location":2,"content":"uh, it converges much more quickly,"},{"from":3645.93,"to":3647.81,"location":2,"content":"uh, in the beginning, uh, and then,"},{"from":3647.81,"to":3651.2,"location":2,"content":"there's still a significant but not gigantic gap."},{"from":3651.2,"to":3655.59,"location":2,"content":"So this pre-training on these completely separate kinds of task had helped."},{"from":3655.59,"to":3658.74,"location":2,"content":"And, uh, I think that's,"},{"from":3658.74,"to":3660.36,"location":2,"content":"that's pretty exciting, um,"},{"from":3660.36,"to":3662.42,"location":2,"content":"especially sort of the quicker convergence, like,"},{"from":3662.42,"to":3664.16,"location":2,"content":"learning more quickly, uh,"},{"from":3664.16,"to":3666.31,"location":2,"content":"whatever new task you, you come up with,"},{"from":3666.31,"to":3669.01,"location":2,"content":"which also means in some cases you can get away with"},{"from":3669.01,"to":3671.95,"location":2,"content":"less training data on these new- on these new tasks."},{"from":3671.95,"to":3675.97,"location":2,"content":"Uh, now domain adaptation is kind of the simpler form of transfer learning,"},{"from":3675.97,"to":3679.28,"location":2,"content":"where you basically just have a different,"},{"from":3679.28,"to":3681.41,"location":2,"content":"uh, type of, uh,"},{"from":3681.41,"to":3683.06,"location":2,"content":"you know, distribution for your words."},{"from":3683.06,"to":3686.75,"location":2,"content":"Uh, we mentioned we have the Stanford Sentiment Treebank for sentiment analysis."},{"from":3686.75,"to":3689.78,"location":2,"content":"Uh, and then we analyze this on different,"},{"from":3689.78,"to":3691.61,"location":2,"content":"uh, sentiment data sets,"},{"from":3691.61,"to":3694.51,"location":2,"content":"namely Amazon product reviews and Yelp restaurant reviews,"},{"from":3694.51,"to":3696.61,"location":2,"content":"and out of the box 
without any training,"},{"from":3696.61,"to":3699.97,"location":2,"content":"the model just got 80% accuracy on both of those data sets."},{"from":3699.97,"to":3702.32,"location":2,"content":"Uh, and I think for practitioners,"},{"from":3702.32,"to":3705.14,"location":2,"content":"that is pretty exciting because you basically didn't have to train anything,"},{"from":3705.14,"to":3706.61,"location":2,"content":"it just kind of worked out of the box,"},{"from":3706.61,"to":3708.83,"location":2,"content":"download it from GitHub, and run it."},{"from":3708.83,"to":3711.62,"location":2,"content":"Uh, SNLI, that was slightly different."},{"from":3711.62,"to":3713.33,"location":2,"content":"It didn't quite work as well."},{"from":3713.33,"to":3715.28,"location":2,"content":"It's another natural language inference data set,"},{"from":3715.28,"to":3719.14,"location":2,"content":"but has very different- a very different distribution, different, uh,"},{"from":3719.14,"to":3721.04,"location":2,"content":"kinds of domains, uh, that,"},{"from":3721.04,"to":3723.29,"location":2,"content":"uh, these entailment questions are asked over."},{"from":3723.29,"to":3726.98,"location":2,"content":"Uh, and here, out of the box it achieved 62."},{"from":3726.98,"to":3730.2,"location":2,"content":"Uh, but then, uh, once you fine tuned it and"},{"from":3730.2,"to":3734.23,"location":2,"content":"similar to these experiments here continue to actually train on this data set,"},{"from":3734.23,"to":3737.68,"location":2,"content":"it quickly uh, converged to 87 which was"},{"from":3737.68,"to":3741.63,"location":2,"content":"still a two percent gain over a randomly initialized McCann model. Yeah."},{"from":3741.63,"to":3749.07,"location":2,"content":"In that experiment, did you evaluate how much less data you can get away with?"},{"from":3749.07,"to":3752.9,"location":2,"content":"Did we evaluate how much less data we can get away with? We didn't."},{"from":3752.9,"to":3755.51,"location":2,"content":"And in some ways, whenever you would run this experiment,"},{"from":3755.51,"to":3758,"location":2,"content":"you'd basically be like, you'd still not do as well."},{"from":3758,"to":3761.55,"location":2,"content":"Like, everything- all these models will still do better with more training data."},{"from":3761.55,"to":3763.64,"location":2,"content":"So you just kind of, it would be a fuzzy kind of say,"},{"from":3763.64,"to":3766.22,"location":2,"content":"like, cut- fuzzy sort of result, right?"},{"from":3766.22,"to":3768.14,"location":2,"content":"Where you say, well, with one-tenth we might get"},{"from":3768.14,"to":3770.89,"location":2,"content":"to 50 and the other model might get only to 40,"},{"from":3770.89,"to":3772.16,"location":2,"content":"doing something like that."},{"from":3772.16,"to":3774.83,"location":2,"content":"Um, we don't- I don't have those numbers."},{"from":3774.83,"to":3777.38,"location":2,"content":"It would be kind of actually also a neat, neat, uh,"},{"from":3777.38,"to":3779.75,"location":2,"content":"analysis to do. Yeah."},{"from":3779.75,"to":3786.84,"location":2,"content":"So if you wanted to like train on a new task [inaudible]."},{"from":3786.84,"to":3787.93,"location":2,"content":"Yeah."},{"from":3787.93,"to":3790.16,"location":2,"content":"[inaudible] ."},{"from":3790.16,"to":3793.11,"location":2,"content":"So, do we have the code to train a new task? 
Yes, we do."},{"from":3793.11,"to":3794.7,"location":2,"content":"Um, you can just, uh, edit,"},{"from":3794.7,"to":3796.8,"location":2,"content":"make it into this format using context."},{"from":3796.8,"to":3799.47,"location":2,"content":"Here's a question, simple like CSV type format,"},{"from":3799.47,"to":3804.16,"location":2,"content":"and then you add it and you can both like train the pre-trained model yourself."},{"from":3804.16,"to":3808.69,"location":2,"content":"You can download a pre-trained model and just add it. So I'll look it up, yeah."},{"from":3808.69,"to":3814.8,"location":2,"content":"Do you know how this compares to using other kinds of pre-trained representations like, say BERT?"},{"from":3814.8,"to":3817.33,"location":2,"content":"So, um, it's a great question."},{"from":3817.33,"to":3820.12,"location":2,"content":"So how does this compare to other pre-trained representations like BERT?"},{"from":3820.12,"to":3821.93,"location":2,"content":"So, in some ways,"},{"from":3821.93,"to":3824.2,"location":2,"content":"people say BERT is kind of this model that does everything,"},{"from":3824.2,"to":3826.69,"location":2,"content":"but when you actually read the paper, you realize, well,"},{"from":3826.69,"to":3829.93,"location":2,"content":"it's a separate model for these different tasks, right?"},{"from":3829.93,"to":3832.38,"location":2,"content":"If you wanted to have a classification task,"},{"from":3832.38,"to":3834.07,"location":2,"content":"you have a little token in the beginning,"},{"from":3834.07,"to":3835.33,"location":2,"content":"and you have a different top layer."},{"from":3835.33,"to":3837.4,"location":2,"content":"If you wanna do a sequence labeling task,"},{"from":3837.4,"to":3838.45,"location":2,"content":"you have a different top layer."},{"from":3838.45,"to":3840.4,"location":2,"content":"If you wanted to do a sequence extraction task,"},{"from":3840.4,"to":3841.76,"location":2,"content":"you have a different top layer."},{"from":3841.76,"to":3846.22,"location":2,"content":"So, BERT isn't actually a single model for all of these different tasks."},{"from":3846.22,"to":3848.41,"location":2,"content":"Ah, and then, on all the results,"},{"from":3848.41,"to":3851.8,"location":2,"content":"there's a lot of extra tuning for each of the data sets,"},{"from":3851.8,"to":3853.76,"location":2,"content":"and tasks, uh, that, you know,"},{"from":3853.76,"to":3856.03,"location":2,"content":"different learning rate for this task, uh,"},{"from":3856.03,"to":3859.12,"location":2,"content":"different size, or different sets of BERT, and so on."},{"from":3859.12,"to":3861.67,"location":2,"content":"So, we're also super excited, we're like maybe this is it,"},{"from":3861.67,"to":3863.59,"location":2,"content":"we'll just run everything on BERT,"},{"from":3863.59,"to":3865.18,"location":2,"content":"and then we looked into all the details,"},{"from":3865.18,"to":3866.92,"location":2,"content":"and there's so much excitement in the beginning."},{"from":3866.92,"to":3869.02,"location":2,"content":"And then the more we dug through the details,"},{"from":3869.02,"to":3871.8,"location":2,"content":"the less excited we became as this being like sort of the answer,"},{"from":3871.8,"to":3873.58,"location":2,"content":"because it is not a single model."},{"from":3873.58,"to":3876.88,"location":2,"content":"Uh, in some ways, it's probably better to- for pre-training."},{"from":3876.88,"to":3878.29,"location":2,"content":"So instead of 
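To make the add-your-own-task answer above concrete: every example becomes a (context, question, answer) triple, so converting, say, a sentiment dataset takes only a few lines. A sketch with a made-up CSV layout; the real repository defines its own exact format, so treat this only as the shape of the idea.

```python
import csv

def to_decanlp_format(rows, out_path):
    # `rows` is a hypothetical iterable of (text, label) pairs; each
    # becomes context + a fixed natural-language question, with the
    # label as the answer string.
    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["context", "question", "answer"])
        for text, label in rows:
            writer.writerow([text, "Is this review positive or negative?", label])

to_decanlp_format([("Great movie, loved it.", "positive")], "my_task.csv")
```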
CoVe,"},{"from":3878.29,"to":3881.14,"location":2,"content":"you can have kind of BERT at the very beginning,"},{"from":3881.14,"to":3883.45,"location":2,"content":"and my hunch is everything will get slightly better,"},{"from":3883.45,"to":3886.07,"location":2,"content":"but you still need to have, um,"},{"from":3886.07,"to":3892.12,"location":2,"content":"a lot of the- a lot of the other sort of modeling architecture on top of it."},{"from":3892.12,"to":3896.05,"location":2,"content":"Uh, and then the sad thing is to really get the state of the art results,"},{"from":3896.05,"to":3900.36,"location":2,"content":"there's a lot of very spec- task-specific tuning of those last top layers."},{"from":3900.36,"to":3904.53,"location":2,"content":"So, if you try to unify that task-specific tuning,"},{"from":3904.53,"to":3906.7,"location":2,"content":"you lose a lot of the good performance of BERT."},{"from":3906.7,"to":3910.49,"location":2,"content":"Um, so, unfortunately, it's not quite the sort of,"},{"from":3910.49,"to":3912.18,"location":2,"content":"\"Oh, just use BERT for it,"},{"from":3912.18,"to":3915.22,"location":2,"content":"and you'll just have state-of-the-art numbers and all the things.\""},{"from":3915.22,"to":3918.57,"location":2,"content":"Um, I could probably go like talk about it a lot more, but, uh,"},{"from":3918.57,"to":3921.3,"location":2,"content":"I think it still makes sense to think about, um,"},{"from":3921.3,"to":3923.07,"location":2,"content":"some of the ideas from BERT,"},{"from":3923.07,"to":3926.36,"location":2,"content":"like basically, add as one of the tasks language modeling."},{"from":3926.36,"to":3930.99,"location":2,"content":"That would be very likely the task that helps the most for all the other tasks,"},{"from":3930.99,"to":3933.48,"location":2,"content":"and we should include that, uh,"},{"from":3933.48,"to":3937.53,"location":2,"content":"it also would be nice to have a faster model right now."},{"from":3937.53,"to":3940.27,"location":2,"content":"Um, it's hard to do language modeling is very, very large,"},{"from":3940.27,"to":3941.74,"location":2,"content":"it benefits even more from,"},{"from":3941.74,"to":3943.84,"location":2,"content":"you know, billions and billions of words."},{"from":3943.84,"to":3945.67,"location":2,"content":"It's hard to train the McCann model,"},{"from":3945.67,"to":3948.94,"location":2,"content":"this current question answering model of the co-attention mechanism of the question"},{"from":3948.94,"to":3952.03,"location":2,"content":"with like an increasingly large context."},{"from":3952.03,"to":3954.97,"location":2,"content":"So you'd have to kind of split it also like BERT,"},{"from":3954.97,"to":3959.02,"location":2,"content":"works also reasonably well only for like at most I think 500 words or so,"},{"from":3959.02,"to":3962.05,"location":2,"content":"and if you wanted to do summarization you'd basically have to cut"},{"from":3962.05,"to":3966.49,"location":2,"content":"the original document to only 500 words, and then try to summarize it."},{"from":3966.49,"to":3969.82,"location":2,"content":"So, there are a lot of like devil in the details that they didn't have to figure out,"},{"from":3969.82,"to":3972.52,"location":2,"content":"because they said, \"Well, we'll just sort of just like word vectors,"},{"from":3972.52,"to":3976.42,"location":2,"content":"we can take them in, and then we do a lot of other stuff that is task-specific,"},{"from":3976.42,"to":3978.78,"location":2,"content":"um, with those- those word 
vectors,"},{"from":3978.78,"to":3980.35,"location":2,"content":"or with the BERT architecture.\""},{"from":3980.35,"to":3982.72,"location":2,"content":"I still- I don't want to- this BERT is obviously amazing,"},{"from":3982.72,"to":3985.12,"location":2,"content":"and we are looking into trying to use ideas from it."},{"from":3985.12,"to":3987.4,"location":2,"content":"But unfortunately, it wasn't just sort of a silver bullet to"},{"from":3987.4,"to":3993.36,"location":2,"content":"solve multi-task learning. Mm-hmm?"},{"from":3993.36,"to":3995.51,"location":2,"content":"Pre-training process to be considered, uh,"},{"from":3995.51,"to":4000.99,"location":2,"content":"prioritized sampling based off of how much fewer group, how much loss there is?"},{"from":4000.99,"to":4002.67,"location":2,"content":"Sorry, did we- say again?"},{"from":4002.67,"to":4006.39,"location":2,"content":"Would you consider prioritizing sampling [inaudible]?"},{"from":4006.39,"to":4008.37,"location":2,"content":"So, did we consider prioritizing the sampling?"},{"from":4008.37,"to":4011.76,"location":2,"content":"So in some ways with this pre-trained strategy here, um,"},{"from":4011.76,"to":4016.5,"location":2,"content":"that's kind of what we did by basically focusing on these really hard tasks."},{"from":4016.5,"to":4022.14,"location":2,"content":"And, uh, a lot of like the gap in the end was improved by really waiting for,"},{"from":4022.14,"to":4024.55,"location":2,"content":"like four of the tasks at the very end,"},{"from":4024.55,"to":4025.99,"location":2,"content":"uh, bef- unti- you know, uh,"},{"from":4025.99,"to":4028.56,"location":2,"content":"until after you're gone through, uh,"},{"from":4028.56,"to":4030.75,"location":2,"content":"sort of oversampling all of these,"},{"from":4030.75,"to":4031.8,"location":2,"content":"uh, really hard tasks."},{"from":4031.8,"to":4036.38,"location":2,"content":"In the last 10 minutes, uh, basically, uh,"},{"from":4036.38,"to":4038.4,"location":2,"content":"th- the most exciting thing, uh,"},{"from":4038.4,"to":4042.54,"location":2,"content":"for- for last though I think you could also do a lot more work in this direction."},{"from":4042.54,"to":4044.46,"location":2,"content":"Uh, I mentioned the sole question pointer"},{"from":4044.46,"to":4046.38,"location":2,"content":"and zero short learning in the beginning, and, uh,"},{"from":4046.38,"to":4049.97,"location":2,"content":"we basically just tried to play around with that a little bit, um,"},{"from":4049.97,"to":4052.18,"location":2,"content":"and found that in some cases,"},{"from":4052.18,"to":4055.08,"location":2,"content":"it actually kind of magically works."},{"from":4055.08,"to":4057.06,"location":2,"content":"Uh, so here, we tried, uh,"},{"from":4057.06,"to":4058.72,"location":2,"content":"a sentence John had a party,"},{"from":4058.72,"to":4060.86,"location":2,"content":"but no one came, and he was all alone."},{"from":4060.86,"to":4063.96,"location":2,"content":"And then we asked, \"Is this story sad, or happy?\""},{"from":4063.96,"to":4066.12,"location":2,"content":"And while the model could've, you know,"},{"from":4066.12,"to":4067.92,"location":2,"content":"generate some random German words,"},{"from":4067.92,"to":4069.57,"location":2,"content":"or some random SQL words,"},{"from":4069.57,"to":4071.24,"location":2,"content":"or it's just said whatever,"},{"from":4071.24,"to":4074.49,"location":2,"content":"it actually pointed to, of all the words,"},{"from":4074.49,"to":4076.44,"location":2,"content":"you 
could've pointed to in the context or the question that"},{"from":4076.44,"to":4078.82,"location":2,"content":"pointed to \"Sad\", which is pretty cool."},{"from":4078.82,"to":4081.75,"location":2,"content":"Like- and it's just one small sample,"},{"from":4081.75,"to":4083.58,"location":2,"content":"and, you know, you could do a lot more,"},{"from":4083.58,"to":4088.91,"location":2,"content":"you could try to come up with a very large zero-shot kind of classification data set,"},{"from":4088.91,"to":4090.3,"location":2,"content":"which is actually kind of hard too."},{"from":4090.3,"to":4092.55,"location":2,"content":"You have to be quite creative, it's not like you can just say, \"Oh,"},{"from":4092.55,"to":4093.75,"location":2,"content":"it would just take all these reviews,"},{"from":4093.75,"to":4095.7,"location":2,"content":"and label them as these, you know, positive negative."},{"from":4095.7,"to":4099.81,"location":2,"content":"Ah, but so, I think we- we need to do more work in that direction."},{"from":4099.81,"to":4103.23,"location":2,"content":"Somebody will hopefully create a zero-shot kind of task data set,"},{"from":4103.23,"to":4105.57,"location":2,"content":"that is not just zero-shot for, you know,"},{"from":4105.57,"to":4109.05,"location":2,"content":"kind of new distributions or something with completely different, uh, outputs."},{"from":4109.05,"to":4111.81,"location":2,"content":"Uh, but we- we tried a couple,"},{"from":4111.81,"to":4112.95,"location":2,"content":"and it doesn't always work, right."},{"from":4112.95,"to":4114.51,"location":2,"content":"You can be adversarial about it,"},{"from":4114.51,"to":4118.47,"location":2,"content":"you can make this basically looks most similar to,"},{"from":4118.47,"to":4120.51,"location":2,"content":"is the sentiment positive or negative?"},{"from":4120.51,"to":4122.81,"location":2,"content":"Uh, is this sen- is this sentence positive or negative?"},{"from":4122.81,"to":4125.95,"location":2,"content":"That was the formalism we had for sentiment analysis."},{"from":4125.95,"to":4127.66,"location":2,"content":"And so you could,"},{"from":4127.66,"to":4130.38,"location":2,"content":"if you make the question more and more different,"},{"from":4130.38,"to":4132,"location":2,"content":"eventually, it'll kinda get tripped up."},{"from":4132,"to":4135.02,"location":2,"content":"Ah, and it's clear that it's benefited, uh,"},{"from":4135.02,"to":4137.01,"location":2,"content":"from the word vectors,"},{"from":4137.01,"to":4139.02,"location":2,"content":"of sad being closer to negative,"},{"from":4139.02,"to":4141.36,"location":2,"content":"and then understanding sort of through all these,"},{"from":4141.36,"to":4143.72,"location":2,"content":"uh, correlations, and- and, uh,"},{"from":4143.72,"to":4148.92,"location":2,"content":"deep representations that there are other sort of sad words in this context,"},{"from":4148.92,"to":4150.12,"location":2,"content":"or- or whatever it is."},{"from":4150.12,"to":4152.37,"location":2,"content":"Uh, and so, it was able to point to this."},{"from":4152.37,"to":4154.74,"location":2,"content":"But you can be adversarial, it doesn't always work."},{"from":4154.74,"to":4156.78,"location":2,"content":"But even the fact that, uh,"},{"from":4156.78,"to":4160.34,"location":2,"content":"it was sort of zero-shot classification based on word vectors, uh,"},{"from":4160.34,"to":4162.15,"location":2,"content":"for new kinds of questions,"},{"from":4162.15,"to":4164.07,"location":2,"content":"uh, personally, it 
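The zero-shot trick being described works because classification is reduced to pointing: the candidate labels appear inside the question itself, so the question pointer's attention already scores them. A hedged sketch of that final scoring step, where `model.question_pointer` is a hypothetical call returning one probability per question token and the crude token matching is mine:

```python
def zero_shot_classify(model, context, question, labels):
    # E.g. context = "John had a party, but no one came, and he was
    # all alone.", question = "Is this story sad or happy?",
    # labels = ["sad", "happy"].
    q_tokens = question.lower().rstrip("?").split()
    probs = model.question_pointer(context, question)  # one prob per token
    scores = {lab: probs[q_tokens.index(lab)] for lab in labels}
    return max(scores, key=scores.get)
```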
was very exciting to me."},{"from":4164.07,"to":4166.17,"location":2,"content":"And we tried a couple of other things like,"},{"from":4166.17,"to":4168.61,"location":2,"content":"uh, Bryan gave a talk and nobody clapped."},{"from":4168.61,"to":4169.65,"location":2,"content":"Was Bryan happy, or sad?"},{"from":4169.65,"to":4170.67,"location":2,"content":"And it also got it right."},{"from":4170.67,"to":4173.3,"location":2,"content":"So, um, there are a couple- a couple of the,"},{"from":4173.3,"to":4176.19,"location":2,"content":"the examples where, where at least the happy or sad thing worked."},{"from":4176.19,"to":4179.3,"location":2,"content":"And then, uh, a couple of other sort of adjective questions that we,"},{"from":4179.3,"to":4180.78,"location":2,"content":"we tried but, um,"},{"from":4180.78,"to":4183.69,"location":2,"content":"what I'm- what I would be most excited about is eventually actually"},{"from":4183.69,"to":4187.76,"location":2,"content":"trying to have a zero-shot classification task,"},{"from":4187.76,"to":4189.68,"location":2,"content":"uh, that combines the different tasks too."},{"from":4189.68,"to":4192.54,"location":2,"content":"So, uh, unfortunately, there's no data set for that,"},{"from":4192.54,"to":4194.46,"location":2,"content":"so we didn't train it, so it doesn't happen with the model."},{"from":4194.46,"to":4197.73,"location":2,"content":"But in theory, if you ask what is the sum- you can summarize,"},{"from":4197.73,"to":4199.99,"location":2,"content":"and you can translate from English into German,"},{"from":4199.99,"to":4202.52,"location":2,"content":"why couldn't you ask the model for a German summary?"},{"from":4202.52,"to":4204.24,"location":2,"content":"And if that worked, eventually,"},{"from":4204.24,"to":4205.65,"location":2,"content":"that would be even more amazing,"},{"from":4205.65,"to":4207.39,"location":2,"content":"but it, it doesn't work right now,"},{"from":4207.39,"to":4209.19,"location":2,"content":"because we never ask it sort of for these"},{"from":4209.19,"to":4212.31,"location":2,"content":"compositional task- these compositional task questions."},{"from":4212.31,"to":4215.49,"location":2,"content":"But it is yet another interesting line of research that I think could spawn from this."},{"from":4215.49,"to":4216.68,"location":2,"content":"Uh, all right."},{"from":4216.68,"to":4219.15,"location":2,"content":"So, I hope I could show you that this sort of"},{"from":4219.15,"to":4224.13,"location":2,"content":"decaNLP framework is an interesting new benchmark for generalized NLP."},{"from":4224.13,"to":4227.16,"location":2,"content":"Uh, I do think it's a reasonably good framework"},{"from":4227.16,"to":4230.31,"location":2,"content":"for tackling a bunch of the really hard questions in the field."},{"from":4230.31,"to":4232.26,"location":2,"content":"Uh, more general language understanding,"},{"from":4232.26,"to":4233.55,"location":2,"content":"and question answering of course,"},{"from":4233.55,"to":4237.18,"location":2,"content":"uh, multitask learning, domain adaptation, uh,"},{"from":4237.18,"to":4239.79,"location":2,"content":"which we sort of analyzed a little bit with the sentiment,"},{"from":4239.79,"to":4241.81,"location":2,"content":"and SNLI versus MultiNLI,"},{"from":4241.81,"to":4244.71,"location":2,"content":"um, transfer learning, and then weight sharing."},{"from":4244.71,"to":4246.78,"location":2,"content":"I think it's clear, everybody loves weight sharing,"},{"from":4246.78,"to":4248.85,"location":2,"content":"you 
wanna share as many weights as possible."},{"from":4248.85,"to":4252.38,"location":2,"content":"Uh, word vector started at, uh, ELMo,"},{"from":4252.38,"to":4255.3,"location":2,"content":"CoVe, and now BERT basically share more and more,"},{"from":4255.3,"to":4256.55,"location":2,"content":"deeper and deeper layers."},{"from":4256.55,"to":4259.56,"location":2,"content":"It would be great if we can unify that last bit also, uh,"},{"from":4259.56,"to":4262.57,"location":2,"content":"and then share basically the entirety of the networks,"},{"from":4262.57,"to":4265.2,"location":2,"content":"and then eventually hopefully get to zero-shot learning."},{"from":4265.2,"to":4267.33,"location":2,"content":"Now, there's a bunch of related work."},{"from":4267.33,"to":4269.22,"location":2,"content":"The original paper has over 100,"},{"from":4269.22,"to":4271.73,"location":2,"content":"um, citations in it, uh, of,"},{"from":4271.73,"to":4273.52,"location":2,"content":"of, you know, papers to other,"},{"from":4273.52,"to":4276.4,"location":2,"content":"other, um, lines of, uh, work."},{"from":4276.4,"to":4278.49,"location":2,"content":"But, uh, this is actually zero- at least some of"},{"from":4278.49,"to":4281.67,"location":2,"content":"the models and papers that influenced us the most,"},{"from":4281.67,"to":4283.92,"location":2,"content":"uh, in, in our thinking and modelling."},{"from":4283.92,"to":4285.47,"location":2,"content":"Uh, one of them actually comes from,"},{"from":4285.47,"to":4287.55,"location":2,"content":"uh, the two instructors of the class."},{"from":4287.55,"to":4291.16,"location":2,"content":"And so, um, hopefully, uh, we can,"},{"from":4291.16,"to":4295.05,"location":2,"content":"you know, sort of think about what- what's next after all this architecture engineering."},{"from":4295.05,"to":4298.13,"location":2,"content":"And, uh, I think one potential answer to that, uh,"},{"from":4298.13,"to":4302.4,"location":2,"content":"is single multitask learning for more generalized NLP models."},{"from":4302.4,"to":4313.62,"location":2,"content":"[NOISE] All right. Thank you. 
[APPLAUSE]"}]} \ No newline at end of file diff --git a/bcc-en/18.bcc b/bcc-en/18.bcc new file mode 100644 index 0000000000000000000000000000000000000000..83dc57b2af5749e7e45797000719a1d2b5a837d3 --- /dev/null +++ b/bcc-en/18.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":5.11,"to":10.29,"location":2,"content":"okay hi everyone let's get"},{"from":10.29,"to":14.62,"location":2,"content":"okay so so for today's lecture what"},{"from":14.62,"to":17.26,"location":2,"content":"we're gonna do is look at the topic of"},{"from":17.26,"to":19.93,"location":2,"content":"having tree recursive neural networks I"},{"from":19.93,"to":23.29,"location":2,"content":"mean this is actually a topic which I"},{"from":23.29,"to":26.05,"location":2,"content":"feel especially fond of and attached to"},{"from":26.05,"to":28.84,"location":2,"content":"because actually when we started doing"},{"from":28.84,"to":31.41,"location":2,"content":"deep learning friend or P at Stanford in"},{"from":31.41,"to":34.65,"location":2,"content":"2010 really for the sort of period from"},{"from":34.65,"to":39.43,"location":2,"content":"2010 to 2015 the dominant set of ideas"},{"from":39.43,"to":41.8,"location":2,"content":"that we were working on was this topic"},{"from":41.8,"to":44.56,"location":2,"content":"of how you could build a recursive tree"},{"from":44.56,"to":47.35,"location":2,"content":"structure into neural networks so in a"},{"from":47.35,"to":49.09,"location":2,"content":"way it's kind of funny that I'm only"},{"from":49.09,"to":51.73,"location":2,"content":"getting to it now I mean there are sort"},{"from":51.73,"to":54.34,"location":2,"content":"of reasons for that that I think there"},{"from":54.34,"to":56.98,"location":2,"content":"are a bunch of interesting ideas here"},{"from":56.98,"to":59.53,"location":2,"content":"which relate closely to linguistic"},{"from":59.53,"to":61.87,"location":2,"content":"structure and so it's good stuff to have"},{"from":61.87,"to":65.77,"location":2,"content":"seen but in practice these ideas have"},{"from":65.77,"to":68.53,"location":2,"content":"proven kind of hard to scale and not"},{"from":68.53,"to":71.14,"location":2,"content":"necessarily to work better in practice"},{"from":71.14,"to":73.9,"location":2,"content":"than the kind of things that we've spent"},{"from":73.9,"to":76.75,"location":2,"content":"more time on meaning things like looking"},{"from":76.75,"to":78.97,"location":2,"content":"at LST M's and looking at transformers"},{"from":78.97,"to":81.49,"location":2,"content":"and things like that and so that's kind"},{"from":81.49,"to":84.1,"location":2,"content":"of why we sort of shunted them towards"},{"from":84.1,"to":86.41,"location":2,"content":"the end of the curriculum but I want to"},{"from":86.41,"to":88.12,"location":2,"content":"sort of say something about the"},{"from":88.12,"to":90.28,"location":2,"content":"motivations and the way you can build"},{"from":90.28,"to":92.11,"location":2,"content":"tree structures and neural networks and"},{"from":92.11,"to":94.6,"location":2,"content":"look at some of the possibilities we"},{"from":94.6,"to":99.31,"location":2,"content":"explored in during this class another"},{"from":99.31,"to":101.77,"location":2,"content":"fact about this class is actually this"},{"from":101.77,"to":105.1,"location":2,"content":"is the last class I'm going to give so"},{"from":105.1,"to":107.14,"location":2,"content":"two more classes in 
the next week don't"},{"from":107.14,"to":110.47,"location":2,"content":"forget about next week's cs2 24 in"},{"from":110.47,"to":115.18,"location":2,"content":"classes but on Tuesday we've gone in the"},{"from":115.18,"to":117.16,"location":2,"content":"final invited speaker make Mitchell"},{"from":117.16,"to":119.41,"location":2,"content":"who's a great speaker and has tons of"},{"from":119.41,"to":122.08,"location":2,"content":"interesting stuff to say about fairness"},{"from":122.08,"to":125.53,"location":2,"content":"and ethics in NOP nai and then for the"},{"from":125.53,"to":127.87,"location":2,"content":"final lectures one another my PhD"},{"from":127.87,"to":130,"location":2,"content":"students Kevin Clark is going to give"},{"from":130,"to":132.91,"location":2,"content":"that and talk about some of the recent"},{"from":132.91,"to":134.62,"location":2,"content":"what's been happening in deep learning"},{"from":134.62,"to":138.07,"location":2,"content":"in 2018-19 or some of the sort of recent"},{"from":138.07,"to":141.19,"location":2,"content":"developments in NLP and deep learning so"},{"from":141.19,"to":142.58,"location":2,"content":"as"},{"from":142.58,"to":144.29,"location":2,"content":"I'll say my farewells at the end of this"},{"from":144.29,"to":148.03,"location":2,"content":"one so hopefully everyone has submitted"},{"from":148.03,"to":152.78,"location":2,"content":"their a milestone for their final"},{"from":152.78,"to":154.52,"location":2,"content":"project if you haven't you should really"},{"from":154.52,"to":157.67,"location":2,"content":"be getting your milestone in you know"},{"from":157.67,"to":159.77,"location":2,"content":"it's inevitable that somewhere around"},{"from":159.77,"to":163.55,"location":2,"content":"here there start to be problems that"},{"from":163.55,"to":165.86,"location":2,"content":"people have a situation that nothing"},{"from":165.86,"to":167.81,"location":2,"content":"works and everything is too slow and you"},{"from":167.81,"to":173.09,"location":2,"content":"panic and this happens I wish you luck"},{"from":173.09,"to":175.31,"location":2,"content":"of course I mean what can you do about"},{"from":175.31,"to":178.01,"location":2,"content":"it I mean it can be really hard when you"},{"from":178.01,"to":180.29,"location":2,"content":"have things that don't work as to work"},{"from":180.29,"to":183.17,"location":2,"content":"out why they don't work and how to fix"},{"from":183.17,"to":186.26,"location":2,"content":"them I mean I think often the best thing"},{"from":186.26,"to":188.63,"location":2,"content":"to do is really to go back to something"},{"from":188.63,"to":191.81,"location":2,"content":"simple that you can get working and to"},{"from":191.81,"to":195.11,"location":2,"content":"work forward from there again it also"},{"from":195.11,"to":198.41,"location":2,"content":"really helps to have really small data"},{"from":198.41,"to":201.2,"location":2,"content":"sets I really recommend the strategy of"},{"from":201.2,"to":204.53,"location":2,"content":"sort of having a ten item or 20 item"},{"from":204.53,"to":207.11,"location":2,"content":"data set and checking that your model"},{"from":207.11,"to":209.45,"location":2,"content":"works perfectly over trains to 100%"},{"from":209.45,"to":212.12,"location":2,"content":"accuracy on that kind of data set saves"},{"from":212.12,"to":215.09,"location":2,"content":"you huge amounts of time and it's sort"},{"from":215.09,"to":217.07,"location":2,"content":"of after you've gone something 
simple"},{"from":217.07,"to":219.35,"location":2,"content":"working on a small amount of data that's"},{"from":219.35,"to":223.13,"location":2,"content":"the right time to serve then expand"},{"from":223.13,"to":224.84,"location":2,"content":"forward again"},{"from":224.84,"to":227.03,"location":2,"content":"you should definitely always make sure"},{"from":227.03,"to":228.89,"location":2,"content":"that you can completely over fit on your"},{"from":228.89,"to":231.56,"location":2,"content":"training data set that sort of and not"},{"from":231.56,"to":233.33,"location":2,"content":"quite a proof but as at least the first"},{"from":233.33,"to":235.4,"location":2,"content":"good requirement for your model being"},{"from":235.4,"to":240.2,"location":2,"content":"implemented properly you you know part"},{"from":240.2,"to":241.97,"location":2,"content":"of the trick of being a successful deep"},{"from":241.97,"to":244.49,"location":2,"content":"learning researcher is actually managing"},{"from":244.49,"to":246.56,"location":2,"content":"to get things done and not wasting a ton"},{"from":246.56,"to":248.78,"location":2,"content":"of time and so it definitely always"},{"from":248.78,"to":251.15,"location":2,"content":"helps just to be you know plotting as"},{"from":251.15,"to":253.13,"location":2,"content":"you go along your training and dev"},{"from":253.13,"to":254.99,"location":2,"content":"errors so that you can sort of tell if"},{"from":254.99,"to":256.94,"location":2,"content":"things are working or if things aren't"},{"from":256.94,"to":259.52,"location":2,"content":"working and you should abandon and start"},{"from":259.52,"to":261.26,"location":2,"content":"again with a new experiment that just"},{"from":261.26,"to":263.66,"location":2,"content":"things like that save you hours and get"},{"from":263.66,"to":266.72,"location":2,"content":"you more done and so then once things"},{"from":266.72,"to":268.25,"location":2,"content":"are working this sort of a whole bunch"},{"from":268.25,"to":269.93,"location":2,"content":"of things to make it work better"},{"from":269.93,"to":272.93,"location":2,"content":"there's regularization with l2 and drop"},{"from":272.93,"to":275.75,"location":2,"content":"out there's time to do hyper parameter"},{"from":275.75,"to":276.88,"location":2,"content":"search"},{"from":276.88,"to":279.98,"location":2,"content":"and you know often doing these things"},{"from":279.98,"to":282.23,"location":2,"content":"and make quite a lot of difference to"},{"from":282.23,"to":284.9,"location":2,"content":"what your final results are and so it's"},{"from":284.9,"to":286.79,"location":2,"content":"good to have time to do those things but"},{"from":286.79,"to":289.37,"location":2,"content":"qu you want to get things working first"},{"from":289.37,"to":292.67,"location":2,"content":"before you go on to that and sort of"},{"from":292.67,"to":294.86,"location":2,"content":"really encourage people to still stop by"},{"from":294.86,"to":296.48,"location":2,"content":"in office hours if you've got any"},{"from":296.48,"to":299.03,"location":2,"content":"problems and we'll try our best to help"},{"from":299.03,"to":301.34,"location":2,"content":"though here within the limitations of"},{"from":301.34,"to":303.41,"location":2,"content":"what we can do from just being hit cold"},{"from":303.41,"to":309.32,"location":2,"content":"with problems okay yeah so I wanted to"},{"from":309.32,"to":311.71,"location":2,"content":"sort of just say some general 
remarks"},{"from":311.71,"to":317.62,"location":2,"content":"about language and theories of language"},{"from":317.62,"to":320.6,"location":2,"content":"that in the context that motivate this"},{"from":320.6,"to":324.23,"location":2,"content":"tree recursive networks so this is an"},{"from":324.23,"to":326.45,"location":2,"content":"art installation at Carnegie Mellon"},{"from":326.45,"to":329.75,"location":2,"content":"University and as an NLP person I really"},{"from":329.75,"to":332.6,"location":2,"content":"love this art installation so we need"},{"from":332.6,"to":334.64,"location":2,"content":"better art installations around the"},{"from":334.64,"to":337.52,"location":2,"content":"Stanford School of Engineering so this"},{"from":337.52,"to":340.28,"location":2,"content":"is the bag of words art installation"},{"from":340.28,"to":342.29,"location":2,"content":"there's the bag with a lot of words in"},{"from":342.29,"to":344.48,"location":2,"content":"it and you see down here there are the"},{"from":344.48,"to":347.48,"location":2,"content":"stop words the dart and the us that have"},{"from":347.48,"to":350.12,"location":2,"content":"fallen out of the bag and a represented"},{"from":350.12,"to":351.91,"location":2,"content":"on the ground as the stop words"},{"from":351.91,"to":356.59,"location":2,"content":"beautiful artwork right so one of the"},{"from":356.59,"to":359.69,"location":2,"content":"interesting things that has been found"},{"from":359.69,"to":362.9,"location":2,"content":"about NLP models of language and I think"},{"from":362.9,"to":365.36,"location":2,"content":"this is even more true in the deep"},{"from":365.36,"to":366.74,"location":2,"content":"learning world than it used to be"},{"from":366.74,"to":369.8,"location":2,"content":"previously is boy you can do a lot with"},{"from":369.8,"to":372.11,"location":2,"content":"bag of words models right that you can"},{"from":372.11,"to":375.77,"location":2,"content":"just often get a lot of power by saying"},{"from":375.77,"to":378.2,"location":2,"content":"well let's get our new word vectors"},{"from":378.2,"to":380.39,"location":2,"content":"we're gonna average them or max pull"},{"from":380.39,"to":382.46,"location":2,"content":"them or something like this and do"},{"from":382.46,"to":384.44,"location":2,"content":"nothing more and that gives me a pretty"},{"from":384.44,"to":386.72,"location":2,"content":"good sentence representation or document"},{"from":386.72,"to":388.43,"location":2,"content":"representation that I could use in a"},{"from":388.43,"to":390.83,"location":2,"content":"classifier or something and sometimes"},{"from":390.83,"to":392.96,"location":2,"content":"you can do not much more than that and"},{"from":392.96,"to":394.43,"location":2,"content":"get even better so people have done"},{"from":394.43,"to":396.65,"location":2,"content":"things like deep averaging networks"},{"from":396.65,"to":399.2,"location":2,"content":"where you're taking the output of a bag"},{"from":399.2,"to":400.76,"location":2,"content":"of words model and sort of feeding it"},{"from":400.76,"to":401.87,"location":2,"content":"through a couple more layers and"},{"from":401.87,"to":405.73,"location":2,"content":"improving things so that is in complete"},{"from":405.73,"to":408.83,"location":2,"content":"distinction to what's been dominant in"},{"from":408.83,"to":409.62,"location":2,"content":"linguistic"},{"from":409.62,"to":412.13,"location":2,"content":"of looking at language structure 
that"},{"from":412.13,"to":415.74,"location":2,"content":"typically in linguistics the emphasis"},{"from":415.74,"to":418.83,"location":2,"content":"has been on identifying kind of huge"},{"from":418.83,"to":422.04,"location":2,"content":"amounts of structure of linguistics"},{"from":422.04,"to":424.53,"location":2,"content":"utterances through very complex"},{"from":424.53,"to":426.51,"location":2,"content":"formalisms I guess this is sort of a bit"},{"from":426.51,"to":428.94,"location":2,"content":"of a picture of a Chomsky in minimalism"},{"from":428.94,"to":431.82,"location":2,"content":"syntactic tree and the one at the top is"},{"from":431.82,"to":434.37,"location":2,"content":"a bit of a picture of had driven phrase"},{"from":434.37,"to":436.32,"location":2,"content":"structure grammar theory there was"},{"from":436.32,"to":440.04,"location":2,"content":"predominantly developed at Stanford in"},{"from":440.04,"to":444.18,"location":2,"content":"the in the 90s but so very complex data"},{"from":444.18,"to":446.31,"location":2,"content":"structures and articulator structures"},{"from":446.31,"to":449.28,"location":2,"content":"used to describe linguistics and there's"},{"from":449.28,"to":452.37,"location":2,"content":"a huge gap between these two things"},{"from":452.37,"to":456.18,"location":2,"content":"and you might think that you know surely"},{"from":456.18,"to":457.86,"location":2,"content":"there's some good points in the middle"},{"from":457.86,"to":460.2,"location":2,"content":"where we have a certain amount of"},{"from":460.2,"to":462.36,"location":2,"content":"structure and that's going to help us do"},{"from":462.36,"to":466.77,"location":2,"content":"what we want and so in particular that"},{"from":466.77,"to":468.87,"location":2,"content":"if we wanted to semantically interpret"},{"from":468.87,"to":471.42,"location":2,"content":"language it seems like we don't just"},{"from":471.42,"to":473.79,"location":2,"content":"want to have word vectors we want to"},{"from":473.79,"to":476.28,"location":2,"content":"have meanings of bigger phrases so"},{"from":476.28,"to":478.86,"location":2,"content":"here's the snowboarder at leaping over a"},{"from":478.86,"to":482.04,"location":2,"content":"mogul and a person on a snowboard jumps"},{"from":482.04,"to":484.38,"location":2,"content":"into the air and what we'd like to be"},{"from":484.38,"to":486.36,"location":2,"content":"able to say is that the snowboarder"},{"from":486.36,"to":489.09,"location":2,"content":"means basically the same thing as a"},{"from":489.09,"to":491.52,"location":2,"content":"person on a snowboard so we want to have"},{"from":491.52,"to":494.37,"location":2,"content":"these chunks of language which in"},{"from":494.37,"to":496.8,"location":2,"content":"linguistics of be constituents phrases"},{"from":496.8,"to":499.32,"location":2,"content":"and say that they have a meaning and"},{"from":499.32,"to":501.18,"location":2,"content":"we'd like to be able to compare their"},{"from":501.18,"to":504.06,"location":2,"content":"meaning now we've looked at at least one"},{"from":504.06,"to":506.01,"location":2,"content":"tool that allows us to have chunks of"},{"from":506.01,"to":507.63,"location":2,"content":"language right because we looked at"},{"from":507.63,"to":509.4,"location":2,"content":"convolutional neural networks where you"},{"from":509.4,"to":511.71,"location":2,"content":"could take three words and make a"},{"from":511.71,"to":513.54,"location":2,"content":"representation of the 
convolutional"},{"from":513.54,"to":516.15,"location":2,"content":"neural network but the fundamental"},{"from":516.15,"to":519.06,"location":2,"content":"difference is that in human languages"},{"from":519.06,"to":520.98,"location":2,"content":"you have these chunks that have meaning"},{"from":520.98,"to":523.56,"location":2,"content":"there are different sizes so we'd like"},{"from":523.56,"to":527.13,"location":2,"content":"to say the snowboarder is pretty much"},{"from":527.13,"to":529.08,"location":2,"content":"semantically equivalent to a person on a"},{"from":529.08,"to":532.17,"location":2,"content":"snowboard but the top one is two words"},{"from":532.17,"to":534.45,"location":2,"content":"long and the bottom one is five words"},{"from":534.45,"to":537,"location":2,"content":"long and so if we're going to be able to"},{"from":537,"to":540.9,"location":2,"content":"do that we somehow want to have these"},{"from":540.9,"to":542.98,"location":2,"content":"sort of constituent chunks"},{"from":542.98,"to":544.78,"location":2,"content":"and be able to work with and represent"},{"from":544.78,"to":547.54,"location":2,"content":"them in neural networks and that's sort"},{"from":547.54,"to":552.46,"location":2,"content":"of the central idea of what motivated"},{"from":552.46,"to":554.14,"location":2,"content":"some of the sort of tree structured"},{"from":554.14,"to":555.73,"location":2,"content":"neural networks are about to show you"},{"from":555.73,"to":559.03,"location":2,"content":"there's another related thing that you"},{"from":559.03,"to":561.28,"location":2,"content":"might want to think about is you know a"},{"from":561.28,"to":564.46,"location":2,"content":"person on a snowboard how do human"},{"from":564.46,"to":567.01,"location":2,"content":"beings manage to understand what that"},{"from":567.01,"to":569.59,"location":2,"content":"means and then a person on a snowboard"},{"from":569.59,"to":570.97,"location":2,"content":"jumps into the air"},{"from":570.97,"to":573.73,"location":2,"content":"how does people manage to understand"},{"from":573.73,"to":577.33,"location":2,"content":"what that means and it sort of seems"},{"from":577.33,"to":581.67,"location":2,"content":"like the only possible answer to this is"},{"from":581.67,"to":584.2,"location":2,"content":"what's normally referred to as the"},{"from":584.2,"to":586.81,"location":2,"content":"principle of compositionality that"},{"from":586.81,"to":589.3,"location":2,"content":"people know the word person they know"},{"from":589.3,"to":591.09,"location":2,"content":"the word on they know the words"},{"from":591.09,"to":593.44,"location":2,"content":"snowboard therefore they can work out"},{"from":593.44,"to":596.53,"location":2,"content":"what on a snowboard means and they can"},{"from":596.53,"to":598.3,"location":2,"content":"work out what person on a snowboard"},{"from":598.3,"to":601.39,"location":2,"content":"means by knowing the meanings of"},{"from":601.39,"to":603.1,"location":2,"content":"components and putting them together"},{"from":603.1,"to":607.24,"location":2,"content":"into bigger pieces there's a there's a"},{"from":607.24,"to":610.81,"location":2,"content":"famous applied mathematician"},{"from":610.81,"to":613.78,"location":2,"content":"statistician at Brown University study"},{"from":613.78,"to":616.21,"location":2,"content":"Minh and I guess the way he summarized"},{"from":616.21,"to":618.37,"location":2,"content":"this is either the principle 
of"},{"from":618.37,"to":621.82,"location":2,"content":"compositionality is true or God exists"},{"from":621.82,"to":625.51,"location":2,"content":"very shiva's where you take that as you"},{"from":625.51,"to":628.39,"location":2,"content":"as you want but you know I think what he"},{"from":628.39,"to":631.27,"location":2,"content":"meant was well you know you can just"},{"from":631.27,"to":633.28,"location":2,"content":"make these infinite number of infinitely"},{"from":633.28,"to":635.02,"location":2,"content":"long sentences and human beings"},{"from":635.02,"to":637.42,"location":2,"content":"understand them then it just has to be"},{"from":637.42,"to":640.03,"location":2,"content":"that people can know about words and"},{"from":640.03,"to":642.52,"location":2,"content":"waits to combine meanings and and make"},{"from":642.52,"to":644.23,"location":2,"content":"bigger meanings cause you know how else"},{"from":644.23,"to":646.39,"location":2,"content":"could it possibly work that people could"},{"from":646.39,"to":648.94,"location":2,"content":"understand sentences and so we want to"},{"from":648.94,"to":651.04,"location":2,"content":"be able to do that we want to be able to"},{"from":651.04,"to":653.32,"location":2,"content":"work out semantic compositions of"},{"from":653.32,"to":655.45,"location":2,"content":"smaller elements to work out the"},{"from":655.45,"to":658.09,"location":2,"content":"meanings of bigger pieces and that this"},{"from":658.09,"to":661,"location":2,"content":"obviously isn't only a linguistic thing"},{"from":661,"to":663.88,"location":2,"content":"compositionality appears in other places"},{"from":663.88,"to":666.57,"location":2,"content":"as well right so if you want to"},{"from":666.57,"to":669.16,"location":2,"content":"understand how some piece of machinery"},{"from":669.16,"to":672.01,"location":2,"content":"works what you kind of want to know is"},{"from":672.01,"to":674.47,"location":2,"content":"it has different sub components and if"},{"from":674.47,"to":676.62,"location":2,"content":"you can understand how the differ"},{"from":676.62,"to":678.48,"location":2,"content":"subcomponents work and how they're"},{"from":678.48,"to":681.24,"location":2,"content":"fitted together then you might have some"},{"from":681.24,"to":683.34,"location":2,"content":"understanding of how the whole scene"},{"from":683.34,"to":689.01,"location":2,"content":"works and compositionality seems to be"},{"from":689.01,"to":691.92,"location":2,"content":"worked at work in vision as well so here"},{"from":691.92,"to":694.47,"location":2,"content":"is a scene and again it seems like this"},{"from":694.47,"to":696.48,"location":2,"content":"scene has parts so there are little"},{"from":696.48,"to":698.94,"location":2,"content":"parts that go together right so there"},{"from":698.94,"to":700.62,"location":2,"content":"are people that go together into a crowd"},{"from":700.62,"to":703.11,"location":2,"content":"of people and there's a roofer and a"},{"from":703.11,"to":704.91,"location":2,"content":"second floor and another bit of roof and"},{"from":704.91,"to":707.58,"location":2,"content":"the first floor that go together into a"},{"from":707.58,"to":709.65,"location":2,"content":"picture of this church and so this is"},{"from":709.65,"to":712.02,"location":2,"content":"also kind of a compositional scene in"},{"from":712.02,"to":715.53,"location":2,"content":"which pieces go together so it sort of"},{"from":715.53,"to":717.42,"location":2,"content":"seems like certainly for 
language"},{"from":717.42,"to":719.91,"location":2,"content":"understanding and then really for a lot"},{"from":719.91,"to":721.74,"location":2,"content":"of the other things that we use for"},{"from":721.74,"to":724.08,"location":2,"content":"intelligence then we somehow need to be"},{"from":724.08,"to":726.87,"location":2,"content":"able to understand bigger things from"},{"from":726.87,"to":731.21,"location":2,"content":"knowing about smaller parts yeah so"},{"from":731.21,"to":733.56,"location":2,"content":"computational so the most fun I"},{"from":733.56,"to":735.63,"location":2,"content":"mentioned this earlier sometime the most"},{"from":735.63,"to":739.11,"location":2,"content":"famous linguist is Noam Chomsky at MIT"},{"from":739.11,"to":743.64,"location":2,"content":"and you know really computational"},{"from":743.64,"to":745.86,"location":2,"content":"linguists a lot of the time haven't been"},{"from":745.86,"to":749.16,"location":2,"content":"that friendly to linguistics linguists"},{"from":749.16,"to":751.97,"location":2,"content":"and in particular some of Noam Chomsky's"},{"from":751.97,"to":755.19,"location":2,"content":"theories of language because really he's"},{"from":755.19,"to":758.33,"location":2,"content":"never been sympathetic to the idea of"},{"from":758.33,"to":760.83,"location":2,"content":"machine learning or in general just sort"},{"from":760.83,"to":763.26,"location":2,"content":"of the empirical ability to learn from"},{"from":763.26,"to":766.1,"location":2,"content":"data he's sort of has always been"},{"from":766.1,"to":768.75,"location":2,"content":"wanting to refuse that that exists but"},{"from":768.75,"to":771.03,"location":2,"content":"if we nevertheless look for a little bit"},{"from":771.03,"to":774.72,"location":2,"content":"of insight on that you know this is a"},{"from":774.72,"to":777.15,"location":2,"content":"recent paper of Chomsky's with authors"},{"from":777.15,"to":779.25,"location":2,"content":"and that they're sort of trying to give"},{"from":779.25,"to":782.28,"location":2,"content":"a version of what is unique about human"},{"from":782.28,"to":785.76,"location":2,"content":"language and essentially what they zero"},{"from":785.76,"to":788.73,"location":2,"content":"in on is that well if you're sort of"},{"from":788.73,"to":791.16,"location":2,"content":"looking at you know humans versus other"},{"from":791.16,"to":793.89,"location":2,"content":"fairly intelligent creatures they"},{"from":793.89,"to":795.93,"location":2,"content":"suggest that the defining difference of"},{"from":795.93,"to":799.02,"location":2,"content":"human beings is that they have this"},{"from":799.02,"to":803.13,"location":2,"content":"ability to model recursion and so the"},{"from":803.13,"to":805.94,"location":2,"content":"this paper argues that the the singular"},{"from":805.94,"to":808.08,"location":2,"content":"distinction that allowed language to"},{"from":808.08,"to":810.09,"location":2,"content":"develop in human beings"},{"from":810.09,"to":812.07,"location":2,"content":"that we could put together smaller parts"},{"from":812.07,"to":814.38,"location":2,"content":"to make bigger things in a recursive"},{"from":814.38,"to":816.45,"location":2,"content":"process and that that was the sort of"},{"from":816.45,"to":820.02,"location":2,"content":"defining new ability not sure not sure I"},{"from":820.02,"to":823.14,"location":2,"content":"believe that or not you can decide what"},{"from":823.14,"to":827.4,"location":2,"content":"you think but what I think is 
certainly"},{"from":827.4,"to":830,"location":2,"content":"the case is if there's just"},{"from":830,"to":832.26,"location":2,"content":"incontrovertible that the structure of"},{"from":832.26,"to":835.38,"location":2,"content":"human language sentences have these"},{"from":835.38,"to":839.64,"location":2,"content":"pieces of constituents that then form"},{"from":839.64,"to":841.83,"location":2,"content":"together hierarchically or recursively"},{"from":841.83,"to":844.86,"location":2,"content":"into bigger pieces as you go up in the"},{"from":844.86,"to":846.96,"location":2,"content":"tree and then particularly you get this"},{"from":846.96,"to":849.81,"location":2,"content":"recursion where you get a little noun"},{"from":849.81,"to":852.72,"location":2,"content":"phrase meat which then appears in a"},{"from":852.72,"to":854.67,"location":2,"content":"bigger noun phrase like spaghetti with"},{"from":854.67,"to":856.74,"location":2,"content":"meat and you can repeat that several"},{"from":856.74,"to":859.23,"location":2,"content":"times giving you a curse of structure"},{"from":859.23,"to":861.93,"location":2,"content":"and I have an example of that in blue up"},{"from":861.93,"to":864.57,"location":2,"content":"the top so the person standing next to"},{"from":864.57,"to":866.16,"location":2,"content":"the man from the company that purchased"},{"from":866.16,"to":868.62,"location":2,"content":"the firm that's used to work at that"},{"from":868.62,"to":873.51,"location":2,"content":"whole thing is big noun phrase but"},{"from":873.51,"to":876.42,"location":2,"content":"inside that there's a noun phrase the"},{"from":876.42,"to":878.07,"location":2,"content":"man from the company that purchased the"},{"from":878.07,"to":880.2,"location":2,"content":"firm that used to work ad which is"},{"from":880.2,"to":882.54,"location":2,"content":"another big noun phrase and well inside"},{"from":882.54,"to":887.19,"location":2,"content":"that there are smaller noun phrase like"},{"from":887.19,"to":888.84,"location":2,"content":"the company that purchased the firm used"},{"from":888.84,"to":891,"location":2,"content":"to work at but you know it's still got"},{"from":891,"to":893.52,"location":2,"content":"inside that noun phrases like the firm"},{"from":893.52,"to":895.68,"location":2,"content":"that you used to work at and actually"},{"from":895.68,"to":898.23,"location":2,"content":"even that's gotten inside at a smaller"},{"from":898.23,"to":901.59,"location":2,"content":"noun phrase which is just the word you"},{"from":901.59,"to":905.49,"location":2,"content":"so a individual pronoun is also a noun"},{"from":905.49,"to":910.61,"location":2,"content":"phrase so this kind of structuring of"},{"from":910.61,"to":913.56,"location":2,"content":"language where you get this sort of"},{"from":913.56,"to":915.72,"location":2,"content":"hierarchical structure at the same kind"},{"from":915.72,"to":917.4,"location":2,"content":"of things inside them I think that's"},{"from":917.4,"to":921.39,"location":2,"content":"just sort of totally totally correct the"},{"from":921.39,"to":923.55,"location":2,"content":"the claim then that you know our"},{"from":923.55,"to":927.18,"location":2,"content":"language is recursive I mean in a formal"},{"from":927.18,"to":929.7,"location":2,"content":"sense it's not quite clear that that's a"},{"from":929.7,"to":934.35,"location":2,"content":"it's a clear thing and that's the reason"},{"from":934.35,"to":937.23,"location":2,"content":"to say something this recursive it 
has"},{"from":937.23,"to":938.76,"location":2,"content":"to repeat out to infinity"},{"from":938.76,"to":942.24,"location":2,"content":"right so as soon as you put any bound on"},{"from":942.24,"to":943.59,"location":2,"content":"something and you"},{"from":943.59,"to":945.87,"location":2,"content":"say look that's it now I'm afraid you"},{"from":945.87,"to":948.54,"location":2,"content":"just gave me with five levels of nesting"},{"from":948.54,"to":950.94,"location":2,"content":"that's pretty implausible that someone"},{"from":950.94,"to":953.58,"location":2,"content":"is going to say that and so as soon as"},{"from":953.58,"to":955.83,"location":2,"content":"you sort of want to make an argument"},{"from":955.83,"to":958.32,"location":2,"content":"like okay even if they said that no one"},{"from":958.32,"to":959.85,"location":2,"content":"is going to say a noun phrase with ten"},{"from":959.85,"to":961.89,"location":2,"content":"levels of nesting and if you put some"},{"from":961.89,"to":965.28,"location":2,"content":"hard limit on it like that then in some"},{"from":965.28,"to":967.05,"location":2,"content":"sense it's not truly recursive because"},{"from":967.05,"to":970.08,"location":2,"content":"it doesn't go out to infinity but you"},{"from":970.08,"to":971.67,"location":2,"content":"know regardless what you think about"},{"from":971.67,"to":973.92,"location":2,"content":"that that doesn't negate the basic"},{"from":973.92,"to":975.9,"location":2,"content":"argument that you get this hierarchical"},{"from":975.9,"to":978.33,"location":2,"content":"structuring with the same kinds of"},{"from":978.33,"to":980.64,"location":2,"content":"things like noun phrases sentences verb"},{"from":980.64,"to":983.91,"location":2,"content":"phrases appearing inside each other in a"},{"from":983.91,"to":987.63,"location":2,"content":"way that has no clear bound the extent"},{"from":987.63,"to":990.42,"location":2,"content":"that I show you a complex sentence you"},{"from":990.42,"to":992.31,"location":2,"content":"can say I can make them an even bigger"},{"from":992.31,"to":994.83,"location":2,"content":"more complex sentence by putting it"},{"from":994.83,"to":997.38,"location":2,"content":"inside you said to me that and then"},{"from":997.38,"to":1000.68,"location":2,"content":"saying my sentence right so that's the"},{"from":1000.68,"to":1002.63,"location":2,"content":"sense in which it does appear to be a"},{"from":1002.63,"to":1005.3,"location":2,"content":"recursive generative process even though"},{"from":1005.3,"to":1008.21,"location":2,"content":"practically there are limits to how"},{"from":1008.21,"to":1011,"location":2,"content":"complex sentences people say and so"},{"from":1011,"to":1013.37,"location":2,"content":"that's the kind of structure that gets"},{"from":1013.37,"to":1017,"location":2,"content":"captured in these constituency structure"},{"from":1017,"to":1020.09,"location":2,"content":"trees so before the early time when we"},{"from":1020.09,"to":1021.98,"location":2,"content":"talked about parsing and you guys did"},{"from":1021.98,"to":1024.29,"location":2,"content":"some of it I emphasized dependency"},{"from":1024.29,"to":1027.32,"location":2,"content":"parsing but the other kind of parsing"},{"from":1027.32,"to":1029.21,"location":2,"content":"which is actually the kind that the"},{"from":1029.21,"to":1030.83,"location":2,"content":"models I'm going to talk about today I"},{"from":1030.83,"to":1034.25,"location":2,"content":"was using was this idea of what's 
often"},{"from":1034.25,"to":1037.22,"location":2,"content":"called constituency parsing or linguists"},{"from":1037.22,"to":1038.69,"location":2,"content":"often call at phrase structure grammar"},{"from":1038.69,"to":1043.52,"location":2,"content":"x' or in sort of computer science formal"},{"from":1043.52,"to":1045.65,"location":2,"content":"language theory these are context-free"},{"from":1045.65,"to":1050.09,"location":2,"content":"grammars where we are having these non"},{"from":1050.09,"to":1052.43,"location":2,"content":"terminals like noun phrase and verb"},{"from":1052.43,"to":1054.41,"location":2,"content":"phrase and that's inside anon the noun"},{"from":1054.41,"to":1056.24,"location":2,"content":"phrase that's inside another verb phrase"},{"from":1056.24,"to":1058.37,"location":2,"content":"which is inside more verb phrases"},{"from":1058.37,"to":1061.7,"location":2,"content":"heading up the sentence and so these are"},{"from":1061.7,"to":1064.85,"location":2,"content":"our constituency grammars and when we"},{"from":1064.85,"to":1066.92,"location":2,"content":"occasionally mentioned the pin treebank"},{"from":1066.92,"to":1069.41,"location":2,"content":"tree this was kind of an original pin"},{"from":1069.41,"to":1072.86,"location":2,"content":"treebank tree which is basically for a"},{"from":1072.86,"to":1074.63,"location":2,"content":"structure grammar like this with sort of"},{"from":1074.63,"to":1076.89,"location":2,"content":"various extra annotations"},{"from":1076.89,"to":1081.21,"location":2,"content":"put on the nodes okay so what it's seen"},{"from":1081.21,"to":1084.06,"location":2,"content":"what what you to capture some of these"},{"from":1084.06,"to":1085.98,"location":2,"content":"properties it seems like we'd like to"},{"from":1085.98,"to":1088.71,"location":2,"content":"have a neural model that can make use of"},{"from":1088.71,"to":1090.78,"location":2,"content":"some of this same kind of tree structure"},{"from":1090.78,"to":1094.41,"location":2,"content":"and so what we'd like to do for working"},{"from":1094.41,"to":1097.44,"location":2,"content":"out semantic similarity of constituents"},{"from":1097.44,"to":1100.77,"location":2,"content":"is we want to not only have a word"},{"from":1100.77,"to":1102.9,"location":2,"content":"vector space like we start off with"},{"from":1102.9,"to":1104.79,"location":2,"content":"right at the beginning of the quarter"},{"from":1104.79,"to":1107.72,"location":2,"content":"but we'd like to be able to take bigger"},{"from":1107.72,"to":1110.13,"location":2,"content":"constituents like noun phrases the"},{"from":1110.13,"to":1112.35,"location":2,"content":"country of my birth and the place where"},{"from":1112.35,"to":1115.17,"location":2,"content":"I was born and also give them a meaning"},{"from":1115.17,"to":1117.63,"location":2,"content":"and so it seems like what we'd like to"},{"from":1117.63,"to":1120.09,"location":2,"content":"do is have a method of computing the"},{"from":1120.09,"to":1123.48,"location":2,"content":"meaning of any phrase in a compositional"},{"from":1123.48,"to":1126.3,"location":2,"content":"manner such that the end result is also"},{"from":1126.3,"to":1129.92,"location":2,"content":"that these phrases could be stuck inside"},{"from":1129.92,"to":1132.99,"location":2,"content":"our vector space models so we're still"},{"from":1132.99,"to":1134.37,"location":2,"content":"going to stick with a vector space"},{"from":1134.37,"to":1136.8,"location":2,"content":"semantics of phrases and we want 
to"},{"from":1136.8,"to":1140.1,"location":2,"content":"compute the meanings of phrases and so"},{"from":1140.1,"to":1142.86,"location":2,"content":"then the question is how could we go"},{"from":1142.86,"to":1146.01,"location":2,"content":"about doing that and we'll answer number"},{"from":1146.01,"to":1147.84,"location":2,"content":"one is we kind of use the principle of"},{"from":1147.84,"to":1150.72,"location":2,"content":"compositionality since we're sure it's"},{"from":1150.72,"to":1152.82,"location":2,"content":"right and so well what the principle of"},{"from":1152.82,"to":1156.03,"location":2,"content":"compositionality essentially says if you"},{"from":1156.03,"to":1158.79,"location":2,"content":"want to work out the meaning well here"},{"from":1158.79,"to":1160.92,"location":2,"content":"it says of a sentence but the meaning of"},{"from":1160.92,"to":1163.89,"location":2,"content":"any phrase any constituent is you're"},{"from":1163.89,"to":1166.2,"location":2,"content":"going to build it by knowing the"},{"from":1166.2,"to":1169.62,"location":2,"content":"meanings of its words and then having"},{"from":1169.62,"to":1171.72,"location":2,"content":"rules that combine these meanings so"},{"from":1171.72,"to":1173.52,"location":2,"content":"starting off with the country of my"},{"from":1173.52,"to":1175.62,"location":2,"content":"birth I should be able to calculate a"},{"from":1175.62,"to":1178.02,"location":2,"content":"meaning of my birth and meaning of the"},{"from":1178.02,"to":1181.23,"location":2,"content":"country a meaning of of the my birth and"},{"from":1181.23,"to":1183.06,"location":2,"content":"then a meaning of the country of my"},{"from":1183.06,"to":1185.46,"location":2,"content":"birth so we'd have meaning composition"},{"from":1185.46,"to":1187.29,"location":2,"content":"rules which will let us calculate"},{"from":1187.29,"to":1190.89,"location":2,"content":"meanings upwards for larger constituents"},{"from":1190.89,"to":1195.87,"location":2,"content":"or sentences so that seems kind of the"},{"from":1195.87,"to":1197.85,"location":2,"content":"right thing to do and so then the"},{"from":1197.85,"to":1201.95,"location":2,"content":"question is well can we then build a"},{"from":1201.95,"to":1204.24,"location":2,"content":"model of how to do that"},{"from":1204.24,"to":1205.77,"location":2,"content":"well here's sort of a straightforward"},{"from":1205.77,"to":1210.27,"location":2,"content":"way of doing this okay so we"},{"from":1210.27,"to":1214.68,"location":2,"content":"we have word vectors for the words that"},{"from":1214.68,"to":1216.99,"location":2,"content":"we've calculated and what we'd like to"},{"from":1216.99,"to":1221.55,"location":2,"content":"do is work out then a meaning"},{"from":1221.55,"to":1223.89,"location":2,"content":"representation of this sentence and at"},{"from":1223.89,"to":1226.02,"location":2,"content":"this point we sort of have two things to"},{"from":1226.02,"to":1229.2,"location":2,"content":"do we have parsing to do where here"},{"from":1229.2,"to":1230.7,"location":2,"content":"what's the right structure of the"},{"from":1230.7,"to":1233.13,"location":2,"content":"sentence and then we have meaning"},{"from":1233.13,"to":1236.76,"location":2,"content":"computation to do of working out what is"},{"from":1236.76,"to":1238.47,"location":2,"content":"the meaning representation of this"},{"from":1238.47,"to":1242.19,"location":2,"content":"sentence so for parsing we'd sort of 
be"},{"from":1242.19,"to":1243.93,"location":2,"content":"building sort of noun phrase"},{"from":1243.93,"to":1245.76,"location":2,"content":"prepositional phrase verb phrase"},{"from":1245.76,"to":1248.7,"location":2,"content":"sentence kind of units to get the cat"},{"from":1248.7,"to":1251.73,"location":2,"content":"sat on the mat and then we'll what we if"},{"from":1251.73,"to":1254.79,"location":2,"content":"we had that we could then run some kind"},{"from":1254.79,"to":1257.49,"location":2,"content":"of meaning computation program and give"},{"from":1257.49,"to":1259.98,"location":2,"content":"us sort of a vector space meaning of"},{"from":1259.98,"to":1262.44,"location":2,"content":"these sentences so that's kind of what"},{"from":1262.44,"to":1264.81,"location":2,"content":"we want is to do both of those and in a"},{"from":1264.81,"to":1266.88,"location":2,"content":"little bit I'll show you an example of"},{"from":1266.88,"to":1269.13,"location":2,"content":"the kind of one way that you go about"},{"from":1269.13,"to":1271.5,"location":2,"content":"approaching that but before I do that"},{"from":1271.5,"to":1273.51,"location":2,"content":"just sort of stepping back for a moment"},{"from":1273.51,"to":1276.12,"location":2,"content":"as to what's different here write that"},{"from":1276.12,"to":1280.05,"location":2,"content":"here we had our recurrent neural network"},{"from":1280.05,"to":1281.79,"location":2,"content":"which in some sense has been our"},{"from":1281.79,"to":1284.64,"location":2,"content":"workhorse tool in this class up to now"},{"from":1284.64,"to":1287,"location":2,"content":"and it gives you it gives you a"},{"from":1287,"to":1288.9,"location":2,"content":"representation of the meaning of the"},{"from":1288.9,"to":1290.88,"location":2,"content":"country of my birth sort of you could"},{"from":1290.88,"to":1293.34,"location":2,"content":"either say that's the meaning of the"},{"from":1293.34,"to":1295.35,"location":2,"content":"country of my birth or we talked about"},{"from":1295.35,"to":1298.2,"location":2,"content":"other tricks like doing max pooling"},{"from":1298.2,"to":1300.75,"location":2,"content":"across all of these or you could have a"},{"from":1300.75,"to":1303.27,"location":2,"content":"separate node out here which sort of"},{"from":1303.27,"to":1304.95,"location":2,"content":"does attention over the ease so it does"},{"from":1304.95,"to":1309.48,"location":2,"content":"give you a sort of representation of the"},{"from":1309.48,"to":1313.38,"location":2,"content":"meaning of this of any subsequence of"},{"from":1313.38,"to":1316.98,"location":2,"content":"words as well but they sort of different"},{"from":1316.98,"to":1319.62,"location":2,"content":"right that this what the top the tree"},{"from":1319.62,"to":1323.01,"location":2,"content":"recursive neural network it requires a"},{"from":1323.01,"to":1326.67,"location":2,"content":"sentence or any kind of phrase to have a"},{"from":1326.67,"to":1328.68,"location":2,"content":"tree structure so we know what its"},{"from":1328.68,"to":1331.47,"location":2,"content":"component parts are but then we're"},{"from":1331.47,"to":1335.43,"location":2,"content":"working out meaning representations for"},{"from":1335.43,"to":1339.15,"location":2,"content":"the phrase that are sensitive to what"},{"from":1339.15,"to":1341.1,"location":2,"content":"its syntactic structure is that how the"},{"from":1341.1,"to":1344.2,"location":2,"content":"words go together to build 
phrases"},{"from":1344.2,"to":1346.69,"location":2,"content":"whereas for there were current neural"},{"from":1346.69,"to":1349.24,"location":2,"content":"network we're just in an oblivious way"},{"from":1349.24,"to":1351.49,"location":2,"content":"running a sequence model along and"},{"from":1351.49,"to":1354.97,"location":2,"content":"saying compute things and in the obvious"},{"from":1354.97,"to":1356.89,"location":2,"content":"it doesn't in any obvious way give a"},{"from":1356.89,"to":1359.86,"location":2,"content":"meaning representation of my birth or my"},{"from":1359.86,"to":1362.53,"location":2,"content":"birth contained inside it we sort of"},{"from":1362.53,"to":1364.87,"location":2,"content":"only have a meaning representation for"},{"from":1364.87,"to":1366.79,"location":2,"content":"the whole sequence whereas if we're"},{"from":1366.79,"to":1369.81,"location":2,"content":"doing things this way we do have meaning"},{"from":1369.81,"to":1371.83,"location":2,"content":"representations for the different"},{"from":1371.83,"to":1375.85,"location":2,"content":"meaningful parts of the sentence okay"},{"from":1375.85,"to":1377.32,"location":2,"content":"that makes sense of what we're trying to"},{"from":1377.32,"to":1381.91,"location":2,"content":"do okay so how could we do it go about"},{"from":1381.91,"to":1386.71,"location":2,"content":"doing that well the idea of how we could"},{"from":1386.71,"to":1389.02,"location":2,"content":"go about doing that is if we work"},{"from":1389.02,"to":1391.86,"location":2,"content":"bottom-up at the very bottom we have"},{"from":1391.86,"to":1395.68,"location":2,"content":"word vectors and so we want to"},{"from":1395.68,"to":1398.26,"location":2,"content":"recursively compute the meaning of"},{"from":1398.26,"to":1400.81,"location":2,"content":"bigger constituents so if we wanted to"},{"from":1400.81,"to":1403.3,"location":2,"content":"compute the meaning of on the mat what"},{"from":1403.3,"to":1406.42,"location":2,"content":"we can do is say well we have already"},{"from":1406.42,"to":1409.36,"location":2,"content":"have a meaning representation of on and"},{"from":1409.36,"to":1412.09,"location":2,"content":"mat so if we could feed those into a"},{"from":1412.09,"to":1414.49,"location":2,"content":"neural network because that's one tool"},{"from":1414.49,"to":1417.07,"location":2,"content":"we could maybe get out of it two things"},{"from":1417.07,"to":1421.51,"location":2,"content":"we could get out of it a goodness score"},{"from":1421.51,"to":1423.49,"location":2,"content":"so this is what we're going to use for"},{"from":1423.49,"to":1425.89,"location":2,"content":"parsing we're going to say do you"},{"from":1425.89,"to":1428.29,"location":2,"content":"believe do you believe you can put"},{"from":1428.29,"to":1431.86,"location":2,"content":"together on and the mat to form a good"},{"from":1431.86,"to":1434.32,"location":2,"content":"constituent that's part of a parse tree"},{"from":1434.32,"to":1437.02,"location":2,"content":"and this will be a big positive number"},{"from":1437.02,"to":1438.61,"location":2,"content":"if the answer is true and negative if"},{"from":1438.61,"to":1441.1,"location":2,"content":"that's not true and then we have a"},{"from":1441.1,"to":1443.77,"location":2,"content":"meaning composition device which says"},{"from":1443.77,"to":1446.68,"location":2,"content":"okay if you put together these two"},{"from":1446.68,"to":1448.44,"location":2,"content":"things what would be the 
meaning"},{"from":1448.44,"to":1451.59,"location":2,"content":"representation of what we put together"},{"from":1451.59,"to":1454.81,"location":2,"content":"and so this is the first model that we"},{"from":1454.81,"to":1457.09,"location":2,"content":"explored which was doing this in a"},{"from":1457.09,"to":1460.57,"location":2,"content":"pretty simple way right so here was our"},{"from":1460.57,"to":1464.1,"location":2,"content":"meaning composition device that we"},{"from":1464.1,"to":1466.18,"location":2,"content":"concatenated the two vectors of the"},{"from":1466.18,"to":1469.39,"location":2,"content":"constituents we multiply them by matrix"},{"from":1469.39,"to":1472.12,"location":2,"content":"atoms I'm a biased as usual put it"},{"from":1472.12,"to":1475.03,"location":2,"content":"through at NH this work is old enough"},{"from":1475.03,"to":1476.95,"location":2,"content":"it's sort of before things like values"},{"from":1476.95,"to":1477.82,"location":2,"content":"became pop"},{"from":1477.82,"to":1479.32,"location":2,"content":"pila but maybe it's better to have at an"},{"from":1479.32,"to":1482.29,"location":2,"content":"age anyway but more like recurrent"},{"from":1482.29,"to":1484.12,"location":2,"content":"neural network and so this was our"},{"from":1484.12,"to":1485.77,"location":2,"content":"meaning composition that gave the"},{"from":1485.77,"to":1488.2,"location":2,"content":"meaning of the parent and then to the"},{"from":1488.2,"to":1490.21,"location":2,"content":"side what the score if it was as to"},{"from":1490.21,"to":1492.61,"location":2,"content":"whether this was a good phrase we were"},{"from":1492.61,"to":1494.98,"location":2,"content":"taking that parent vector representation"},{"from":1494.98,"to":1498.91,"location":2,"content":"and multiplying it by another vector and"},{"from":1498.91,"to":1503.32,"location":2,"content":"that was giving us out a number if you"},{"from":1503.32,"to":1505.66,"location":2,"content":"think about it a bit while we're doing"},{"from":1505.66,"to":1507.7,"location":2,"content":"this you might think that this isn't"},{"from":1507.7,"to":1509.98,"location":2,"content":"quite a perfect model of meaning"},{"from":1509.98,"to":1511.6,"location":2,"content":"composition and later on in the class"},{"from":1511.6,"to":1514.33,"location":2,"content":"I'll talk about some more complex models"},{"from":1514.33,"to":1519.22,"location":2,"content":"that we then started to explore that"},{"from":1519.22,"to":1521.5,"location":2,"content":"this is sort of enough to get us going"},{"from":1521.5,"to":1524.44,"location":2,"content":"and this gave us a way of building a"},{"from":1524.44,"to":1527.26,"location":2,"content":"recursive neural network parser which"},{"from":1527.26,"to":1531.34,"location":2,"content":"both found parsers and worked out a"},{"from":1531.34,"to":1534.04,"location":2,"content":"meaning representation for them and so"},{"from":1534.04,"to":1536.26,"location":2,"content":"the way we did this was in the simplest"},{"from":1536.26,"to":1538.24,"location":2,"content":"possible way really which was to have a"},{"from":1538.24,"to":1539.62,"location":2,"content":"greedy parser"},{"from":1539.62,"to":1541.78,"location":2,"content":"so if we start off with the cat sat on"},{"from":1541.78,"to":1544.09,"location":2,"content":"the mat what we could do is say well"},{"from":1544.09,"to":1546.04,"location":2,"content":"maybe you should join that and cat"},{"from":1546.04,"to":1548.59,"location":2,"content":"together let's try that 
run it through"},{"from":1548.59,"to":1551.26,"location":2,"content":"our neural network it'll get a score and"},{"from":1551.26,"to":1553.81,"location":2,"content":"a meaning representation and while we"},{"from":1553.81,"to":1556.57,"location":2,"content":"could try doing that for 'cat' and 'sat' we"},{"from":1556.57,"to":1559.03,"location":2,"content":"could try doing it for 'sat' and 'on' we could"},{"from":1559.03,"to":1561.4,"location":2,"content":"try doing it for 'on' and 'the' we could try"},{"from":1561.4,"to":1563.92,"location":2,"content":"doing it for 'the' and 'mat' and then at"},{"from":1563.92,"to":1566.83,"location":2,"content":"this point we'd say okay well the"},{"from":1566.83,"to":1569.47,"location":2,"content":"best phrase that we can make combining"},{"from":1569.47,"to":1571.96,"location":2,"content":"these word vectors is the one for 'the"},{"from":1571.96,"to":1574.69,"location":2,"content":"cat' so let's just commit to that one and"},{"from":1574.69,"to":1577.27,"location":2,"content":"it has this semantic representation and"},{"from":1577.27,"to":1580.45,"location":2,"content":"at this point we can essentially repeat"},{"from":1580.45,"to":1582.58,"location":2,"content":"now all the work we did over there we"},{"from":1582.58,"to":1584.23,"location":2,"content":"can just reuse because nothing has"},{"from":1584.23,"to":1587.2,"location":2,"content":"changed but we can also consider now"},{"from":1587.2,"to":1590.44,"location":2,"content":"joining 'the cat' as a constituent with"},{"from":1590.44,"to":1593.44,"location":2,"content":"'sat' and get a score for that and so at"},{"from":1593.44,"to":1595.66,"location":2,"content":"this point we decide okay 'the mat' is the"},{"from":1595.66,"to":1598.02,"location":2,"content":"best constituent to build commit to that"},{"from":1598.02,"to":1600.43,"location":2,"content":"calculate a meaning representation for"},{"from":1600.43,"to":1603.25,"location":2,"content":"'on the mat' that looks good commit to"},{"from":1603.25,"to":1606.04,"location":2,"content":"that and kind of keep on chugging up and"},{"from":1606.04,"to":1608.53,"location":2,"content":"so we've got a mechanism for sort of"},{"from":1608.53,"to":1611.38,"location":2,"content":"choosing a parse of the sentence in"},{"from":1611.38,"to":1611.66,"location":2,"content":"a"},{"from":1611.66,"to":1613.4,"location":2,"content":"really greedy manner but you know when we"},{"from":1613.4,"to":1615.38,"location":2,"content":"looked at the dependency parsing we're"},{"from":1615.38,"to":1617.66,"location":2,"content":"also doing that greedily right and"},{"from":1617.66,"to":1620.83,"location":2,"content":"coming up with a meaning representation"},{"from":1620.83,"to":1623.87,"location":2,"content":"okay so that was our first model of"},{"from":1623.87,"to":1625.76,"location":2,"content":"having a tree recursive neural network"},{"from":1625.76,"to":1629.51,"location":2,"content":"and using it for parsing there are a few"},{"from":1629.51,"to":1634.19,"location":2,"content":"more details here some of which probably"},{"from":1634.19,"to":1637.52,"location":2,"content":"aren't super important at this point"},{"from":1637.52,"to":1640.13,"location":2,"content":"right so we could score a tree by"},{"from":1640.13,"to":1643.82,"location":2,"content":"summing the scores at each node and"},{"from":1643.82,"to":1647,"location":2,"content":"for the optimization we were"},{"from":1647,"to":1648.98,"location":2,"content":"using this kind of
Max"},{"from":1648.98,"to":1651.53,"location":2,"content":"margin loss that we've looked at in"},{"from":1651.53,"to":1655.76,"location":2,"content":"other places the simplest way to do"},{"from":1655.76,"to":1658.84,"location":2,"content":"things is completely greedily you just"},{"from":1658.84,"to":1661.7,"location":2,"content":"find the best local decision each point"},{"from":1661.7,"to":1663.26,"location":2,"content":"and make that structure and keep on"},{"from":1663.26,"to":1665.12,"location":2,"content":"going but if you want to do things a bit"},{"from":1665.12,"to":1668.53,"location":2,"content":"better and we explore this you could say"},{"from":1668.53,"to":1671.66,"location":2,"content":"we could do beam search we could explore"},{"from":1671.66,"to":1674.33,"location":2,"content":"out several good ways emerging and then"},{"from":1674.33,"to":1676.79,"location":2,"content":"the side later higher up the tree as to"},{"from":1676.79,"to":1680.39,"location":2,"content":"which was the best way to merge we"},{"from":1680.39,"to":1682.37,"location":2,"content":"haven't talked about it in this class"},{"from":1682.37,"to":1686.36,"location":2,"content":"but just to mention something in case"},{"from":1686.36,"to":1689.12,"location":2,"content":"people have seen it is traditional"},{"from":1689.12,"to":1691.43,"location":2,"content":"constituency parsing where you have"},{"from":1691.43,"to":1695.54,"location":2,"content":"symbols here like MP or VP there exist"},{"from":1695.54,"to":1698.57,"location":2,"content":"efficient dynamic programming algorithms"},{"from":1698.57,"to":1702.23,"location":2,"content":"where you can find the optimal pars of a"},{"from":1702.23,"to":1704.45,"location":2,"content":"sentence in polynomial time"},{"from":1704.45,"to":1706.7,"location":2,"content":"so in cubic time so if you have a"},{"from":1706.7,"to":1709.49,"location":2,"content":"regular context-free grammar and well so"},{"from":1709.49,"to":1711.23,"location":2,"content":"a regular probabilistic context-free"},{"from":1711.23,"to":1714.17,"location":2,"content":"grammar and you want to know what is the"},{"from":1714.17,"to":1716.06,"location":2,"content":"best part of the sentence according to"},{"from":1716.06,"to":1717.98,"location":2,"content":"the probabilistic context-free grammar"},{"from":1717.98,"to":1720.29,"location":2,"content":"you can write a cubic time dynamic"},{"from":1720.29,"to":1722.69,"location":2,"content":"programming algorithm and you can find"},{"from":1722.69,"to":1725.54,"location":2,"content":"it that's good and in the old days of"},{"from":1725.54,"to":1730.19,"location":2,"content":"cs2 24in before neural networks we used"},{"from":1730.19,"to":1732.29,"location":2,"content":"to have everyone do that the the most"},{"from":1732.29,"to":1735.77,"location":2,"content":"the most brain breaking assignment of"},{"from":1735.77,"to":1738.59,"location":2,"content":"the old cs2 24n was writing this dynamic"},{"from":1738.59,"to":1740.63,"location":2,"content":"program to do context-free grammar"},{"from":1740.63,"to":1744.32,"location":2,"content":"parsing of a sentence the slightly sad"},{"from":1744.32,"to":1745.22,"location":2,"content":"fact"},{"from":1745.22,"to":1747.05,"location":2,"content":"is once you go to these kind of neural"},{"from":1747.05,"to":1749.78,"location":2,"content":"network representations you can't write"},{"from":1749.78,"to":1752.09,"location":2,"content":"clever dynamic programming 
algorithms"},{"from":1752.09,"to":1754.34,"location":2,"content":"anymore because clever dynamic"},{"from":1754.34,"to":1757.01,"location":2,"content":"programming algorithms only work when"},{"from":1757.01,"to":1759.32,"location":2,"content":"you have symbols from a reasonably small"},{"from":1759.32,"to":1761.99,"location":2,"content":"set for your non terminals because if"},{"from":1761.99,"to":1764.9,"location":2,"content":"that's the case you can you kind of have"},{"from":1764.9,"to":1767.12,"location":2,"content":"collisions right you have lots of ways"},{"from":1767.12,"to":1768.98,"location":2,"content":"of parsing stuff lower down"},{"from":1768.98,"to":1771.86,"location":2,"content":"which come up turn out to be different"},{"from":1771.86,"to":1773.87,"location":2,"content":"ways to make a noun phrase or different"},{"from":1773.87,"to":1775.79,"location":2,"content":"ways to make a prepositional phrase and"},{"from":1775.79,"to":1777.68,"location":2,"content":"therefore you can save work with dynamic"},{"from":1777.68,"to":1779.87,"location":2,"content":"programming if you've got a model like"},{"from":1779.87,"to":1782.48,"location":2,"content":"this since everything that you build is"},{"from":1782.48,"to":1784.31,"location":2,"content":"going through layers of neural network"},{"from":1784.31,"to":1786.05,"location":2,"content":"and you've got a meaning representation"},{"from":1786.05,"to":1788.15,"location":2,"content":"of some high dimensional vector things"},{"from":1788.15,"to":1790.19,"location":2,"content":"are never going to collide and so you"},{"from":1790.19,"to":1791.9,"location":2,"content":"can never save work by doing dynamic"},{"from":1791.9,"to":1795.53,"location":2,"content":"programming and so you're either doing"},{"from":1795.53,"to":1797.57,"location":2,"content":"exponential work to explore out"},{"from":1797.57,"to":1799.73,"location":2,"content":"everything or else you're using some"},{"from":1799.73,"to":1801.86,"location":2,"content":"kind of beam to explore a bunch of"},{"from":1801.86,"to":1807.98,"location":2,"content":"likely stuff yeah we actually also"},{"from":1807.98,"to":1811.43,"location":2,"content":"applied this to vision at the same time"},{"from":1811.43,"to":1814.67,"location":2,"content":"so it wasn't just so completely a vague"},{"from":1814.67,"to":1818.78,"location":2,"content":"motivation of visual scenes have parts"},{"from":1818.78,"to":1821.15,"location":2,"content":"that we actually started exploring that"},{"from":1821.15,"to":1824.48,"location":2,"content":"well you could take these pieces of"},{"from":1824.48,"to":1828.5,"location":2,"content":"scenes and then work out representations"},{"from":1828.5,"to":1831.13,"location":2,"content":"for scenes using a similar form of"},{"from":1831.13,"to":1836.05,"location":2,"content":"compositionality and so in particular"},{"from":1836.05,"to":1839.24,"location":2,"content":"there was sort of this data set that was"},{"from":1839.24,"to":1842.57,"location":2,"content":"being used for multi-class segmentation"},{"from":1842.57,"to":1845.39,"location":2,"content":"envision where you start off with very"},{"from":1845.39,"to":1847.64,"location":2,"content":"small patches and then you want to"},{"from":1847.64,"to":1850.43,"location":2,"content":"combine them out into parts of a scene"},{"from":1850.43,"to":1852.53,"location":2,"content":"of sort of recognizing which part of the"},{"from":1852.53,"to":1855.32,"location":2,"content":"picture was the building the sky 
the"},{"from":1855.32,"to":1858.35,"location":2,"content":"road various other classes and we were"},{"from":1858.35,"to":1860.54,"location":2,"content":"actually at the time able to do this"},{"from":1860.54,"to":1863.54,"location":2,"content":"really rather well using one of these"},{"from":1863.54,"to":1865.22,"location":2,"content":"tree recursive structured neural"},{"from":1865.22,"to":1867.5,"location":2,"content":"networks better than preceding work and"},{"from":1867.5,"to":1871.66,"location":2,"content":"vision had done in the late 2000s decade"},{"from":1871.66,"to":1876.02,"location":2,"content":"okay so how can how can we build neural"},{"from":1876.02,"to":1879.26,"location":2,"content":"networks that do this kind of stuff"},{"from":1879.26,"to":1882.47,"location":2,"content":"and so when when we started off"},{"from":1882.47,"to":1884.6,"location":2,"content":"exploring this tree structured neural"},{"from":1884.6,"to":1887.42,"location":2,"content":"networks we thought that this was a cool"},{"from":1887.42,"to":1890.09,"location":2,"content":"original idea and no one had worked on"},{"from":1890.09,"to":1891.44,"location":2,"content":"tree structured neural networks"},{"from":1891.44,"to":1894.32,"location":2,"content":"successfully before but it turned out we"},{"from":1894.32,"to":1896.24,"location":2,"content":"were wrong that there are a couple of"},{"from":1896.24,"to":1900.25,"location":2,"content":"Germans in the mid-1990s who actually"},{"from":1900.25,"to":1902.27,"location":2,"content":"started looking at tree structured"},{"from":1902.27,"to":1905.15,"location":2,"content":"neural networks and had worked out the"},{"from":1905.15,"to":1907.13,"location":2,"content":"math of them so corresponding to the"},{"from":1907.13,"to":1909.23,"location":2,"content":"back propagation through time algorithm"},{"from":1909.23,"to":1911.36,"location":2,"content":"that ABI talked about when we were doing"},{"from":1911.36,"to":1913.34,"location":2,"content":"recurrent neural networks they worked"},{"from":1913.34,"to":1915.26,"location":2,"content":"out the tree structured case which they"},{"from":1915.26,"to":1917.57,"location":2,"content":"called back propagation through"},{"from":1917.57,"to":1921.83,"location":2,"content":"structure there are several slides on"},{"from":1921.83,"to":1925.01,"location":2,"content":"this in these slides but I think I'm"},{"from":1925.01,"to":1928.1,"location":2,"content":"gonna sort of skip them if anyone wants"},{"from":1928.1,"to":1929.66,"location":2,"content":"to look at them there on the web and you"},{"from":1929.66,"to":1931.88,"location":2,"content":"can look at them I mean there isn't"},{"from":1931.88,"to":1935.18,"location":2,"content":"actually anything that's new so if you"},{"from":1935.18,"to":1938.57,"location":2,"content":"remember with with bad scarring or"},{"from":1938.57,"to":1940.31,"location":2,"content":"something those early lectures of this"},{"from":1940.31,"to":1942.95,"location":2,"content":"class of working out the derivatives of"},{"from":1942.95,"to":1944.81,"location":2,"content":"neural networks and how it worked with"},{"from":1944.81,"to":1947.12,"location":2,"content":"recurrent neural networks it's sort of"},{"from":1947.12,"to":1949.25,"location":2,"content":"the same right you gave this recurrent"},{"from":1949.25,"to":1951.59,"location":2,"content":"matrix had different levels of tree"},{"from":1951.59,"to":1954.05,"location":2,"content":"structure you're summing the 
derivatives"},{"from":1954.05,"to":1957.62,"location":2,"content":"of everywhere it turns up the only"},{"from":1957.62,"to":1959,"location":2,"content":"difference is sort of because we now"},{"from":1959,"to":1960.86,"location":2,"content":"have tree structure you're sort of"},{"from":1960.86,"to":1965.51,"location":2,"content":"splitting things downwards so yeah so"},{"from":1965.51,"to":1968.03,"location":2,"content":"for drop we find the computer forwards"},{"from":1968.03,"to":1971.48,"location":2,"content":"and then when we're doing back crop when"},{"from":1971.48,"to":1973.58,"location":2,"content":"we've have the backward propagation we"},{"from":1973.58,"to":1975.32,"location":2,"content":"have the error signal coming from above"},{"from":1975.32,"to":1979.67,"location":2,"content":"we then combine it with the calculations"},{"from":1979.67,"to":1981.02,"location":2,"content":"that this node and then we're sort of"},{"from":1981.02,"to":1983.51,"location":2,"content":"sending it back in a tree structure down"},{"from":1983.51,"to":1987.47,"location":2,"content":"to each of the branches underneath us so"},{"from":1987.47,"to":1990.53,"location":2,"content":"that was our first version of things you"},{"from":1990.53,"to":1992.15,"location":2,"content":"know we got some decent result we got"},{"from":1992.15,"to":1994.01,"location":2,"content":"this good vision results and I showed"},{"from":1994.01,"to":1998.15,"location":2,"content":"you and it sort of seemed to do some"},{"from":1998.15,"to":2001.81,"location":2,"content":"good for language both for parsing and"},{"from":2001.81,"to":2003.97,"location":2,"content":"doing we had some results I haven't"},{"from":2003.97,"to":2006.01,"location":2,"content":"actually included here of sort of doing"},{"from":2006.01,"to":2009.37,"location":2,"content":"paraphrase judgment between sentences"},{"from":2009.37,"to":2012.7,"location":2,"content":"and and model things fail"},{"from":2012.7,"to":2015.91,"location":2,"content":"well but once we started thinking about"},{"from":2015.91,"to":2018.34,"location":2,"content":"it more it seemed like that very simple"},{"from":2018.34,"to":2021.61,"location":2,"content":"neural net function couldn't possibly"},{"from":2021.61,"to":2023.71,"location":2,"content":"compute the kind of meanings that we"},{"from":2023.71,"to":2026.47,"location":2,"content":"wanted to compute for sentence meanings"},{"from":2026.47,"to":2029.14,"location":2,"content":"and so we then sort of said about trying"},{"from":2029.14,"to":2030.88,"location":2,"content":"to come up with some more complex ways"},{"from":2030.88,"to":2033.94,"location":2,"content":"of working out kind of meaning"},{"from":2033.94,"to":2036.16,"location":2,"content":"composition functions and modes that"},{"from":2036.16,"to":2038.14,"location":2,"content":"could then be used to build a better"},{"from":2038.14,"to":2041.02,"location":2,"content":"neural network and sort of sudden some"},{"from":2041.02,"to":2044.11,"location":2,"content":"of the essence of that is on this slide"},{"from":2044.11,"to":2046.93,"location":2,"content":"that you know for the first version we"},{"from":2046.93,"to":2048.58,"location":2,"content":"just didn't have enough complexity of"},{"from":2048.58,"to":2050.8,"location":2,"content":"neural network frankly right so when we"},{"from":2050.8,"to":2053.65,"location":2,"content":"had two constituents we concatenated"},{"from":2053.65,"to":2056.55,"location":2,"content":"them and multiply that by a 
weight"},{"from":2056.55,"to":2060.01,"location":2,"content":"weight matrix and that was sort of"},{"from":2060.01,"to":2064.36,"location":2,"content":"essentially all we had and as I hope"},{"from":2064.36,"to":2067.15,"location":2,"content":"you've got more of a sense of in this"},{"from":2067.15,"to":2069.13,"location":2,"content":"class if you just concatenate and"},{"from":2069.13,"to":2071.98,"location":2,"content":"multiply by weight matrix you're not"},{"from":2071.98,"to":2073.99,"location":2,"content":"actually modeling the interaction"},{"from":2073.99,"to":2076.18,"location":2,"content":"between these two vectors right because"},{"from":2076.18,"to":2078.49,"location":2,"content":"you can think of this weight matrix as"},{"from":2078.49,"to":2080.98,"location":2,"content":"just sort of being divided in two and"},{"from":2080.98,"to":2083.44,"location":2,"content":"half of it multiplies that vector"},{"from":2083.44,"to":2085.81,"location":2,"content":"and half of it multiplies this vector so"},{"from":2085.81,"to":2088.18,"location":2,"content":"the meanings of these two things don't"},{"from":2088.18,"to":2090.61,"location":2,"content":"act on each other and so somehow you"},{"from":2090.61,"to":2092.95,"location":2,"content":"have to make your neural network more"},{"from":2092.95,"to":2095.56,"location":2,"content":"complex than that but the other way in"},{"from":2095.56,"to":2098.65,"location":2,"content":"which this seemed too simple is in the"},{"from":2098.65,"to":2101.92,"location":2,"content":"first model we had just one weight"},{"from":2101.92,"to":2105.27,"location":2,"content":"matrix which we use for everything and"},{"from":2105.27,"to":2108.07,"location":2,"content":"at least if you're a linguist and you're"},{"from":2108.07,"to":2109.39,"location":2,"content":"thinking about the structure of language"},{"from":2109.39,"to":2112.21,"location":2,"content":"you might start thinking of well wait a"},{"from":2112.21,"to":2114.28,"location":2,"content":"minute sometimes you're going to be"},{"from":2114.28,"to":2115.75,"location":2,"content":"putting together a verb and an object"},{"from":2115.75,"to":2120.01,"location":2,"content":"noun phrase hit the ball sometimes"},{"from":2120.01,"to":2121.39,"location":2,"content":"you're going to be putting together an"},{"from":2121.39,"to":2124.42,"location":2,"content":"article and a noun a ball sometimes"},{"from":2124.42,"to":2126.84,"location":2,"content":"you're going to be doing adjective"},{"from":2126.84,"to":2130.03,"location":2,"content":"modification blue ball these things are"},{"from":2130.03,"to":2133.06,"location":2,"content":"very different in their semantics can it"},{"from":2133.06,"to":2135.16,"location":2,"content":"really be the case that you can just"},{"from":2135.16,"to":2137.05,"location":2,"content":"have one weight matrix that is this"},{"from":2137.05,"to":2139.51,"location":2,"content":"universal composition function for"},{"from":2139.51,"to":2141.31,"location":2,"content":"putting together meaning of phrases"},{"from":2141.31,"to":2143.83,"location":2,"content":"could that possibly work and you sort of"},{"from":2143.83,"to":2146.64,"location":2,"content":"might suspect it doesn't work"},{"from":2146.64,"to":2151.33,"location":2,"content":"and so I'm going to go on and show some"},{"from":2151.33,"to":2153.64,"location":2,"content":"of those different things but really"},{"from":2153.64,"to":2158.44,"location":2,"content":"before I show the different things 
I'm"},{"from":2158.44,"to":2160.84,"location":2,"content":"going to show one more version that's sort"},{"from":2160.84,"to":2163.03,"location":2,"content":"of related to the first thing which"},{"from":2163.03,"to":2165.88,"location":2,"content":"actually gave a pretty successful and"},{"from":2165.88,"to":2170.83,"location":2,"content":"good parser for doing context"},{"from":2170.83,"to":2175.06,"location":2,"content":"free style constituency parsing and so"},{"from":2175.06,"to":2177.85,"location":2,"content":"this was another way of getting away"},{"from":2177.85,"to":2181.68,"location":2,"content":"from the parsing being completely greedy"},{"from":2181.68,"to":2185.38,"location":2,"content":"which was to actually split apart the"},{"from":2185.38,"to":2188.2,"location":2,"content":"two parts of gee we have to come up with"},{"from":2188.2,"to":2191.65,"location":2,"content":"a tree structure for our sentence from"},{"from":2191.65,"to":2193.78,"location":2,"content":"let's compute the meaning of the"},{"from":2193.78,"to":2197.38,"location":2,"content":"sentence and so the thinking was well in"},{"from":2197.38,"to":2201.01,"location":2,"content":"terms of deciding what's a good tree"},{"from":2201.01,"to":2203.62,"location":2,"content":"structure for a sentence that's actually"},{"from":2203.62,"to":2205.69,"location":2,"content":"something you can do pretty well with a"},{"from":2205.69,"to":2208.12,"location":2,"content":"symbolic grammar that the problems with"},{"from":2208.12,"to":2210.85,"location":2,"content":"symbolic grammars aren't that they can't"},{"from":2210.85,"to":2213.43,"location":2,"content":"put tree structures over sentences the"},{"from":2213.43,"to":2215.29,"location":2,"content":"problems you have with those grammars is"},{"from":2215.29,"to":2217.12,"location":2,"content":"that they can't compute meaning"},{"from":2217.12,"to":2219.97,"location":2,"content":"representation and they're not very good"},{"from":2219.97,"to":2222.61,"location":2,"content":"at choosing between alternative tree"},{"from":2222.61,"to":2225.58,"location":2,"content":"structures but we could divide up the"},{"from":2225.58,"to":2228.28,"location":2,"content":"two parts so what we could do is say"},{"from":2228.28,"to":2230.34,"location":2,"content":"well let's just use a regular"},{"from":2230.34,"to":2233.35,"location":2,"content":"probabilistic context-free grammar to"},{"from":2233.35,"to":2235.39,"location":2,"content":"generate possible tree structures for"},{"from":2235.39,"to":2237.94,"location":2,"content":"sentences we can generate a k-best"},{"from":2237.94,"to":2240.75,"location":2,"content":"list and say what are the 50 best"},{"from":2240.75,"to":2243.55,"location":2,"content":"context-free grammar structures for this"},{"from":2243.55,"to":2245.23,"location":2,"content":"sentence and that's something we can do"},{"from":2245.23,"to":2247.09,"location":2,"content":"very efficiently with dynamic"},{"from":2247.09,"to":2250.44,"location":2,"content":"programming algorithms and then we can"},{"from":2250.44,"to":2255.28,"location":2,"content":"work out a neural net that will work out"},{"from":2255.28,"to":2256.9,"location":2,"content":"the meaning representation of the"},{"from":2256.9,"to":2262.09,"location":2,"content":"sentence and so that led to this what's"},{"from":2262.09,"to":2265.27,"location":2,"content":"called syntactically untied recursive"},{"from":2265.27,"to":2268.93,"location":2,"content":"neural network so essentially what 
this"},{"from":2268.93,"to":2272.56,"location":2,"content":"is saying is that we had for each node"},{"from":2272.56,"to":2276.43,"location":2,"content":"in the sentence it's got a category of a"},{"from":2276.43,"to":2279.31,"location":2,"content":"symbolic context-free grammar so their"},{"from":2279.31,"to":2279.89,"location":2,"content":"categories"},{"from":2279.89,"to":2283.31,"location":2,"content":"B and C so when we put things"},{"from":2283.31,"to":2286.7,"location":2,"content":"together we'll be able to say okay we've"},{"from":2286.7,"to":2292.79,"location":2,"content":"got a rule that says X goes to BC so"},{"from":2292.79,"to":2295.94,"location":2,"content":"that licenses this node here so that"},{"from":2295.94,"to":2299.3,"location":2,"content":"part of the parsing is symbolic then"},{"from":2299.3,"to":2302.63,"location":2,"content":"then we want to work out the meaning of"},{"from":2302.63,"to":2306.62,"location":2,"content":"this phrase and well the second problem"},{"from":2306.62,"to":2309.95,"location":2,"content":"I talked about was surely just having"},{"from":2309.95,"to":2313.03,"location":2,"content":"one way of doing composition is"},{"from":2313.03,"to":2315.62,"location":2,"content":"expecting a lot too much to be able to"},{"from":2315.62,"to":2317.57,"location":2,"content":"have sort of verb and object versus"},{"from":2317.57,"to":2320.03,"location":2,"content":"adjective and noun composed the same way"},{"from":2320.03,"to":2323.54,"location":2,"content":"so we had this idea of well since we"},{"from":2323.54,"to":2325.97,"location":2,"content":"now know about the syntactic categories"},{"from":2325.97,"to":2328.28,"location":2,"content":"of the children that we maybe know that"},{"from":2328.28,"to":2330.53,"location":2,"content":"this is an adjective and this is a noun"},{"from":2330.53,"to":2333.47,"location":2,"content":"what we could do is have different"},{"from":2333.47,"to":2335.71,"location":2,"content":"weight matrices for composition"},{"from":2335.71,"to":2339.11,"location":2,"content":"depending on what the categories are so"},{"from":2339.11,"to":2342.02,"location":2,"content":"rather than where before there was just"},{"from":2342.02,"to":2344.75,"location":2,"content":"this one universal weight matrix which"},{"from":2344.75,"to":2346.94,"location":2,"content":"was meant to do all meaning composition"},{"from":2346.94,"to":2349.61,"location":2,"content":"here we can have this is the weight"},{"from":2349.61,"to":2351.95,"location":2,"content":"matrix for combining together the"},{"from":2351.95,"to":2354.17,"location":2,"content":"meanings of an adjective and a noun and"},{"from":2354.17,"to":2356.63,"location":2,"content":"it will compute the meaning of this"},{"from":2356.63,"to":2358.82,"location":2,"content":"constituent but then we'll have a"},{"from":2358.82,"to":2361.37,"location":2,"content":"different weight matrix for combining"},{"from":2361.37,"to":2364.1,"location":2,"content":"together the meanings of a determiner"},{"from":2364.1,"to":2370.45,"location":2,"content":"and a noun phrase or something like that"},{"from":2370.45,"to":2376.19,"location":2,"content":"okay yeah so I sort of already said this"},{"from":2376.19,"to":2378.74,"location":2,"content":"one I guess we wanted to be able to do"},{"from":2378.74,"to":2382.97,"location":2,"content":"things quickly and so our solution to be"},{"from":2382.97,"to":2385.4,"location":2,"content":"able to do that is we sort of used 
a"},{"from":2385.4,"to":2387.53,"location":2,"content":"probabilistic context-free grammar to"},{"from":2387.53,"to":2391.22,"location":2,"content":"find likely parses and then only worked"},{"from":2391.22,"to":2394.4,"location":2,"content":"out our meaning for ones that were quite"},{"from":2394.4,"to":2396.53,"location":2,"content":"probable and so we call this result a"},{"from":2396.53,"to":2399.02,"location":2,"content":"compositional vector grammar which was a"},{"from":2399.02,"to":2401.93,"location":2,"content":"combination of a PCFG and a tree"},{"from":2401.93,"to":2408.64,"location":2,"content":"recursive neural network and yes so"},{"from":2408.64,"to":2411.59,"location":2,"content":"essentially at the time this actually"},{"from":2411.59,"to":2413.36,"location":2,"content":"gave a pretty good constituency"},{"from":2413.36,"to":2415.85,"location":2,"content":"parser so there are sort of lots of"},{"from":2415.85,"to":2417.98,"location":2,"content":"results here the top ones are kind of"},{"from":2417.98,"to":2421.28,"location":2,"content":"our classic older Stanford parser which"},{"from":2421.28,"to":2424.07,"location":2,"content":"is a PCFG are the kind of parsers that"},{"from":2424.07,"to":2425.78,"location":2,"content":"people have built this is our"},{"from":2425.78,"to":2427.55,"location":2,"content":"compositional vector grammar"},{"from":2427.55,"to":2431.96,"location":2,"content":"at the time of this being done in 2013"},{"from":2431.96,"to":2434.06,"location":2,"content":"it wasn't the very best parser"},{"from":2434.06,"to":2436.04,"location":2,"content":"available there'd been some better work"},{"from":2436.04,"to":2438.41,"location":2,"content":"by Eugene Charniak at Brown but we"},{"from":2438.41,"to":2440.21,"location":2,"content":"actually had a pretty good parser"},{"from":2440.21,"to":2442.67,"location":2,"content":"coming out of that system but what was"},{"from":2442.67,"to":2446.66,"location":2,"content":"perhaps a bit more interesting was we we"},{"from":2446.66,"to":2448.79,"location":2,"content":"didn't only have a parser that was meant"},{"from":2448.79,"to":2451.19,"location":2,"content":"to give the right parse trees we're also"},{"from":2451.19,"to":2453.86,"location":2,"content":"computing meaning representations of"},{"from":2453.86,"to":2457.82,"location":2,"content":"nodes and as a kind of a consequence of"},{"from":2457.82,"to":2460.16,"location":2,"content":"that you could look at not only meaning"},{"from":2460.16,"to":2462.71,"location":2,"content":"representations of nodes you could learn"},{"from":2462.71,"to":2464.72,"location":2,"content":"about the weight matrices that these"},{"from":2464.72,"to":2467.42,"location":2,"content":"models were learning when they combine"},{"from":2467.42,"to":2469.79,"location":2,"content":"together meanings so remember we have"},{"from":2469.79,"to":2472.4,"location":2,"content":"these sort of category specific W"},{"from":2472.4,"to":2474.74,"location":2,"content":"matrices that were going together with"},{"from":2474.74,"to":2478.28,"location":2,"content":"the children to work out the meaning so"},{"from":2478.28,"to":2480.2,"location":2,"content":"these are a little bit hard to interpret"},{"from":2480.2,"to":2483.23,"location":2,"content":"but the deal is when we learnt these"},{"from":2483.23,"to":2485.84,"location":2,"content":"matrices we initialize them as a pair of"},{"from":2485.84,"to":2488.33,"location":2,"content":"diagonal matrices so these are so two 
by"},{"from":2488.33,"to":2490.7,"location":2,"content":"one rectangular matrices because there"},{"from":2490.7,"to":2495.04,"location":2,"content":"are two children so half of it is"},{"from":2495.04,"to":2497.3,"location":2,"content":"multiplying the left child the other"},{"from":2497.3,"to":2499.49,"location":2,"content":"half is multiplying the right child and"},{"from":2499.49,"to":2501.98,"location":2,"content":"we initialize them as sort of like"},{"from":2501.98,"to":2505.25,"location":2,"content":"two identity matrices next to"},{"from":2505.25,"to":2507.35,"location":2,"content":"each other which would give us the sort"},{"from":2507.35,"to":2510.23,"location":2,"content":"of default semantics of just averaging"},{"from":2510.23,"to":2512.45,"location":2,"content":"until something different was learnt in"},{"from":2512.45,"to":2516.32,"location":2,"content":"the in the in the weight vectors and to"},{"from":2516.32,"to":2518.81,"location":2,"content":"the extent that sort of nothing"},{"from":2518.81,"to":2521.66,"location":2,"content":"interesting has been learnt by the model"},{"from":2521.66,"to":2524.42,"location":2,"content":"you'll get yellow along the diagonal and"},{"from":2524.42,"to":2527.45,"location":2,"content":"this sort of sky-blue in the rest of the"},{"from":2527.45,"to":2530.48,"location":2,"content":"field and to the extent that it's learnt"},{"from":2530.48,"to":2532.61,"location":2,"content":"something interesting to take out of the"},{"from":2532.61,"to":2535.13,"location":2,"content":"semantics of a child you'll then start"},{"from":2535.13,"to":2537.8,"location":2,"content":"to see reds and oranges on the diagonal"},{"from":2537.8,"to":2541.13,"location":2,"content":"and dark blues and greens and stuff in"},{"from":2541.13,"to":2543.83,"location":2,"content":"the rest of the field so what you find"},{"from":2543.83,"to":2546.38,"location":2,"content":"is that if you train this model it"},{"from":2546.38,"to":2547.07,"location":2,"content":"learns"},{"from":2547.07,"to":2552.17,"location":2,"content":"about which children of a phrase are"},{"from":2552.17,"to":2554.84,"location":2,"content":"actually the important ones so these"},{"from":2554.84,"to":2556.91,"location":2,"content":"ones are saying that if you're combining"},{"from":2556.91,"to":2558.32,"location":2,"content":"together a noun phrase and the"},{"from":2558.32,"to":2560.78,"location":2,"content":"coordination so something like the cat"},{"from":2560.78,"to":2563.63,"location":2,"content":"and that most of the semantics is to be"},{"from":2563.63,"to":2566.12,"location":2,"content":"found in the cat and not much of the"},{"from":2566.12,"to":2568.13,"location":2,"content":"semantics is going to be found in and"},{"from":2568.13,"to":2570.68,"location":2,"content":"whereas if you're combining together a"},{"from":2570.68,"to":2573.98,"location":2,"content":"possessive pronoun something like her or"},{"from":2573.98,"to":2578.77,"location":2,"content":"hers with a noun phrase inside it like"},{"from":2578.77,"to":2582.53,"location":2,"content":"her tabby cat or something like that and"},{"from":2582.53,"to":2584.12,"location":2,"content":"most of the meaning is to be found"},{"from":2584.12,"to":2586.91,"location":2,"content":"inside the tabby cat constituent so it's"},{"from":2586.91,"to":2588.56,"location":2,"content":"actually learning where the important"},{"from":2588.56,"to":2593.3,"location":2,"content":"semantics of sentences is and there 
are"},{"from":2593.3,"to":2598.94,"location":2,"content":"lots of examples of that yeah this one"},{"from":2598.94,"to":2601.73,"location":2,"content":"sort of so this one shows a variety of"},{"from":2601.73,"to":2604.34,"location":2,"content":"modification structures where adjectives"},{"from":2604.34,"to":2609.44,"location":2,"content":"or adverbs modify either a noun phrase"},{"from":2609.44,"to":2612.4,"location":2,"content":"or an adjective phrase or just a single"},{"from":2612.4,"to":2615.26,"location":2,"content":"adjective is modifying a noun phrase"},{"from":2615.26,"to":2617.93,"location":2,"content":"and the thing that you seem to notice is"},{"from":2617.93,"to":2619.67,"location":2,"content":"that there are particular dimensions"},{"from":2619.67,"to":2622.54,"location":2,"content":"which are kind of capturing sort of"},{"from":2622.54,"to":2625.22,"location":2,"content":"modification meaning so dimension six"},{"from":2625.22,"to":2628.73,"location":2,"content":"and dimension eleven is sort of showing"},{"from":2628.73,"to":2631.97,"location":2,"content":"up in these different combinations here"},{"from":2631.97,"to":2633.86,"location":2,"content":"sort of capturing meaning components so"},{"from":2633.86,"to":2636.74,"location":2,"content":"that was kind of neat and so this"},{"from":2636.74,"to":2638.8,"location":2,"content":"slightly more complex model actually"},{"from":2638.8,"to":2642.11,"location":2,"content":"worked pretty well at capturing a"},{"from":2642.11,"to":2645.41,"location":2,"content":"meaning of phrases and sentences so in"},{"from":2645.41,"to":2648.47,"location":2,"content":"this test here we were giving the"},{"from":2648.47,"to":2651.65,"location":2,"content":"system a test sentence and saying well"},{"from":2651.65,"to":2655.31,"location":2,"content":"what are the other what are sentences"},{"from":2655.31,"to":2658.43,"location":2,"content":"that are most similar in meaning nearest"},{"from":2658.43,"to":2661.49,"location":2,"content":"to paraphrases in our corpus for this"},{"from":2661.49,"to":2663.65,"location":2,"content":"sentence so for all the figures are"},{"from":2663.65,"to":2666.38,"location":2,"content":"adjusted for seasonal variations the two"},{"from":2666.38,"to":2668.66,"location":2,"content":"most similar other sentences in the"},{"from":2668.66,"to":2671.03,"location":2,"content":"corpus were all the numbers are adjusted"},{"from":2671.03,"to":2673.28,"location":2,"content":"for seasonal fluctuation that's a"},{"from":2673.28,"to":2675.2,"location":2,"content":"pretty easy one or all the figures are"},{"from":2675.2,"to":2677.18,"location":2,"content":"adjusted to remove usual seasonal"},{"from":2677.18,"to":2679.07,"location":2,"content":"patterns so that seems to be working"},{"from":2679.07,"to":2680.63,"location":2,"content":"pretty well Knight-Ridder"},{"from":2680.63,"to":2683.63,"location":2,"content":"wouldn't comment on the offer Harsco"},{"from":2683.63,"to":2685.67,"location":2,"content":"declined to say what country placed the"},{"from":2685.67,"to":2687.86,"location":2,"content":"order the semantics they're a bit more"},{"from":2687.86,"to":2689.3,"location":2,"content":"different but it seems like it is"},{"from":2689.3,"to":2691.64,"location":2,"content":"capturing something similar"},{"from":2691.64,"to":2693.59,"location":2,"content":"Coastal wouldn't disclose the terms"},{"from":2693.59,"to":2695.45,"location":2,"content":"that's kind of a really interesting 
one"},{"from":2695.45,"to":2697.4,"location":2,"content":"because that one is actually very"},{"from":2697.4,"to":2699.92,"location":2,"content":"similar in meaning but it's expressed in"},{"from":2699.92,"to":2702.14,"location":2,"content":"a very different way in terms of the"},{"from":2702.14,"to":2704.27,"location":2,"content":"words and the syntactic structure that"},{"from":2704.27,"to":2705.88,"location":2,"content":"are used"},{"from":2705.88,"to":2709.7,"location":2,"content":"okay so that was progress because now we"},{"from":2709.7,"to":2712.34,"location":2,"content":"could have different matrices for"},{"from":2712.34,"to":2715.82,"location":2,"content":"different constituent types but there's"},{"from":2715.82,"to":2719.03,"location":2,"content":"still some reason to think that we"},{"from":2719.03,"to":2722.87,"location":2,"content":"didn't have enough power and that was we"},{"from":2722.87,"to":2725.18,"location":2,"content":"are still at heart using this very"},{"from":2725.18,"to":2728.66,"location":2,"content":"simple compositional structure we were"},{"from":2728.66,"to":2733.01,"location":2,"content":"just concatenating two children's vectors"},{"from":2733.01,"to":2735.32,"location":2,"content":"and multiplying it by a matrix so that"},{"from":2735.32,"to":2739.07,"location":2,"content":"means the two words didn't interact with"},{"from":2739.07,"to":2743.71,"location":2,"content":"each other in terms of their meaning but"},{"from":2743.71,"to":2747.14,"location":2,"content":"it seems like we want to have them"},{"from":2747.14,"to":2749.63,"location":2,"content":"interact in their meaning right so in"},{"from":2749.63,"to":2753.8,"location":2,"content":"particular if you if you think about"},{"from":2753.8,"to":2756.71,"location":2,"content":"human languages and the kind of things"},{"from":2756.71,"to":2758.27,"location":2,"content":"that people look at in linguistic"},{"from":2758.27,"to":2761.63,"location":2,"content":"semantics you get words that appear to"},{"from":2761.63,"to":2765.23,"location":2,"content":"be kind of modifiers or operators so the"},{"from":2765.23,"to":2768.86,"location":2,"content":"word very sort of doesn't mean much by"},{"from":2768.86,"to":2770.77,"location":2,"content":"itself I mean it means something like"},{"from":2770.77,"to":2774.14,"location":2,"content":"strengthening or more so or something"},{"from":2774.14,"to":2776.45,"location":2,"content":"like that but you know it doesn't really"},{"from":2776.45,"to":2778.76,"location":2,"content":"have a meaning right it doesn't have any"},{"from":2778.76,"to":2781.82,"location":2,"content":"denotation you can't show me very things"},{"from":2781.82,"to":2783.77,"location":2,"content":"right you can show me chairs and pens"},{"from":2783.77,"to":2787.22,"location":2,"content":"and children but you can't show me very"},{"from":2787.22,"to":2789.77,"location":2,"content":"things that the meaning of very seems to"},{"from":2789.77,"to":2792.26,"location":2,"content":"be that something comes after it good"},{"from":2792.26,"to":2795.44,"location":2,"content":"and this has a sort of an operator"},{"from":2795.44,"to":2798.68,"location":2,"content":"meaning of increase on the scale this"},{"from":2798.68,"to":2801.02,"location":2,"content":"thing and it can increase on the scale"},{"from":2801.02,"to":2803.15,"location":2,"content":"in either direction you can have very"},{"from":2803.15,"to":2806.48,"location":2,"content":"good or very bad so if we want 
to"},{"from":2806.48,"to":2810.47,"location":2,"content":"capture that kind of semantics it seems"},{"from":2810.47,"to":2812.15,"location":2,"content":"like we can't capture that kind of"},{"from":2812.15,"to":2813.93,"location":2,"content":"semantics by just"},{"from":2813.93,"to":2816.66,"location":2,"content":"concatenating two vectors and multiplying them"},{"from":2816.66,"to":2819.9,"location":2,"content":"by a matrix it seems like what we really"},{"from":2819.9,"to":2824.13,"location":2,"content":"want to say is very is going to grab"},{"from":2824.13,"to":2827.4,"location":2,"content":"hold of the meaning of good and modify"},{"from":2827.4,"to":2829.68,"location":2,"content":"it in some ways to produce a new meaning"},{"from":2829.68,"to":2832.98,"location":2,"content":"for very good and indeed that's the kind"},{"from":2832.98,"to":2835.98,"location":2,"content":"of approach that's typically been done"},{"from":2835.98,"to":2839.43,"location":2,"content":"in linguistic semantics so in linguistic"},{"from":2839.43,"to":2841.23,"location":2,"content":"theories of semantics you'd normally say"},{"from":2841.23,"to":2844.26,"location":2,"content":"okay good has a meaning very is a"},{"from":2844.26,"to":2846.15,"location":2,"content":"function that takes in the meaning of"},{"from":2846.15,"to":2849.39,"location":2,"content":"good and returns a meaning very good and"},{"from":2849.39,"to":2852.87,"location":2,"content":"so we wanted to have a way of putting"},{"from":2852.87,"to":2855.81,"location":2,"content":"that into a neural network and so to try"},{"from":2855.81,"to":2857.91,"location":2,"content":"and come up with a new composition"},{"from":2857.91,"to":2861.45,"location":2,"content":"function as to how to do that and there"},{"from":2861.45,"to":2863.97,"location":2,"content":"are various ways that you could think"},{"from":2863.97,"to":2866.28,"location":2,"content":"about doing that and other people have"},{"from":2866.28,"to":2868.94,"location":2,"content":"had a couple of different attempts but"},{"from":2868.94,"to":2872.37,"location":2,"content":"essentially what was in our head is well"},{"from":2872.37,"to":2875.82,"location":2,"content":"we have word vectors and if we want to"},{"from":2875.82,"to":2879.15,"location":2,"content":"say that very takes the meaning of good"},{"from":2879.15,"to":2882.75,"location":2,"content":"and returns a new meaning the kind of"},{"from":2882.75,"to":2885.6,"location":2,"content":"obvious thing to do is to say very has a"},{"from":2885.6,"to":2887.88,"location":2,"content":"matrix attached to it because then we"},{"from":2887.88,"to":2890.88,"location":2,"content":"can use the the very matrix and multiply"},{"from":2890.88,"to":2893.81,"location":2,"content":"it by the good vector and we get a new"},{"from":2893.81,"to":2899.73,"location":2,"content":"vector coming out and so then well the"},{"from":2899.73,"to":2903.21,"location":2,"content":"problem is which in which words have"},{"from":2903.21,"to":2905.4,"location":2,"content":"vectors and which words have matrices"},{"from":2905.4,"to":2908.85,"location":2,"content":"and that's kind of hard to know the"},{"from":2908.85,"to":2913.47,"location":2,"content":"answer to I mean in particular words"},{"from":2913.47,"to":2916.5,"location":2,"content":"that act as operators can often"},{"from":2916.5,"to":2924.03,"location":2,"content":"themselves be modified and so that you"},{"from":2924.03,"to":2928.07,"location":2,"content":"know good can also good also is 
an"},{"from":2928.07,"to":2931.64,"location":2,"content":"operator right so that from a sort of a"},{"from":2931.64,"to":2934.38,"location":2,"content":"person you can have a good person that's"},{"from":2934.38,"to":2936.84,"location":2,"content":"sort of also an operator and very is"},{"from":2936.84,"to":2939.81,"location":2,"content":"modifying that good so the idea we came"},{"from":2939.81,"to":2941.61,"location":2,"content":"up with is let's not try and"},{"from":2941.61,"to":2944.01,"location":2,"content":"predetermine all of this why don't we"},{"from":2944.01,"to":2946.69,"location":2,"content":"say that every word and"},{"from":2946.69,"to":2950.5,"location":2,"content":"phrase has connected to it both a matrix"},{"from":2950.5,"to":2954.28,"location":2,"content":"and a vector so here's our very good"},{"from":2954.28,"to":2955.06,"location":2,"content":"movie"},{"from":2955.06,"to":2957.43,"location":2,"content":"so for each word we have a vector"},{"from":2957.43,"to":2959.86,"location":2,"content":"meaning and it has a matrix meaning and"},{"from":2959.86,"to":2962.2,"location":2,"content":"then as we start to build up phrases"},{"from":2962.2,"to":2965.11,"location":2,"content":"like very good they're also going to"},{"from":2965.11,"to":2967.36,"location":2,"content":"have a vector meaning and a matrix"},{"from":2967.36,"to":2973.36,"location":2,"content":"meaning and so what we proposed was so"},{"from":2973.36,"to":2975.94,"location":2,"content":"first of all we we would like to be able"},{"from":2975.94,"to":2982.57,"location":2,"content":"to calculate the vector meanings so to"},{"from":2982.57,"to":2985.39,"location":2,"content":"work out the vector meaning of a phrase"},{"from":2985.39,"to":2988.6,"location":2,"content":"like very good each word has a matrix"},{"from":2988.6,"to":2990.88,"location":2,"content":"meaning and so we're going to combine"},{"from":2990.88,"to":2992.83,"location":2,"content":"their opposing matrix and vector"},{"from":2992.83,"to":2995.62,"location":2,"content":"meanings so we're going to take the"},{"from":2995.62,"to":2998.23,"location":2,"content":"matrix meaning of good and multiply it"},{"from":2998.23,"to":3000.93,"location":2,"content":"by the vector meaning of very and we're"},{"from":3000.93,"to":3002.91,"location":2,"content":"going to take the matrix meaning of"},{"from":3002.91,"to":3005.31,"location":2,"content":"very and multiply it by the vector"},{"from":3005.31,"to":3008.37,"location":2,"content":"meaning of good and so we're going to"},{"from":3008.37,"to":3011.67,"location":2,"content":"have both of those two things and then"},{"from":3011.67,"to":3013.35,"location":2,"content":"we're going to have a neural network"},{"from":3013.35,"to":3016.68,"location":2,"content":"layer like before that combines those"},{"from":3016.68,"to":3018.63,"location":2,"content":"together and so that sort of in the red"},{"from":3018.63,"to":3020.79,"location":2,"content":"box then those two things were"},{"from":3020.79,"to":3023.04,"location":2,"content":"concatenated and put through the kind of"},{"from":3023.04,"to":3025.38,"location":2,"content":"neural network layer we had before to"},{"from":3025.38,"to":3027.99,"location":2,"content":"give us a final vector meaning for this"},{"from":3027.99,"to":3032.16,"location":2,"content":"for the phrase and then we also needed a"},{"from":3032.16,"to":3035.31,"location":2,"content":"matrix meaning for the phrase and so for"},{"from":3035.31,"to":3039.3,"location":2,"content":"the matrix 
meaning for the phrase we"},{"from":3039.3,"to":3041.13,"location":2,"content":"did this kind of simple model which"},{"from":3041.13,"to":3043.23,"location":2,"content":"maybe actually wasn't very good which"},{"from":3043.23,"to":3047.55,"location":2,"content":"was to say let's just concatenate the"},{"from":3047.55,"to":3051.92,"location":2,"content":"two matrices of the the constituents"},{"from":3051.92,"to":3054.96,"location":2,"content":"multiply them by another matrix and"},{"from":3054.96,"to":3057.05,"location":2,"content":"that's then going to give us a matrix"},{"from":3057.05,"to":3060.66,"location":2,"content":"version of the parent node and so this"},{"from":3060.66,"to":3063.21,"location":2,"content":"gave us our new more complicated more"},{"from":3063.21,"to":3066.87,"location":2,"content":"powerful composition procedure this did"},{"from":3066.87,"to":3069.99,"location":2,"content":"seem like it could do some kind of good"},{"from":3069.99,"to":3073.29,"location":2,"content":"things that captured a sort of operator"},{"from":3073.29,"to":3076.2,"location":2,"content":"semantics where one word modified the"},{"from":3076.2,"to":3079.83,"location":2,"content":"meaning of another word so here's a kind"},{"from":3079.83,"to":3080.65,"location":2,"content":"of a neat thing"},{"from":3080.65,"to":3086.89,"location":2,"content":"that we were able to do with this that"},{"from":3086.89,"to":3089.8,"location":2,"content":"we are wanting to be able to work out"},{"from":3089.8,"to":3093.31,"location":2,"content":"the semantics of an operator modifying"},{"from":3093.31,"to":3096.36,"location":2,"content":"another word so unbelievably annoying"},{"from":3096.36,"to":3100.44,"location":2,"content":"unbelievably awesome unbelievably sad"},{"from":3100.44,"to":3104.29,"location":2,"content":"not annoying not awesome not sad"},{"from":3104.29,"to":3110.65,"location":2,"content":"and so this was contrasting our old"},{"from":3110.65,"to":3114.67,"location":2,"content":"model versus the new model and this"},{"from":3114.67,"to":3117.54,"location":2,"content":"scale is a scale of positive to negative"},{"from":3117.54,"to":3120.76,"location":2,"content":"so this is completely negative to"},{"from":3120.76,"to":3123.88,"location":2,"content":"completely positive right and so the"},{"from":3123.88,"to":3130.15,"location":2,"content":"kind of contrasts you get that for not"},{"from":3130.15,"to":3133.27,"location":2,"content":"annoying that the simple model thought"},{"from":3133.27,"to":3136,"location":2,"content":"that this is pretty negative whereas the"},{"from":3136,"to":3138.13,"location":2,"content":"new model thinks this is pretty neutral"},{"from":3138.13,"to":3140.28,"location":2,"content":"in meaning and that seems to be"},{"from":3140.28,"to":3145.12,"location":2,"content":"reasonably correct for not sad that"},{"from":3145.12,"to":3147.52,"location":2,"content":"means it's a little bit positive and"},{"from":3147.52,"to":3150.52,"location":2,"content":"both models were trying to capture"},{"from":3150.52,"to":3153.46,"location":2,"content":"that then you know the results here are a"},{"from":3153.46,"to":3155.59,"location":2,"content":"little bit ambivalent but it sort of"},{"from":3155.59,"to":3157.45,"location":2,"content":"seems that they sort of go a little bit"},{"from":3157.45,"to":3163.29,"location":2,"content":"in the direction of what we want yes"},{"from":3163.29,"to":3167.05,"location":2,"content":"this ground truth was we actually 
asked"},{"from":3167.05,"to":3169.38,"location":2,"content":"a whole bunch of human beings to say"},{"from":3169.38,"to":3173.68,"location":2,"content":"rate the meaning of not sad on this"},{"from":3173.68,"to":3176.02,"location":2,"content":"scale of 1 to 10 maybe this wasn't a"},{"from":3176.02,"to":3177.91,"location":2,"content":"very good clear task because as you can"},{"from":3177.91,"to":3179.95,"location":2,"content":"see it bounced around a lot"},{"from":3179.95,"to":3184.18,"location":2,"content":"uh-huh what kind of ratings we were"},{"from":3184.18,"to":3186.28,"location":2,"content":"getting for things but yeah that was"},{"from":3186.28,"to":3190.02,"location":2,"content":"actually going to gain human judgments"},{"from":3190.02,"to":3195.16,"location":2,"content":"we also then used this model to say well"},{"from":3195.16,"to":3198.19,"location":2,"content":"could we do semantic classification"},{"from":3198.19,"to":3201.03,"location":2,"content":"tasks so if we wanted to understand"},{"from":3201.03,"to":3204.18,"location":2,"content":"relations between different noun phrases"},{"from":3204.18,"to":3208.96,"location":2,"content":"so this was a data set where there were"},{"from":3208.96,"to":3211.63,"location":2,"content":"relations marked between two noun"},{"from":3211.63,"to":3214,"location":2,"content":"phrases my apartment has a"},{"from":3214,"to":3216.67,"location":2,"content":"pretty large kitchen that was seen as an"},{"from":3216.67,"to":3220.45,"location":2,"content":"example of a component-whole a a part-of"},{"from":3220.45,"to":3222.67,"location":2,"content":"relationship between the two noun"},{"from":3222.67,"to":3224.56,"location":2,"content":"phrases and there were other"},{"from":3224.56,"to":3227.65,"location":2,"content":"relationships between different kinds of"},{"from":3227.65,"to":3230.56,"location":2,"content":"noun phrases so for us the movie showed"},{"from":3230.56,"to":3234.19,"location":2,"content":"wars that that was then a message-topic"},{"from":3234.19,"to":3236.5,"location":2,"content":"so there's some communication medium"},{"from":3236.5,"to":3239.11,"location":2,"content":"that contains some topic relationship"},{"from":3239.11,"to":3242.2,"location":2,"content":"and so we were using this kind of neural"},{"from":3242.2,"to":3244.14,"location":2,"content":"network to sort of build our meaning"},{"from":3244.14,"to":3246.58,"location":2,"content":"representations and then putting them"},{"from":3246.58,"to":3249.01,"location":2,"content":"through another neural network layer as"},{"from":3249.01,"to":3252.31,"location":2,"content":"a classifier to see how well we did and"},{"from":3252.31,"to":3254.53,"location":2,"content":"so we got some sort of fairly good"},{"from":3254.53,"to":3256.75,"location":2,"content":"results on that so this was a data set"},{"from":3256.75,"to":3258.88,"location":2,"content":"that people had worked on with"},{"from":3258.88,"to":3261.97,"location":2,"content":"traditional NLP systems of different"},{"from":3261.97,"to":3264.46,"location":2,"content":"kinds of machine learning methods but in"},{"from":3264.46,"to":3266.74,"location":2,"content":"some sense you know what we're"},{"from":3266.74,"to":3268.6,"location":2,"content":"interested in was we seem to be making"},{"from":3268.6,"to":3270.61,"location":2,"content":"progress in having a better semantic"},{"from":3270.61,"to":3272.88,"location":2,"content":"composition system that our old"},{"from":3272.88,"to":3275.38,"location":2,"content":"recursive 
neural network was getting"},{"from":3275.38,"to":3278.38,"location":2,"content":"about 75% and then our new one was"},{"from":3278.38,"to":3281.08,"location":2,"content":"getting about 79% which we could sort of"},{"from":3281.08,"to":3283.48,"location":2,"content":"push up further by putting more features"},{"from":3283.48,"to":3288.73,"location":2,"content":"into our system so that was progress but"},{"from":3288.73,"to":3290.59,"location":2,"content":"we didn't stop there"},{"from":3290.59,"to":3292.87,"location":2,"content":"and we kept on trying to come up with"},{"from":3292.87,"to":3296.56,"location":2,"content":"better ways of doing things and so even"},{"from":3296.56,"to":3299.74,"location":2,"content":"though things worked fairly well here it"},{"from":3299.74,"to":3302.94,"location":2,"content":"sort of seemed like this way of doing"},{"from":3302.94,"to":3307.63,"location":2,"content":"matrices wasn't necessarily very good it"},{"from":3307.63,"to":3311.17,"location":2,"content":"sort of had two problems one problem was"},{"from":3311.17,"to":3314.68,"location":2,"content":"it introduced a humongous number of"},{"from":3314.68,"to":3316.72,"location":2,"content":"parameters because you know for just"},{"from":3316.72,"to":3318.06,"location":2,"content":"about everything that we've done"},{"from":3318.06,"to":3321.7,"location":2,"content":"otherwise words have had a vector and"},{"from":3321.7,"to":3324.73,"location":2,"content":"well maybe sometimes we use quite high"},{"from":3324.73,"to":3326.77,"location":2,"content":"dimensional vectors like a thousand and"},{"from":3326.77,"to":3329.5,"location":2,"content":"twenty four but you know that's a"},{"from":3329.5,"to":3331.54,"location":2,"content":"relatively modest number of parameters"},{"from":3331.54,"to":3334.45,"location":2,"content":"whereas once we introduced this matrix"},{"from":3334.45,"to":3337.26,"location":2,"content":"here we've got that number squared"},{"from":3337.26,"to":3340.95,"location":2,"content":"additional parameters for every word and"},{"from":3340.95,"to":3343.24,"location":2,"content":"essentially because of that number of"},{"from":3343.24,"to":3345.52,"location":2,"content":"parameters to be able to compute this"},{"from":3345.52,"to":3347.82,"location":2,"content":"model at all we were making the"},{"from":3347.82,"to":3349.89,"location":2,"content":"vector size small so what we're actually"},{"from":3349.89,"to":3351.9,"location":2,"content":"using was these were just 25"},{"from":3351.9,"to":3354.9,"location":2,"content":"dimensional vectors so that the 25"},{"from":3354.9,"to":3358.92,"location":2,"content":"squared 625 still stayed sort of decently"},{"from":3358.92,"to":3360.63,"location":2,"content":"within the range in which we could"},{"from":3360.63,"to":3363.21,"location":2,"content":"compute so that was the first problem"},{"from":3363.21,"to":3366.12,"location":2,"content":"the second problem is we didn't really"},{"from":3366.12,"to":3369.87,"location":2,"content":"have very good ways of sort of building"},{"from":3369.87,"to":3372.72,"location":2,"content":"up the matrix meaning of bigger phrases"},{"from":3372.72,"to":3375.03,"location":2,"content":"I mean you know this sort of seems"},{"from":3375.03,"to":3377.34,"location":2,"content":"something simple we could do but it"},{"from":3377.34,"to":3379.59,"location":2,"content":"didn't you know feel like a very good way of"},{"from":3379.59,"to":3382.2,"location":2,"content":"getting a matrix meaning of a phrase 
so"},{"from":3382.2,"to":3383.67,"location":2,"content":"we sort of wanted to come up with some"},{"from":3383.67,"to":3385.35,"location":2,"content":"other way of doing things"},{"from":3385.35,"to":3387.65,"location":2,"content":"that could fix both of those problems"},{"from":3387.65,"to":3390.92,"location":2,"content":"and then that led into work on"},{"from":3390.92,"to":3394.29,"location":2,"content":"recursive neural tensor networks and"},{"from":3394.29,"to":3396.84,"location":2,"content":"there's a kind of a nice idea here of"},{"from":3396.84,"to":3400.29,"location":2,"content":"these neural tensors which is an idea"},{"from":3400.29,"to":3401.85,"location":2,"content":"that's actually being used in other"},{"from":3401.85,"to":3406.41,"location":2,"content":"places including work on sort of putting"},{"from":3406.41,"to":3408.54,"location":2,"content":"vector embeddings of knowledge graphs"},{"from":3408.54,"to":3410.7,"location":2,"content":"and so on which is a kind of a bit of a"},{"from":3410.7,"to":3413.58,"location":2,"content":"nice idea so I wanted to sort of show a"},{"from":3413.58,"to":3417.15,"location":2,"content":"bit of how this model works and but just"},{"from":3417.15,"to":3420.03,"location":2,"content":"to say first a place where we applied"},{"from":3420.03,"to":3422.16,"location":2,"content":"this model was on the problem of"},{"from":3422.16,"to":3424.77,"location":2,"content":"sentiment analysis now I think the term"},{"from":3424.77,"to":3426.78,"location":2,"content":"sentiment analysis has come up a few"},{"from":3426.78,"to":3429.36,"location":2,"content":"times as something you can do and"},{"from":3429.36,"to":3432.44,"location":2,"content":"actually Richard mentioned in the last"},{"from":3432.44,"to":3435.21,"location":2,"content":"lecture but I think we've never really"},{"from":3435.21,"to":3438.21,"location":2,"content":"talked for five minutes in this class on"},{"from":3438.21,"to":3440.97,"location":2,"content":"sentiment analysis so I'll give you this"},{"from":3440.97,"to":3444.09,"location":2,"content":"as an example of that sentiment analysis"},{"from":3444.09,"to":3446.22,"location":2,"content":"has actually been a really common and"},{"from":3446.22,"to":3449.34,"location":2,"content":"important application in natural"},{"from":3449.34,"to":3452.13,"location":2,"content":"language processing you're looking at a"},{"from":3452.13,"to":3454.47,"location":2,"content":"piece of text and you're sort of saying"},{"from":3454.47,"to":3459.15,"location":2,"content":"is it positive or negative and that's"},{"from":3459.15,"to":3460.86,"location":2,"content":"just something that's very useful for"},{"from":3460.86,"to":3463.68,"location":2,"content":"lots of commercial applications of"},{"from":3463.68,"to":3466.05,"location":2,"content":"looking at product reviews or doing"},{"from":3466.05,"to":3468.96,"location":2,"content":"brand awareness and things like that of"},{"from":3468.96,"to":3470.58,"location":2,"content":"sort of looking at sentiment connected"},{"from":3470.58,"to":3473.1,"location":2,"content":"to things and to some extent doing"},{"from":3473.1,"to":3475.68,"location":2,"content":"sentiment analysis is easy right that"},{"from":3475.68,"to":3478.11,"location":2,"content":"you can kind of say well look at a piece"},{"from":3478.11,"to":3480.54,"location":2,"content":"of text if you see words like loved"},{"from":3480.54,"to":3481.66,"location":2,"content":"great 
impressed"},{"from":3481.66,"to":3484.42,"location":2,"content":"and it's positive it's a positive review"},{"from":3484.42,"to":3486.97,"location":2,"content":"and if it's saying bad and awful then"},{"from":3486.97,"to":3489.07,"location":2,"content":"it's a negative review and to some"},{"from":3489.07,"to":3491.2,"location":2,"content":"extent that's the baseline of sentiment"},{"from":3491.2,"to":3494.5,"location":2,"content":"analysis that you can use just either"},{"from":3494.5,"to":3497.59,"location":2,"content":"selected word features or all words in a"},{"from":3497.59,"to":3500.17,"location":2,"content":"bag of words and if you do that you"},{"from":3500.17,"to":3503.46,"location":2,"content":"don't actually do that badly in"},{"from":3503.46,"to":3505.78,"location":2,"content":"sentiment analysis if you have longer"},{"from":3505.78,"to":3508,"location":2,"content":"documents just looking at bags of words"},{"from":3508,"to":3510.4,"location":2,"content":"can give you 90% in sentiment analysis"},{"from":3510.4,"to":3513.4,"location":2,"content":"but on the other hand things often do"},{"from":3513.4,"to":3516.7,"location":2,"content":"get trickier right so this is from"},{"from":3516.7,"to":3519.25,"location":2,"content":"Rotten Tomatoes with this cast and this"},{"from":3519.25,"to":3521.26,"location":2,"content":"subject matter the movie should have"},{"from":3521.26,"to":3523.39,"location":2,"content":"been funnier and more entertaining and"},{"from":3523.39,"to":3525.79,"location":2,"content":"if you sort of pretend you're a bag of"},{"from":3525.79,"to":3529.03,"location":2,"content":"words model the only words in this that"},{"from":3529.03,"to":3531.55,"location":2,"content":"are sort of clearly sentiment-laden"},{"from":3531.55,"to":3535.99,"location":2,"content":"words are entertaining and funnier and both"},{"from":3535.99,"to":3538.99,"location":2,"content":"of those are pretty positive words but"},{"from":3538.99,"to":3541.27,"location":2,"content":"it's fairly obvious that this actually"},{"from":3541.27,"to":3543.73,"location":2,"content":"is meant to be a bad review of the"},{"from":3543.73,"to":3546.64,"location":2,"content":"movie and so well how are we meant to"},{"from":3546.64,"to":3548.68,"location":2,"content":"know that well it sort of seems again"},{"from":3548.68,"to":3550.3,"location":2,"content":"like what we have to do is meaning"},{"from":3550.3,"to":3553.26,"location":2,"content":"composition we have to get sort of"},{"from":3553.26,"to":3555.73,"location":2,"content":"phrases like should have been funnier"},{"from":3555.73,"to":3558.46,"location":2,"content":"and then realized that that's actually a"},{"from":3558.46,"to":3561.61,"location":2,"content":"negative meaning for a phrase and so we"},{"from":3561.61,"to":3564.07,"location":2,"content":"wanted to explore how we could look at"},{"from":3564.07,"to":3567.58,"location":2,"content":"those sort of meanings for phrases and"},{"from":3567.58,"to":3569.92,"location":2,"content":"explore building up those meanings as"},{"from":3569.92,"to":3573.97,"location":2,"content":"doing meaning composition over trees um"},{"from":3573.97,"to":3577.51,"location":2,"content":"so the first thing we did was we built a"},{"from":3577.51,"to":3580.69,"location":2,"content":"treebank of sentiment trees where we"},{"from":3580.69,"to":3583.03,"location":2,"content":"got people to rate sentiment and so this"},{"from":3583.03,"to":3585.61,"location":2,"content":"led to the Stanford Sentiment 
Treebank"},{"from":3585.61,"to":3588.19,"location":2,"content":"which is still a data set you often see"},{"from":3588.19,"to":3592.48,"location":2,"content":"used in various evaluations with a"},{"from":3592.48,"to":3594.28,"location":2,"content":"whole bunch of data sets indeed that"},{"from":3594.28,"to":3597.73,"location":2,"content":"showed up in decaNLP last week so"},{"from":3597.73,"to":3601.59,"location":2,"content":"what we were doing this was taking"},{"from":3601.59,"to":3603.96,"location":2,"content":"sentences which were Rotten Tomatoes"},{"from":3603.96,"to":3607.36,"location":2,"content":"sentences from movies we were parsing"},{"from":3607.36,"to":3609.85,"location":2,"content":"them to give tree structure and then we"},{"from":3609.85,"to":3613.69,"location":2,"content":"were asking Mechanical Turkers to rate"},{"from":3613.69,"to":3614.57,"location":2,"content":"the"},{"from":3614.57,"to":3616.67,"location":2,"content":"different the different words and"},{"from":3616.67,"to":3619.67,"location":2,"content":"phrases on a sentiment scale of very"},{"from":3619.67,"to":3622.13,"location":2,"content":"positive to very negative so lots of"},{"from":3622.13,"to":3624.35,"location":2,"content":"stuff is white because it's just not"},{"from":3624.35,"to":3626.75,"location":2,"content":"sentiment-laden right there's words that"},{"from":3626.75,"to":3629.09,"location":2,"content":"are there and there's phrases like the"},{"from":3629.09,"to":3631.7,"location":2,"content":"movie and the movie was which don't"},{"from":3631.7,"to":3633.8,"location":2,"content":"really have any sentiment but then you"},{"from":3633.8,"to":3635.9,"location":2,"content":"have pieces of sort of very positive"},{"from":3635.9,"to":3639.11,"location":2,"content":"pieces of tree and negative pieces of"},{"from":3639.11,"to":3641.3,"location":2,"content":"tree that are then shown in the blue and"},{"from":3641.3,"to":3644.21,"location":2,"content":"the red and so typically in sentiment"},{"from":3644.21,"to":3647.24,"location":2,"content":"datasets people have only labeled the"},{"from":3647.24,"to":3650.24,"location":2,"content":"entire sentence to say this is a"},{"from":3650.24,"to":3652.37,"location":2,"content":"positive sentence or a very positive"},{"from":3652.37,"to":3654.35,"location":2,"content":"sentence this is a negative sentence or"},{"from":3654.35,"to":3656.9,"location":2,"content":"a very negative sentence crucially what"},{"from":3656.9,"to":3659.72,"location":2,"content":"we were doing differently here is every"},{"from":3659.72,"to":3662.42,"location":2,"content":"phrase in the sentence according to our"},{"from":3662.42,"to":3665.03,"location":2,"content":"tree structure was being given a label"},{"from":3665.03,"to":3668.93,"location":2,"content":"for its positivity or negativity and"},{"from":3668.93,"to":3671.6,"location":2,"content":"perhaps not surprisingly just the fact"},{"from":3671.6,"to":3673.19,"location":2,"content":"that you have a lot more annotations"},{"from":3673.19,"to":3677.54,"location":2,"content":"like that just improves the behavior of"},{"from":3677.54,"to":3680.54,"location":2,"content":"classifiers because you kind of can do"},{"from":3680.54,"to":3681.04,"location":2,"content":"better"},{"from":3681.04,"to":3683.9,"location":2,"content":"attribution of which words in a sentence"},{"from":3683.9,"to":3688.7,"location":2,"content":"are positive or negative so these were"},{"from":3688.7,"to":3692.39,"location":2,"content":"were results of sort of 
preceding models"},{"from":3692.39,"to":3695.98,"location":2,"content":"so the green is a naive Bayes model"},{"from":3695.98,"to":3699.35,"location":2,"content":"except it not only uses individual words"},{"from":3699.35,"to":3702.32,"location":2,"content":"but it uses pairs of words it turns out"},{"from":3702.32,"to":3703.75,"location":2,"content":"if you're building at a traditional"},{"from":3703.75,"to":3706.61,"location":2,"content":"classifier and you want to do sentiment"},{"from":3706.61,"to":3708.32,"location":2,"content":"analysis as opposed to something like"},{"from":3708.32,"to":3711.05,"location":2,"content":"topic classification you get a lot"},{"from":3711.05,"to":3713.51,"location":2,"content":"better results if you also use word pair"},{"from":3713.51,"to":3715.49,"location":2,"content":"features and that's because it does a"},{"from":3715.49,"to":3719,"location":2,"content":"baby bit of composition for you you"},{"from":3719,"to":3720.92,"location":2,"content":"don't only have features for not an"},{"from":3720.92,"to":3722.63,"location":2,"content":"interesting but you can have a feature"},{"from":3722.63,"to":3725.39,"location":2,"content":"for not interesting and that lets you"},{"from":3725.39,"to":3727.91,"location":2,"content":"model a certain amount of stuff and then"},{"from":3727.91,"to":3729.74,"location":2,"content":"these are our older generations of"},{"from":3729.74,"to":3732.05,"location":2,"content":"neural networks our original tree"},{"from":3732.05,"to":3733.91,"location":2,"content":"structured neural network and our matrix"},{"from":3733.91,"to":3737.54,"location":2,"content":"vector one and so simply having for"},{"from":3737.54,"to":3739.79,"location":2,"content":"these so fixed models simply having the"},{"from":3739.79,"to":3742.64,"location":2,"content":"richer supervision that comes from our"},{"from":3742.64,"to":3744.92,"location":2,"content":"new tree Bank it sort of moved up the"},{"from":3744.92,"to":3747.31,"location":2,"content":"performance of every model so even"},{"from":3747.31,"to":3750.61,"location":2,"content":"for just the naivebayes models"},{"from":3750.61,"to":3754.03,"location":2,"content":"performances going up about 4% because"},{"from":3754.03,"to":3757.6,"location":2,"content":"of the fact that it now knows more about"},{"from":3757.6,"to":3760.33,"location":2,"content":"which particular words are positive or"},{"from":3760.33,"to":3764.08,"location":2,"content":"negative in the sentences but still none"},{"from":3764.08,"to":3766.65,"location":2,"content":"of these performances are really great"},{"from":3766.65,"to":3769.84,"location":2,"content":"so we still thought that well can we"},{"from":3769.84,"to":3773.68,"location":2,"content":"build better models of how to do this in"},{"from":3773.68,"to":3776.32,"location":2,"content":"particular if you look at sentences with"},{"from":3776.32,"to":3779.23,"location":2,"content":"sort of various kinds of negation you"},{"from":3779.23,"to":3780.67,"location":2,"content":"know things like should have been"},{"from":3780.67,"to":3783.22,"location":2,"content":"funnier these models in general still"},{"from":3783.22,"to":3785.47,"location":2,"content":"couldn't capture the right meanings for"},{"from":3785.47,"to":3788.38,"location":2,"content":"them and so that led into our fourth"},{"from":3788.38,"to":3792.04,"location":2,"content":"model of how to do this which is this"},{"from":3792.04,"to":3795.48,"location":2,"content":"idea of recursive neural tenser 
networks"},{"from":3795.48,"to":3799.3,"location":2,"content":"and so what we wanted to be able to do"},{"from":3799.3,"to":3803.7,"location":2,"content":"is go back to just having meanings of"},{"from":3803.7,"to":3807.4,"location":2,"content":"words be vectors but nevertheless"},{"from":3807.4,"to":3810.58,"location":2,"content":"despite that to be able to have a"},{"from":3810.58,"to":3812.65,"location":2,"content":"meaning for a phrase where the two"},{"from":3812.65,"to":3817.63,"location":2,"content":"vectors acted on each other and well you"},{"from":3817.63,"to":3820.36,"location":2,"content":"know this kind of this is the picture of"},{"from":3820.36,"to":3822.85,"location":2,"content":"what we did when we were doing attention"},{"from":3822.85,"to":3825.34,"location":2,"content":"in a bilinear way right we had vectors"},{"from":3825.34,"to":3827.89,"location":2,"content":"for two words we stuck a matrix in"},{"from":3827.89,"to":3831.79,"location":2,"content":"between and we use that and gave an"},{"from":3831.79,"to":3834.37,"location":2,"content":"attention and got an attention score out"},{"from":3834.37,"to":3838.12,"location":2,"content":"so that let these two vectors interact"},{"from":3838.12,"to":3840.82,"location":2,"content":"with each other but it only produced one"},{"from":3840.82,"to":3843.31,"location":2,"content":"number as the output but there's a way"},{"from":3843.31,"to":3846.73,"location":2,"content":"to fix that which is to say well rather"},{"from":3846.73,"to":3851.02,"location":2,"content":"than having matrix here what we could"},{"from":3851.02,"to":3853.86,"location":2,"content":"stick here is a three dimensional cube"},{"from":3853.86,"to":3856.15,"location":2,"content":"which physicists and deep learning"},{"from":3856.15,"to":3859.27,"location":2,"content":"people caught now call a tensor right so"},{"from":3859.27,"to":3860.47,"location":2,"content":"a tensor is just higher"},{"from":3860.47,"to":3863.08,"location":2,"content":"multi-dimensional array in computer"},{"from":3863.08,"to":3867.19,"location":2,"content":"science terms so if we sort of made that"},{"from":3867.19,"to":3870.43,"location":2,"content":"a tensor you know it's like we have sort"},{"from":3870.43,"to":3873.01,"location":2,"content":"of multiple layers and matrix here and"},{"from":3873.01,"to":3876.22,"location":2,"content":"so the end result of that is we get one"},{"from":3876.22,"to":3878.7,"location":2,"content":"number here and one number here"},{"from":3878.7,"to":3881.88,"location":2,"content":"so in total we get out of size two"},{"from":3881.88,"to":3885.24,"location":2,"content":"vector which is all we need and my baby"},{"from":3885.24,"to":3887.82,"location":2,"content":"example where we baby examples where we"},{"from":3887.82,"to":3889.41,"location":2,"content":"only have these two component vectors"},{"from":3889.41,"to":3891.21,"location":2,"content":"for words but in general we have a"},{"from":3891.21,"to":3894.06,"location":2,"content":"tensor with the extra mention dimension"},{"from":3894.06,"to":3896.4,"location":2,"content":"of the size of our word vector and so"},{"from":3896.4,"to":3898.32,"location":2,"content":"therefore we'll get a word vector at"},{"from":3898.32,"to":3900.69,"location":2,"content":"work we'll get a phrase vector out from"},{"from":3900.69,"to":3903.42,"location":2,"content":"the composition that's the same size of"},{"from":3903.42,"to":3906.5,"location":2,"content":"the input vectors and will allow them 
to"},{"from":3906.5,"to":3909.69,"location":2,"content":"interact with each other in working out"},{"from":3909.69,"to":3916.28,"location":2,"content":"the meaning of the entire thing okay"},{"from":3916.28,"to":3920.55,"location":2,"content":"right at that point we use the resulting"},{"from":3920.55,"to":3927.78,"location":2,"content":"vectors so we had our new tensor network"},{"from":3927.78,"to":3930.09,"location":2,"content":"we actually combined it together with"},{"from":3930.09,"to":3932.46,"location":2,"content":"this sort of previous kind of layer we"},{"from":3932.46,"to":3936.15,"location":2,"content":"used to have our first R and then maybe"},{"from":3936.15,"to":3937.5,"location":2,"content":"you didn't need to do this but we just"},{"from":3937.5,"to":3939.99,"location":2,"content":"decided we're betting as well put things"},{"from":3939.99,"to":3942.18,"location":2,"content":"through a non-linearity and that was"},{"from":3942.18,"to":3944.76,"location":2,"content":"then giving us our new representation of"},{"from":3944.76,"to":3947.61,"location":2,"content":"Fraser's we built that up the tree and"},{"from":3947.61,"to":3950.34,"location":2,"content":"then at the end we could classify the"},{"from":3950.34,"to":3954.27,"location":2,"content":"meaning of any phrase in the same kind"},{"from":3954.27,"to":3956.7,"location":2,"content":"of way with softmax regression and we"},{"from":3956.7,"to":3958.41,"location":2,"content":"could train these weights with gradient"},{"from":3958.41,"to":3961.17,"location":2,"content":"descent to predict sentiment and so this"},{"from":3961.17,"to":3964.29,"location":2,"content":"actually worked pretty nicely I'm in"},{"from":3964.29,"to":3966.57,"location":2,"content":"particular it didn't show really work"},{"from":3966.57,"to":3969.63,"location":2,"content":"any better with just the sentence labels"},{"from":3969.63,"to":3972.72,"location":2,"content":"but if we train the model with our tree"},{"from":3972.72,"to":3974.75,"location":2,"content":"bank we could then get a kind of"},{"from":3974.75,"to":3977.22,"location":2,"content":"whatever that is about another couple of"},{"from":3977.22,"to":3979.14,"location":2,"content":"percent and performance and so that"},{"from":3979.14,"to":3981.9,"location":2,"content":"seemed good and so in particular it"},{"from":3981.9,"to":3984,"location":2,"content":"seemed to do a much better job of"},{"from":3984,"to":3985.62,"location":2,"content":"actually understanding meaning"},{"from":3985.62,"to":3988.17,"location":2,"content":"composition so here's the kind of"},{"from":3988.17,"to":3990.93,"location":2,"content":"sentence where you have there a slow and"},{"from":3990.93,"to":3993,"location":2,"content":"repetitive parts but it has just enough"},{"from":3993,"to":3995.52,"location":2,"content":"spice to keep it interesting and the"},{"from":3995.52,"to":3997.47,"location":2,"content":"models seem you know pretty good at"},{"from":3997.47,"to":3999.72,"location":2,"content":"understanding ok this part of the"},{"from":3999.72,"to":4002.36,"location":2,"content":"sentence is negative this part of the"},{"from":4002.36,"to":4004.46,"location":2,"content":"sentence is positive and actually when"},{"from":4004.46,"to":4006.5,"location":2,"content":"you stick the two halves together the"},{"from":4006.5,"to":4008.54,"location":2,"content":"end result is the sentence there's"},{"from":4008.54,"to":4010.53,"location":2,"content":"positive and 
meaning"},{"from":4010.53,"to":4013.65,"location":2,"content":"but focusing in a little bit more what"},{"from":4013.65,"to":4016.92,"location":2,"content":"seems like it's especially good was for"},{"from":4016.92,"to":4019.14,"location":2,"content":"the first time this actually did seem"},{"from":4019.14,"to":4022.86,"location":2,"content":"like it could do a better job of working"},{"from":4022.86,"to":4025.53,"location":2,"content":"out sort of what happens when you do"},{"from":4025.53,"to":4028.17,"location":2,"content":"things like negation so here we have"},{"from":4028.17,"to":4030.51,"location":2,"content":"it's just incredibly dull and it's"},{"from":4030.51,"to":4032.55,"location":2,"content":"definitely not dull so if it's"},{"from":4032.55,"to":4034.86,"location":2,"content":"definitely not dull that actually means"},{"from":4034.86,"to":4037.86,"location":2,"content":"it's good right can we work out that the"},{"from":4037.86,"to":4041.24,"location":2,"content":"meaning of it's definitely not dull and"},{"from":4041.24,"to":4046.8,"location":2,"content":"so these this is sort of showing a sort"},{"from":4046.8,"to":4050.94,"location":2,"content":"of what happens when you have a negative"},{"from":4050.94,"to":4053.61,"location":2,"content":"and negative sentence that's further"},{"from":4053.61,"to":4061.41,"location":2,"content":"negated so if you go from so if you sort"},{"from":4061.41,"to":4064.77,"location":2,"content":"of do and the in X and the Gatien of a"},{"from":4064.77,"to":4067.53,"location":2,"content":"negative things should become moderately"},{"from":4067.53,"to":4071.25,"location":2,"content":"positive right so that if you have dull"},{"from":4071.25,"to":4074.19,"location":2,"content":"is negative and if you say not dull it"},{"from":4074.19,"to":4076.53,"location":2,"content":"doesn't mean it's fantastic but it means"},{"from":4076.53,"to":4079.2,"location":2,"content":"it's moderately positive and so for"},{"from":4079.2,"to":4082.38,"location":2,"content":"either a kind of naive Bayes model or"},{"from":4082.38,"to":4084.54,"location":2,"content":"our preceding models they weren't"},{"from":4084.54,"to":4086.61,"location":2,"content":"capable of capturing that of sort of"},{"from":4086.61,"to":4090.08,"location":2,"content":"going from dull to not dull your your"},{"from":4090.08,"to":4092.91,"location":2,"content":"meaning computation did not come out any"},{"from":4092.91,"to":4095.49,"location":2,"content":"more positive whereas this sort of neuro"},{"from":4095.49,"to":4098.31,"location":2,"content":"tenser network was capturing the fact"},{"from":4098.31,"to":4100.92,"location":2,"content":"that not dull meant it was reasonably"},{"from":4100.92,"to":4107.31,"location":2,"content":"good so that was progress yeah so I"},{"from":4107.31,"to":4109.83,"location":2,"content":"think that's as much as I'll show you"},{"from":4109.83,"to":4111.84,"location":2,"content":"really now about applying these tree"},{"from":4111.84,"to":4115.68,"location":2,"content":"structured neural networks to natural"},{"from":4115.68,"to":4120.3,"location":2,"content":"language you know I think the summary I"},{"from":4120.3,"to":4123.06,"location":2,"content":"sort of said at the beginning is that I"},{"from":4123.06,"to":4124.35,"location":2,"content":"think you know they're kind of"},{"from":4124.35,"to":4126.51,"location":2,"content":"interesting ideas and linguistic"},{"from":4126.51,"to":4131.16,"location":2,"content":"connections here I mean for 
various"},{"from":4131.16,"to":4132.18,"location":2,"content":"reasons"},{"from":4132.18,"to":4136.44,"location":2,"content":"these ideas haven't been pursued a ton"},{"from":4136.44,"to":4138.45,"location":2,"content":"in recent years of natural language"},{"from":4138.45,"to":4141.69,"location":2,"content":"processing you know one is in all"},{"from":4141.69,"to":4144.19,"location":2,"content":"honesty people have found that"},{"from":4144.19,"to":4147.61,"location":2,"content":"once you have high dimensional vectors"},{"from":4147.61,"to":4149.95,"location":2,"content":"in things like the kind of sequence"},{"from":4149.95,"to":4151.81,"location":2,"content":"models that we've looked at whether it's"},{"from":4151.81,"to":4153.67,"location":2,"content":"meaning things like the sort of lsdm"},{"from":4153.67,"to":4156.4,"location":2,"content":"models or any of the more recent"},{"from":4156.4,"to":4158.92,"location":2,"content":"contextual language models those work"},{"from":4158.92,"to":4162.07,"location":2,"content":"incredibly well and it's not it's not"},{"from":4162.07,"to":4163.72,"location":2,"content":"clear that over all these models work"},{"from":4163.72,"to":4166.6,"location":2,"content":"better the second reason is sort of a"},{"from":4166.6,"to":4171.91,"location":2,"content":"computational reason which is GPUs work"},{"from":4171.91,"to":4174.13,"location":2,"content":"great when you're doing uniform"},{"from":4174.13,"to":4176.74,"location":2,"content":"computation and the beauty of having"},{"from":4176.74,"to":4178.66,"location":2,"content":"something like a sequence model is that"},{"from":4178.66,"to":4182.02,"location":2,"content":"there's there's just one determinate"},{"from":4182.02,"to":4184,"location":2,"content":"computation you're doing along the"},{"from":4184,"to":4186.55,"location":2,"content":"sequence or in the convolutional neural"},{"from":4186.55,"to":4188.91,"location":2,"content":"network there's one determinate"},{"from":4188.91,"to":4191.83,"location":2,"content":"computation you're doing up through your"},{"from":4191.83,"to":4194.08,"location":2,"content":"convolutional layers and therefore"},{"from":4194.08,"to":4196.96,"location":2,"content":"things can be represented and computed"},{"from":4196.96,"to":4199.72,"location":2,"content":"efficiently on a GPU the huge problem"},{"from":4199.72,"to":4201.88,"location":2,"content":"with these kind of models was what"},{"from":4201.88,"to":4203.59,"location":2,"content":"computations you were going to do"},{"from":4203.59,"to":4205.75,"location":2,"content":"depended on which structure you're"},{"from":4205.75,"to":4208.03,"location":2,"content":"assigning to the sentence and every"},{"from":4208.03,"to":4210.04,"location":2,"content":"sentence was going to have a different"},{"from":4210.04,"to":4212.35,"location":2,"content":"structure and so therefore there was no"},{"from":4212.35,"to":4214.75,"location":2,"content":"way to batch the computations over a"},{"from":4214.75,"to":4217.27,"location":2,"content":"group of sentences and have the same"},{"from":4217.27,"to":4219.19,"location":2,"content":"computations being done for different"},{"from":4219.19,"to":4221.74,"location":2,"content":"sentences which sort of undermined the"},{"from":4221.74,"to":4223.57,"location":2,"content":"ability to sort of efficiently build"},{"from":4223.57,"to":4227.32,"location":2,"content":"these models in the large the thing I"},{"from":4227.32,"to":4229.51,"location":2,"content":"thought I'd just sort of say a 
moment"},{"from":4229.51,"to":4232.99,"location":2,"content":"about at the end the funny thing is that"},{"from":4232.99,"to":4235.36,"location":2,"content":"although these haven't been used much"},{"from":4235.36,"to":4238.98,"location":2,"content":"for language in the last few years that"},{"from":4238.98,"to":4242.2,"location":2,"content":"they've actually had some use and found"},{"from":4242.2,"to":4244.45,"location":2,"content":"different applications in different"},{"from":4244.45,"to":4247.39,"location":2,"content":"places which is just sort of seen kind"},{"from":4247.39,"to":4249.91,"location":2,"content":"of cute so this is actually an"},{"from":4249.91,"to":4253.96,"location":2,"content":"application from physics and I think I'm"},{"from":4253.96,"to":4256.21,"location":2,"content":"going to just have to read this since I"},{"from":4256.21,"to":4258.45,"location":2,"content":"have no idea what half the words mean"},{"from":4258.45,"to":4262.06,"location":2,"content":"but what it says is by far the most"},{"from":4262.06,"to":4264.25,"location":2,"content":"common structures seen in collisions at"},{"from":4264.25,"to":4266.77,"location":2,"content":"the Large Hadron Collider a collimated"},{"from":4266.77,"to":4269.32,"location":2,"content":"sprays of energetic hydrants referred to"},{"from":4269.32,"to":4271.78,"location":2,"content":"as Jets these Jets are produced from the"},{"from":4271.78,"to":4273.85,"location":2,"content":"fragmentation and hadron ization of"},{"from":4273.85,"to":4276.52,"location":2,"content":"quarks and gluons as described by"},{"from":4276.52,"to":4277.6,"location":2,"content":"quantum chromo"},{"from":4277.6,"to":4281.59,"location":2,"content":"dynamics anyone knows what that means I"},{"from":4281.59,"to":4284.35,"location":2,"content":"hope you're following along here one"},{"from":4284.35,"to":4286.42,"location":2,"content":"compelling physics challenge is to"},{"from":4286.42,"to":4288.79,"location":2,"content":"search for highly boosted standard model"},{"from":4288.79,"to":4291.96,"location":2,"content":"particles decaying hydraulically"},{"from":4291.96,"to":4294.01,"location":2,"content":"unfortunately there's a large background"},{"from":4294.01,"to":4297.82,"location":2,"content":"from jets produced by more mundane QCD"},{"from":4297.82,"to":4299.71,"location":2,"content":"that's quantum chromodynamics that"},{"from":4299.71,"to":4302.62,"location":2,"content":"processes in this work we propose"},{"from":4302.62,"to":4304.89,"location":2,"content":"instead a solution for jet"},{"from":4304.89,"to":4307.36,"location":2,"content":"classification based on an analogy"},{"from":4307.36,"to":4310.18,"location":2,"content":"between quantum chromodynamics and"},{"from":4310.18,"to":4312.88,"location":2,"content":"natural languages as inspired by several"},{"from":4312.88,"to":4316.11,"location":2,"content":"works from natural language processing"},{"from":4316.11,"to":4319.03,"location":2,"content":"much like a sentence is composed of"},{"from":4319.03,"to":4321.43,"location":2,"content":"words for syntactic structure organized"},{"from":4321.43,"to":4323.77,"location":2,"content":"as the past tree and Jettas are also"},{"from":4323.77,"to":4326.62,"location":2,"content":"composed of four momenta following a"},{"from":4326.62,"to":4328.9,"location":2,"content":"structure dictated by key CD and"},{"from":4328.9,"to":4331.03,"location":2,"content":"organized by the clustering history of 
a"},{"from":4331.03,"to":4334.66,"location":2,"content":"sequential combination jet algorithm so"},{"from":4334.66,"to":4337.45,"location":2,"content":"anyway um yeah with these jets you see"},{"from":4337.45,"to":4339.88,"location":2,"content":"they've got a tree structure over them"},{"from":4339.88,"to":4342.73,"location":2,"content":"and they're using a tree recursive"},{"from":4342.73,"to":4346.78,"location":2,"content":"neural network to model it well that's a"},{"from":4346.78,"to":4348.76,"location":2,"content":"little bit far afield but to show you"},{"from":4348.76,"to":4353.05,"location":2,"content":"just one more example that another place"},{"from":4353.05,"to":4354.76,"location":2,"content":"where these models have actually been"},{"from":4354.76,"to":4358.03,"location":2,"content":"quite useful is for doing things in"},{"from":4358.03,"to":4360.91,"location":2,"content":"programming languages and I think in"},{"from":4360.91,"to":4363.79,"location":2,"content":"part this is because the application is"},{"from":4363.79,"to":4366.79,"location":2,"content":"easier in programming languages so"},{"from":4366.79,"to":4369.22,"location":2,"content":"unlike a natural language where we have"},{"from":4369.22,"to":4371.23,"location":2,"content":"this uncertainty as to what is the"},{"from":4371.23,"to":4373.6,"location":2,"content":"correct parse tree because there's a lot"},{"from":4373.6,"to":4376.02,"location":2,"content":"of ambiguity in natural language in"},{"from":4376.02,"to":4379.33,"location":2,"content":"programming languages the parse trees"},{"from":4379.33,"to":4381.67,"location":2,"content":"are actually pretty determinate so a"},{"from":4381.67,"to":4384.43,"location":2,"content":"group of people at Berkeley dorm song"},{"from":4384.43,"to":4387.49,"location":2,"content":"and her students have worked on doing"},{"from":4387.49,"to":4390.16,"location":2,"content":"programming language translation by"},{"from":4390.16,"to":4392.38,"location":2,"content":"building tree recursive neural network"},{"from":4392.38,"to":4396.31,"location":2,"content":"encoder decoders so that you're building"},{"from":4396.31,"to":4398.04,"location":2,"content":"up a tree structured neural network"},{"from":4398.04,"to":4401.44,"location":2,"content":"representation of a program in one"},{"from":4401.44,"to":4403.66,"location":2,"content":"language this is a CoffeeScript program"},{"from":4403.66,"to":4406.12,"location":2,"content":"and then you're wanting to build a tree"},{"from":4406.12,"to":4408.7,"location":2,"content":"a tree model which is then translating"},{"from":4408.7,"to":4410.74,"location":2,"content":"that to a program in a different"},{"from":4410.74,"to":4411.52,"location":2,"content":"language"},{"from":4411.52,"to":4413.95,"location":2,"content":"and they've been able to do that and get"},{"from":4413.95,"to":4417.88,"location":2,"content":"good results I was too lazy to retype"},{"from":4417.88,"to":4420.82,"location":2,"content":"this table so this is probably a bit bit"},{"from":4420.82,"to":4423.46,"location":2,"content":"hard to read but what's contrasting is"},{"from":4423.46,"to":4426.25,"location":2,"content":"for a number of programs this is a sort"},{"from":4426.25,"to":4430.26,"location":2,"content":"of CoffeeScript to JavaScript"},{"from":4430.26,"to":4433.42,"location":2,"content":"translation they're comparing using tree"},{"from":4433.42,"to":4436.39,"location":2,"content":"to tree models and then using 
sequence"},{"from":4436.39,"to":4438.25,"location":2,"content":"to sequence models and then they try"},{"from":4438.25,"to":4441.1,"location":2,"content":"both other combinations sequence to tree"},{"from":4441.1,"to":4444.91,"location":2,"content":"and tree to sequence and what they find"},{"from":4444.91,"to":4447.34,"location":2,"content":"is you can get the best results with the"},{"from":4447.34,"to":4450.13,"location":2,"content":"tree to tree neural network models and"},{"from":4450.13,"to":4452.92,"location":2,"content":"in particular these tree to tree models"},{"from":4452.92,"to":4455.59,"location":2,"content":"are augmented with attention so they"},{"from":4455.59,"to":4457.24,"location":2,"content":"have attention like we talked about the"},{"from":4457.24,"to":4459.64,"location":2,"content":"sequence to sequence models where you're"},{"from":4459.64,"to":4461.26,"location":2,"content":"then being able to do attention back to"},{"from":4461.26,"to":4463.75,"location":2,"content":"nodes in the tree structure which is a"},{"from":4463.75,"to":4466.66,"location":2,"content":"pretty natural way of doing translation"},{"from":4466.66,"to":4469.48,"location":2,"content":"and indeed what these results show is if"},{"from":4469.48,"to":4471.31,"location":2,"content":"you don't have I'll sorry these results"},{"from":4471.31,"to":4473.17,"location":2,"content":"show is if you don't have the attention"},{"from":4473.17,"to":4476.2,"location":2,"content":"operation it doesn't work at all it's"},{"from":4476.2,"to":4479.77,"location":2,"content":"too difficult to get things so done if"},{"from":4479.77,"to":4481,"location":2,"content":"you've just sort of trying to create a"},{"from":4481,"to":4482.98,"location":2,"content":"single tree representation then say"},{"from":4482.98,"to":4485.65,"location":2,"content":"generate that the translation from that"},{"from":4485.65,"to":4487.09,"location":2,"content":"but if you can do it with this sort of"},{"from":4487.09,"to":4489.1,"location":2,"content":"putting attention into the different"},{"from":4489.1,"to":4493.57,"location":2,"content":"nodes that's great you might if you know"},{"from":4493.57,"to":4495.7,"location":2,"content":"about what CoffeeScript is you might"},{"from":4495.7,"to":4498.31,"location":2,"content":"feel like wait that's cheating slightly"},{"from":4498.31,"to":4500.32,"location":2,"content":"because copy script is a bit too similar"},{"from":4500.32,"to":4504.07,"location":2,"content":"to JavaScript but they've also done it"},{"from":4504.07,"to":4506.05,"location":2,"content":"in other languages so this is going"},{"from":4506.05,"to":4510.16,"location":2,"content":"between Java and c-sharp and this is a"},{"from":4510.16,"to":4513.13,"location":2,"content":"sort of a handwritten Java to c-sharp"},{"from":4513.13,"to":4515.05,"location":2,"content":"converter that you can download from"},{"from":4515.05,"to":4517.09,"location":2,"content":"github if you want but it doesn't"},{"from":4517.09,"to":4520,"location":2,"content":"actually work that well and they're able"},{"from":4520,"to":4521.83,"location":2,"content":"to show that they're able to build a far"},{"from":4521.83,"to":4527.14,"location":2,"content":"better Java to c-sharp translator doing"},{"from":4527.14,"to":4530.47,"location":2,"content":"that so that's actually kind of cool and"},{"from":4530.47,"to":4531.73,"location":2,"content":"it's good to know the tree structure"},{"from":4531.73,"to":4533.83,"location":2,"content":"recursive neural networks are 
good for"},{"from":4533.83,"to":4536.74,"location":2,"content":"some things so I'm pleased oh I see work"},{"from":4536.74,"to":4540.28,"location":2,"content":"like this okay I'm just about done but I"},{"from":4540.28,"to":4543.91,"location":2,"content":"thought before finishing I just"},{"from":4543.91,"to":4544.87,"location":2,"content":"mentioned one of"},{"from":4544.87,"to":4547.06,"location":2,"content":"thing which is sort of nothing to do"},{"from":4547.06,"to":4548.25,"location":2,"content":"with natural language processing"},{"from":4548.25,"to":4551.95,"location":2,"content":"precisely but it's about AI but I wanted"},{"from":4551.95,"to":4553.57,"location":2,"content":"to sort of put in a little bit of"},{"from":4553.57,"to":4556.24,"location":2,"content":"advertisement that something that a"},{"from":4556.24,"to":4558.22,"location":2,"content":"number of us have been working on very"},{"from":4558.22,"to":4560.95,"location":2,"content":"hard for the last year or so is"},{"from":4560.95,"to":4564.25,"location":2,"content":"developing a new Stanford Institute for"},{"from":4564.25,"to":4566.2,"location":2,"content":"human centered artificial intelligence"},{"from":4566.2,"to":4568.3,"location":2,"content":"and actually the launch of this"},{"from":4568.3,"to":4570.58,"location":2,"content":"Institute is going to be on Monday of"},{"from":4570.58,"to":4573.07,"location":2,"content":"exam week just when you're maximally"},{"from":4573.07,"to":4575.5,"location":2,"content":"concentrating oh and the things such as"},{"from":4575.5,"to":4579.19,"location":2,"content":"this but our hope is that we can have a"},{"from":4579.19,"to":4582.43,"location":2,"content":"lot of new activity around artificial"},{"from":4582.43,"to":4584.62,"location":2,"content":"intelligence taking a much broader"},{"from":4584.62,"to":4586.86,"location":2,"content":"perspective to artificial intelligence"},{"from":4586.86,"to":4590.05,"location":2,"content":"which is centrally viewing it from the"},{"from":4590.05,"to":4594.84,"location":2,"content":"viewpoint of humans and working out"},{"from":4594.84,"to":4597.22,"location":2,"content":"exploring a large broader range of"},{"from":4597.22,"to":4599.02,"location":2,"content":"issues that embrace a lot of the"},{"from":4599.02,"to":4601.06,"location":2,"content":"interests of the rest of the university"},{"from":4601.06,"to":4602.98,"location":2,"content":"where there's the social sciences and"},{"from":4602.98,"to":4605.2,"location":2,"content":"humanities are also various in"},{"from":4605.2,"to":4607.12,"location":2,"content":"professional schools like the law school"},{"from":4607.12,"to":4609.76,"location":2,"content":"and the business school so to just"},{"from":4609.76,"to":4614.92,"location":2,"content":"quickly say a minute about that that the"},{"from":4614.92,"to":4617.5,"location":2,"content":"the so motivating idea is that sort of"},{"from":4617.5,"to":4620.65,"location":2,"content":"for most of my life sort of AI seem like"},{"from":4620.65,"to":4623.74,"location":2,"content":"a kind of a fun intellectual quest as to"},{"from":4623.74,"to":4625,"location":2,"content":"whether you could write bits of software"},{"from":4625,"to":4628.03,"location":2,"content":"that did anything halfway intelligent"},{"from":4628.03,"to":4629.74,"location":2,"content":"but that's clearly not what's going to"},{"from":4629.74,"to":4632.11,"location":2,"content":"be what's happening for the next 25"},{"from":4632.11,"to":4634.24,"location":2,"content":"years that 
we're now at this point in"},{"from":4634.24,"to":4636.43,"location":2,"content":"which artificial intelligence systems"},{"from":4636.43,"to":4640.06,"location":2,"content":"are being unleashed on society"},{"from":4640.06,"to":4642.58,"location":2,"content":"and well hopefully they do some good"},{"from":4642.58,"to":4644.53,"location":2,"content":"things but as we've increasingly been"},{"from":4644.53,"to":4646.84,"location":2,"content":"seeing there are lots of also lots of"},{"from":4646.84,"to":4648.82,"location":2,"content":"opportunities for them to do bad things"},{"from":4648.82,"to":4650.74,"location":2,"content":"and even if we're not imagining"},{"from":4650.74,"to":4653.23,"location":2,"content":"Terminator scenarios there are just lots"},{"from":4653.23,"to":4655.84,"location":2,"content":"of places where people are using machine"},{"from":4655.84,"to":4658.66,"location":2,"content":"learning and AI algorithms for making"},{"from":4658.66,"to":4660.7,"location":2,"content":"decisions some of the worst ones or"},{"from":4660.7,"to":4662.53,"location":2,"content":"things like sentencing guidelines and"},{"from":4662.53,"to":4664.45,"location":2,"content":"courts where you have very biased"},{"from":4664.45,"to":4666.64,"location":2,"content":"algorithms making bad decisions and"},{"from":4666.64,"to":4668.8,"location":2,"content":"people are only starting to become a lot"},{"from":4668.8,"to":4671.35,"location":2,"content":"more aware of the issues and so"},{"from":4671.35,"to":4672.88,"location":2,"content":"effectively we're wanting to have this"},{"from":4672.88,"to":4675.28,"location":2,"content":"Institutes of embracing a lot of the"},{"from":4675.28,"to":4677.68,"location":2,"content":"work of social scientists ethicists and"},{"from":4677.68,"to":4678.64,"location":2,"content":"other people"},{"from":4678.64,"to":4680.98,"location":2,"content":"to actually explore how to have an AI"},{"from":4680.98,"to":4683.92,"location":2,"content":"that's really improving human lives"},{"from":4683.92,"to":4685.6,"location":2,"content":"rather than having the opposite effect"},{"from":4685.6,"to":4689.16,"location":2,"content":"and so the three themes that we're"},{"from":4689.16,"to":4692.35,"location":2,"content":"mainly emphasizing for this Institute is"},{"from":4692.35,"to":4695.49,"location":2,"content":"the first one in the top left is"},{"from":4695.49,"to":4698.38,"location":2,"content":"developing AI technologies but we're"},{"from":4698.38,"to":4700.36,"location":2,"content":"particularly interested in making"},{"from":4700.36,"to":4702.88,"location":2,"content":"linkages back to human intelligence so"},{"from":4702.88,"to":4705.79,"location":2,"content":"cognitive science and neuroscience that"},{"from":4705.79,"to":4707.59,"location":2,"content":"when a lot of the early formative work"},{"from":4707.59,"to":4711.49,"location":2,"content":"in AI was done including all of the"},{"from":4711.49,"to":4713.44,"location":2,"content":"early work in neural networks like the"},{"from":4713.44,"to":4715.6,"location":2,"content":"development of backpropagation it was"},{"from":4715.6,"to":4717.37,"location":2,"content":"actually largely done in the context of"},{"from":4717.37,"to":4718.96,"location":2,"content":"cognitive science right and that was a"},{"from":4718.96,"to":4721.3,"location":2,"content":"sort of a linkage that tended to get"},{"from":4721.3,"to":4725.29,"location":2,"content":"lost in the 90s and 2000's 
statistical"},{"from":4725.29,"to":4727.45,"location":2,"content":"machine learning emphasis and I think"},{"from":4727.45,"to":4730.41,"location":2,"content":"it'd be good to renew that the top right"},{"from":4730.41,"to":4733.78,"location":2,"content":"there's paying much more attention to"},{"from":4733.78,"to":4736.36,"location":2,"content":"the human and societal impact of AI and"},{"from":4736.36,"to":4738.61,"location":2,"content":"so this is looking at legal issues"},{"from":4738.61,"to":4742.56,"location":2,"content":"economic issues labor forces ethics"},{"from":4742.56,"to":4745.42,"location":2,"content":"great power politics whatever you are"},{"from":4745.42,"to":4748.42,"location":2,"content":"but then down the bottom is something"},{"from":4748.42,"to":4750.31,"location":2,"content":"where it seems like this just kind of"},{"from":4750.31,"to":4752.95,"location":2,"content":"enormous opportunities to do more which"},{"from":4752.95,"to":4756.22,"location":2,"content":"is how can we build technology that"},{"from":4756.22,"to":4759.22,"location":2,"content":"actually augments human lives like to"},{"from":4759.22,"to":4762.21,"location":2,"content":"some extent hero tech we've got"},{"from":4762.21,"to":4765.28,"location":2,"content":"technology with AI augmenting human"},{"from":4765.28,"to":4767.53,"location":2,"content":"lives so all of your cell phones have"},{"from":4767.53,"to":4769.78,"location":2,"content":"speech recognition in them now so you"},{"from":4769.78,"to":4773.02,"location":2,"content":"know that's AI that can augment your"},{"from":4773.02,"to":4775.06,"location":2,"content":"human lives but there's a sense of which"},{"from":4775.06,"to":4778.54,"location":2,"content":"not very much of artificial intelligence"},{"from":4778.54,"to":4780.76,"location":2,"content":"has actually been put into the service"},{"from":4780.76,"to":4783.67,"location":2,"content":"of augmenting human lives like most of"},{"from":4783.67,"to":4785.68,"location":2,"content":"what a cell phone has on it is still"},{"from":4785.68,"to":4787.99,"location":2,"content":"sort of clever and cute stuff done by"},{"from":4787.99,"to":4790.54,"location":2,"content":"HCI people and designers which is very"},{"from":4790.54,"to":4792.52,"location":2,"content":"nice a lot of the time when you're using"},{"from":4792.52,"to":4794.71,"location":2,"content":"your map program or something but we"},{"from":4794.71,"to":4797.71,"location":2,"content":"don't really have much AI inside these"},{"from":4797.71,"to":4799.57,"location":2,"content":"devices helping to make people's lives"},{"from":4799.57,"to":4802.72,"location":2,"content":"better and so we're hoping not only for"},{"from":4802.72,"to":4804.94,"location":2,"content":"individuals but applications like health"},{"from":4804.94,"to":4807.85,"location":2,"content":"care to be doing much more sort of"},{"from":4807.85,"to":4810.1,"location":2,"content":"putting artificial intelligence into"},{"from":4810.1,"to":4812.35,"location":2,"content":"human centered applications"},{"from":4812.35,"to":4814.88,"location":2,"content":"anyway that's my brief advertisement I'm"},{"from":4814.88,"to":4816.29,"location":2,"content":"look out for this while you're not"},{"from":4816.29,"to":4818.27,"location":2,"content":"studying for your exams and I think"},{"from":4818.27,"to":4819.14,"location":2,"content":"there'll be sort of lots of"},{"from":4819.14,"to":4821.75,"location":2,"content":"opportunities for students and others 
to"},{"from":4821.75,"to":4823.34,"location":2,"content":"be getting more involved in this in the"},{"from":4823.34,"to":4826.85,"location":2,"content":"coming months okay thank you very much"},{"from":4826.85,"to":4829.08,"location":2,"content":"and I will see you later"},{"from":4829.08,"to":4833.76,"location":2,"content":"[Applause]"}]} \ No newline at end of file diff --git a/bcc-en/19.bcc b/bcc-en/19.bcc new file mode 100644 index 0000000000000000000000000000000000000000..4fddc338f31835c395eeb15d9e8e81ef06f023ec --- /dev/null +++ b/bcc-en/19.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.72,"to":8.01,"location":2,"content":"okay hi everyone let's get started so"},{"from":8.01,"to":10.36,"location":2,"content":"Chris is traveling this week so he's not"},{"from":10.36,"to":12.43,"location":2,"content":"here but I'm very excited to say that"},{"from":12.43,"to":14.56,"location":2,"content":"today we've got Margaret Mitchell who is"},{"from":14.56,"to":16.96,"location":2,"content":"a senior research scientist at Google AI"},{"from":16.96,"to":19.72,"location":2,"content":"and she's gonna tell us about the latest"},{"from":19.72,"to":22.57,"location":2,"content":"work defining and understanding and"},{"from":22.57,"to":25.18,"location":2,"content":"improving the situation with bias in"},{"from":25.18,"to":28.12,"location":2,"content":"artificial intelligence Margaret has a"},{"from":28.12,"to":29.86,"location":2,"content":"background working in NLP and deep"},{"from":29.86,"to":31.3,"location":2,"content":"learning so I'm really interested to"},{"from":31.3,"to":33.07,"location":2,"content":"hear what she has to say today take it"},{"from":33.07,"to":33.4,"location":2,"content":"away"},{"from":33.4,"to":36.04,"location":2,"content":"great thank you and can you guys hear me"},{"from":36.04,"to":38.14,"location":2,"content":"okay I'm not sure if this mic is exactly"},{"from":38.14,"to":40.66,"location":2,"content":"picking up my bus everything's cool okay"},{"from":40.66,"to":45.34,"location":2,"content":"cool um so this work is the product of a"},{"from":45.34,"to":46.24,"location":2,"content":"ton of different people and"},{"from":46.24,"to":47.74,"location":2,"content":"collaborators that I've tried to put up"},{"from":47.74,"to":50.62,"location":2,"content":"here some students at Stanford also"},{"from":50.62,"to":54.3,"location":2,"content":"Johns Hopkins Google Facebook and"},{"from":54.3,"to":61.17,"location":2,"content":"Microsoft are all represented cool so um"},{"from":61.17,"to":64.06,"location":2,"content":"for those of you who haven't seen this"},{"from":64.06,"to":67.15,"location":2,"content":"set of slides before what do you see"},{"from":67.15,"to":71.26,"location":2,"content":"here just shout it up bananas okay what"},{"from":71.26,"to":72.81,"location":2,"content":"else"},{"from":72.81,"to":94.12,"location":2,"content":"stickers what else bananas with stickers"},{"from":94.12,"to":95.59,"location":2,"content":"on them you can start doing like"},{"from":95.59,"to":97.81,"location":2,"content":"embedded clauses you know bunches of"},{"from":97.81,"to":99.46,"location":2,"content":"bananas with stickers on them on shelves"},{"from":99.46,"to":101.71,"location":2,"content":"in the store to get kind of crazy but we"},{"from":101.71,"to":104.35,"location":2,"content":"don't tend to say yellow bananas right"},{"from":104.35,"to":107.26,"location":2,"content":"so give them something 
like this we"},{"from":107.26,"to":109.63,"location":2,"content":"might say green bananas or we might say"},{"from":109.63,"to":112.9,"location":2,"content":"unripe bananas given an image like this"},{"from":112.9,"to":115.66,"location":2,"content":"we might say ripe bananas or bananas"},{"from":115.66,"to":118.69,"location":2,"content":"with spots on them if you're me you"},{"from":118.69,"to":119.74,"location":2,"content":"might say bananas that are good for"},{"from":119.74,"to":122.98,"location":2,"content":"banana bread and but given an image like"},{"from":122.98,"to":124.66,"location":2,"content":"this or something like this in the real"},{"from":124.66,"to":126.82,"location":2,"content":"world we tend not to mention the"},{"from":126.82,"to":129.13,"location":2,"content":"yellowness and the reason for this is"},{"from":129.13,"to":131.62,"location":2,"content":"because yellow is prototypical for"},{"from":131.62,"to":135.82,"location":2,"content":"bananas so the idea of prototypes stems"},{"from":135.82,"to":137.62,"location":2,"content":"from prototype theory which goes back to"},{"from":137.62,"to":138.46,"location":2,"content":"the early"},{"from":138.46,"to":140.74,"location":2,"content":"coming out of the work of Eleanor Rosch"},{"from":140.74,"to":143.44,"location":2,"content":"and colleagues and it's this idea that"},{"from":143.44,"to":145.83,"location":2,"content":"there are some stored central"},{"from":145.83,"to":149.5,"location":2,"content":"prototypical notions of objects that we"},{"from":149.5,"to":152.35,"location":2,"content":"access as we're operating throughout the"},{"from":152.35,"to":154.6,"location":2,"content":"world there's some disagreement about"},{"from":154.6,"to":157.65,"location":2,"content":"whether these prototypes are actual"},{"from":157.65,"to":160.09,"location":2,"content":"exemplars of objects or something like a"},{"from":160.09,"to":162.52,"location":2,"content":"distribution over what's likely but"},{"from":162.52,"to":164.29,"location":2,"content":"there is general agreement that we do"},{"from":164.29,"to":166.21,"location":2,"content":"have some sort of sense of what's"},{"from":166.21,"to":168.91,"location":2,"content":"typical and what's a typical of the"},{"from":168.91,"to":170.8,"location":2,"content":"things in the world and we tend to"},{"from":170.8,"to":172.66,"location":2,"content":"notice and talk about the things that"},{"from":172.66,"to":179.2,"location":2,"content":"are atypical so this is a riddle that I"},{"from":179.2,"to":181,"location":2,"content":"heard in middle school that worked a"},{"from":181,"to":183.46,"location":2,"content":"little bit more at that time some of you"},{"from":183.46,"to":185.68,"location":2,"content":"might have heard it before a man and his"},{"from":185.68,"to":187.72,"location":2,"content":"son are in a terrible accident and are"},{"from":187.72,"to":189.4,"location":2,"content":"rushed to the hospital in critical care"},{"from":189.4,"to":191.98,"location":2,"content":"the doctor looks at the boy and exclaims"},{"from":191.98,"to":194.44,"location":2,"content":"I can't operate on this boy he's my son"},{"from":194.44,"to":200.68,"location":2,"content":"how could this be - Deb's or more he has"},{"from":200.68,"to":203.65,"location":2,"content":"them on her doctor right otherwise known"},{"from":203.65,"to":206.53,"location":2,"content":"as a female doctor which might be"},{"from":206.53,"to":211.09,"location":2,"content":"contract is contrasted with doctor in 
a"},{"from":211.09,"to":213.07,"location":2,"content":"study they did when they first sort of"},{"from":213.07,"to":215.62,"location":2,"content":"put forward this riddle at Boston"},{"from":215.62,"to":217.18,"location":2,"content":"University they found that the majority"},{"from":217.18,"to":219.1,"location":2,"content":"of test subjects overlooked the"},{"from":219.1,"to":220.87,"location":2,"content":"possibility that the doctor could be a"},{"from":220.87,"to":223.66,"location":2,"content":"she and that included men women and"},{"from":223.66,"to":226.48,"location":2,"content":"self-described feminists so the point is"},{"from":226.48,"to":230.35,"location":2,"content":"that these kinds of ways of talking"},{"from":230.35,"to":232,"location":2,"content":"about things and assumptions that we"},{"from":232,"to":234.67,"location":2,"content":"make aren't necessarily something that"},{"from":234.67,"to":238.6,"location":2,"content":"speaks to negative intent but something"},{"from":238.6,"to":240.48,"location":2,"content":"that speaks to how we actually store"},{"from":240.48,"to":242.79,"location":2,"content":"representations in our minds and how we"},{"from":242.79,"to":245.08,"location":2,"content":"access those representations as we"},{"from":245.08,"to":248.92,"location":2,"content":"interact in the world so this this"},{"from":248.92,"to":250.69,"location":2,"content":"affects what we can learn when we're"},{"from":250.69,"to":254.29,"location":2,"content":"learning from text so this is work from"},{"from":254.29,"to":257.05,"location":2,"content":"2013 where they took a look at what was"},{"from":257.05,"to":258.73,"location":2,"content":"sort of most likely what would you learn"},{"from":258.73,"to":261.3,"location":2,"content":"if you were just learning from raw text"},{"from":261.3,"to":263.47,"location":2,"content":"what were some things that were common"},{"from":263.47,"to":267.34,"location":2,"content":"in the world um and they found that in"},{"from":267.34,"to":269.62,"location":2,"content":"this set up something like murdering was"},{"from":269.62,"to":270.86,"location":2,"content":"ten times"},{"from":270.86,"to":273.35,"location":2,"content":"likely than blinking and the reason for"},{"from":273.35,"to":275.06,"location":2,"content":"this is because people tend not to"},{"from":275.06,"to":277.34,"location":2,"content":"mention these typical things that go"},{"from":277.34,"to":280.07,"location":2,"content":"without saying we don't tend to mention"},{"from":280.07,"to":282.77,"location":2,"content":"things like blinking and breathing but"},{"from":282.77,"to":284.96,"location":2,"content":"we do mention atypical events like"},{"from":284.96,"to":287.21,"location":2,"content":"murder and that affects the kind of"},{"from":287.21,"to":289.46,"location":2,"content":"things a machine can learn from text"},{"from":289.46,"to":291.23,"location":2,"content":"that we put out in the world because"},{"from":291.23,"to":292.61,"location":2,"content":"it's been subject to all of these"},{"from":292.61,"to":294.44,"location":2,"content":"filtering processes that we have as"},{"from":294.44,"to":298.82,"location":2,"content":"humans before we communicate this issue"},{"from":298.82,"to":300.71,"location":2,"content":"in particular is known as human"},{"from":300.71,"to":302.36,"location":2,"content":"reporting bias which is that the"},{"from":302.36,"to":304.25,"location":2,"content":"frequency with which people write 
about"},{"from":304.25,"to":307.07,"location":2,"content":"actions outcomes or properties is not a"},{"from":307.07,"to":309.26,"location":2,"content":"reflection of real-world frequencies or"},{"from":309.26,"to":310.82,"location":2,"content":"the degree to which a property is"},{"from":310.82,"to":312.41,"location":2,"content":"characteristic of a class of individuals"},{"from":312.41,"to":314.6,"location":2,"content":"but says a lot more about how we're"},{"from":314.6,"to":316.22,"location":2,"content":"actually processing the world and what"},{"from":316.22,"to":320.06,"location":2,"content":"we think is remarkable so this affects"},{"from":320.06,"to":322.88,"location":2,"content":"everything a system can learn in a"},{"from":322.88,"to":324.89,"location":2,"content":"typical machine learning paradigm one of"},{"from":324.89,"to":326.48,"location":2,"content":"the first steps is to collect and"},{"from":326.48,"to":330.23,"location":2,"content":"potentially annotate training data from"},{"from":330.23,"to":334.73,"location":2,"content":"there a model can be trained from there"},{"from":334.73,"to":337.49,"location":2,"content":"media can be filtered ranked ranked"},{"from":337.49,"to":340.52,"location":2,"content":"aggregated generated in some way and"},{"from":340.52,"to":342.98,"location":2,"content":"from there people see the output and we"},{"from":342.98,"to":345.11,"location":2,"content":"like to think of this as a relatively"},{"from":345.11,"to":347.66,"location":2,"content":"straightforward pipeline but at the very"},{"from":347.66,"to":350.6,"location":2,"content":"start even before we're collecting with"},{"from":350.6,"to":352.49,"location":2,"content":"the data actually within the data itself"},{"from":352.49,"to":355.07,"location":2,"content":"are a host of different kinds of human"},{"from":355.07,"to":357.53,"location":2,"content":"biases so things like stereotyping"},{"from":357.53,"to":359,"location":2,"content":"things like prejudice things like a"},{"from":359,"to":361.34,"location":2,"content":"racism and that's embedded within the"},{"from":361.34,"to":363.92,"location":2,"content":"data before we collect it then as we"},{"from":363.92,"to":366.53,"location":2,"content":"collect and annotate data further biases"},{"from":366.53,"to":368.45,"location":2,"content":"become introduced so things like"},{"from":368.45,"to":371.68,"location":2,"content":"sampling errors confirmation bias"},{"from":371.68,"to":374.12,"location":2,"content":"in-group bias and out-group bias and"},{"from":374.12,"to":376.04,"location":2,"content":"I'll talk about these a little bit oh"},{"from":376.04,"to":378.02,"location":2,"content":"and I should mention feel free to ask"},{"from":378.02,"to":381.26,"location":2,"content":"questions as I go totally fine to just"},{"from":381.26,"to":385.34,"location":2,"content":"kind of interact throughout so here are"},{"from":385.34,"to":386.89,"location":2,"content":"some of the biases that I think are"},{"from":386.89,"to":389.48,"location":2,"content":"relatively important for work in AI"},{"from":389.48,"to":391.58,"location":2,"content":"machine learning there's hundreds you"},{"from":391.58,"to":393.89,"location":2,"content":"can go into but some of the ones that"},{"from":393.89,"to":395.51,"location":2,"content":"I've sort of become the most aware of"},{"from":395.51,"to":398,"location":2,"content":"working in this space are this set and"},{"from":398,"to":400.4,"location":2,"content":"I'll go through each of these a bit so 
I"},{"from":400.4,"to":401.99,"location":2,"content":"talked about reporting bias earlier"},{"from":401.99,"to":404.18,"location":2,"content":"which is which affects what we can learn"},{"from":404.18,"to":406.74,"location":2,"content":"from"},{"from":406.74,"to":409.36,"location":2,"content":"another example of a kind of bias that"},{"from":409.36,"to":410.59,"location":2,"content":"really affects what we can learn from"},{"from":410.59,"to":414.22,"location":2,"content":"text is selection bias so a lot of times"},{"from":414.22,"to":415.96,"location":2,"content":"that we a lot of times when we get data"},{"from":415.96,"to":418.57,"location":2,"content":"annotated we do something like Amazon's"},{"from":418.57,"to":421.06,"location":2,"content":"Mechanical Turk and the distribution of"},{"from":421.06,"to":423.25,"location":2,"content":"workers across the world is not an even"},{"from":423.25,"to":425.47,"location":2,"content":"sort of uniform distribution it's"},{"from":425.47,"to":428.65,"location":2,"content":"actually concentrated in India the US"},{"from":428.65,"to":430.45,"location":2,"content":"and then some in Europe so this leaves"},{"from":430.45,"to":432.94,"location":2,"content":"out South America this leaves out Africa"},{"from":432.94,"to":435.25,"location":2,"content":"this leaves out a lot of China and that"},{"from":435.25,"to":436.75,"location":2,"content":"affects the kind of things that we'll be"},{"from":436.75,"to":438.34,"location":2,"content":"able to learn about the world when we"},{"from":438.34,"to":443.41,"location":2,"content":"have things annotated another kind of"},{"from":443.41,"to":445.39,"location":2,"content":"bias is out-group homogeneity bias which"},{"from":445.39,"to":447.19,"location":2,"content":"is the tendency to see out group members"},{"from":447.19,"to":449.32,"location":2,"content":"as more alike than in-group members and"},{"from":449.32,"to":451.39,"location":2,"content":"this is going to affect what people are"},{"from":451.39,"to":453.28,"location":2,"content":"able to describe and talk about when"},{"from":453.28,"to":454.63,"location":2,"content":"they're annotating things such as"},{"from":454.63,"to":457.72,"location":2,"content":"emotion so so for example we have these"},{"from":457.72,"to":460.3,"location":2,"content":"two like adorable puppies on the left"},{"from":460.3,"to":461.62,"location":2,"content":"here and they're looking at these four"},{"from":461.62,"to":464.23,"location":2,"content":"cats and these are all different black"},{"from":464.23,"to":465.73,"location":2,"content":"cats very different in different ways"},{"from":465.73,"to":468.07,"location":2,"content":"but the two puppies look at the cats and"},{"from":468.07,"to":470.44,"location":2,"content":"they see four cats basically the same"},{"from":470.44,"to":472.51,"location":2,"content":"and it's kind of trivial to understand"},{"from":472.51,"to":474.34,"location":2,"content":"how that also extends to human cognition"},{"from":474.34,"to":477.79,"location":2,"content":"and how we also process people it's this"},{"from":477.79,"to":480.15,"location":2,"content":"it's the sense we have that the the"},{"from":480.15,"to":482.56,"location":2,"content":"cohort that we're in the people that we"},{"from":482.56,"to":484.54,"location":2,"content":"interact with those are the kinds of"},{"from":484.54,"to":486.73,"location":2,"content":"people that are nuanced and everybody"},{"from":486.73,"to":489.43,"location":2,"content":"else is somehow less nuanced has 
less"},{"from":489.43,"to":491.71,"location":2,"content":"detail to them it's a trick our minds"},{"from":491.71,"to":493.72,"location":2,"content":"play on us in order to help us process"},{"from":493.72,"to":495.94,"location":2,"content":"the world but it affects how we talk"},{"from":495.94,"to":497.53,"location":2,"content":"about it and it affects further how we"},{"from":497.53,"to":502.18,"location":2,"content":"annotate it this leads to stuff like"},{"from":502.18,"to":504.55,"location":2,"content":"bias data representations so it's"},{"from":504.55,"to":506.08,"location":2,"content":"possible that you have an appropriate"},{"from":506.08,"to":508.63,"location":2,"content":"amount of data for every possible human"},{"from":508.63,"to":511.42,"location":2,"content":"group you can think of in your data but"},{"from":511.42,"to":512.86,"location":2,"content":"it might be the case that some groups"},{"from":512.86,"to":514.54,"location":2,"content":"are represented less positively than"},{"from":514.54,"to":516.49,"location":2,"content":"others and if we have time I'll go into"},{"from":516.49,"to":521.29,"location":2,"content":"a long a longer example of that it also"},{"from":521.29,"to":523.47,"location":2,"content":"leads to things like biased labels so"},{"from":523.47,"to":526.06,"location":2,"content":"this is a issue that came up when we"},{"from":526.06,"to":527.62,"location":2,"content":"were getting some annotations for"},{"from":527.62,"to":530.14,"location":2,"content":"inclusive images competition asking"},{"from":530.14,"to":532.57,"location":2,"content":"people to annotate things like bride and"},{"from":532.57,"to":535.45,"location":2,"content":"wedding and groom and we found that"},{"from":535.45,"to":537.01,"location":2,"content":"given three different kinds of bride"},{"from":537.01,"to":537.48,"location":2,"content":"wedding"},{"from":537.48,"to":541.17,"location":2,"content":"room images ones that were more Western"},{"from":541.17,"to":544.95,"location":2,"content":"European American got the appropriate"},{"from":544.95,"to":547.38,"location":2,"content":"labels and ones that weren't just got"},{"from":547.38,"to":549.63,"location":2,"content":"sort of more generic person kinds of"},{"from":549.63,"to":552.75,"location":2,"content":"labels not able to actually tease out"},{"from":552.75,"to":554.25,"location":2,"content":"what's actually happening in these"},{"from":554.25,"to":560.25,"location":2,"content":"images companies issue our biases in"},{"from":560.25,"to":562.59,"location":2,"content":"interpretation when the model outputs"},{"from":562.59,"to":566.55,"location":2,"content":"its decisions so one one issue is"},{"from":566.55,"to":568.44,"location":2,"content":"confirmation bias which is the tendency"},{"from":568.44,"to":571.17,"location":2,"content":"to search for interpret favor recall"},{"from":571.17,"to":572.67,"location":2,"content":"information in a way that confirms"},{"from":572.67,"to":574.83,"location":2,"content":"pre-existing beliefs and so a lot of"},{"from":574.83,"to":577.86,"location":2,"content":"times when we build and to end systems"},{"from":577.86,"to":580.32,"location":2,"content":"and try and test our hypotheses were"},{"from":580.32,"to":582.54,"location":2,"content":"kind of just testing it towards things"},{"from":582.54,"to":584.79,"location":2,"content":"that we want to be true and analyzing"},{"from":584.79,"to":586.44,"location":2,"content":"the results in a way that will 
help"},{"from":586.44,"to":590.34,"location":2,"content":"confirm what we want to be true over"},{"from":590.34,"to":592.14,"location":2,"content":"generalization which is coming to a"},{"from":592.14,"to":593.67,"location":2,"content":"conclusion based on information that's"},{"from":593.67,"to":596.37,"location":2,"content":"too general or not specific enough this"},{"from":596.37,"to":597.96,"location":2,"content":"is an issue that happens a lot of times"},{"from":597.96,"to":600.69,"location":2,"content":"in the analysis of deep learning model"},{"from":600.69,"to":603.51,"location":2,"content":"results where it's assumed that there's"},{"from":603.51,"to":605.94,"location":2,"content":"there's some kind of general conclusion"},{"from":605.94,"to":607.62,"location":2,"content":"that can be taken away when really it's"},{"from":607.62,"to":609.84,"location":2,"content":"actually just an effect of really skewed"},{"from":609.84,"to":612.57,"location":2,"content":"data this is also closely related to"},{"from":612.57,"to":614.73,"location":2,"content":"overfitting which is kind of the machine"},{"from":614.73,"to":616.59,"location":2,"content":"learning version of over generalization"},{"from":616.59,"to":618.72,"location":2,"content":"which is where you're still making"},{"from":618.72,"to":620.55,"location":2,"content":"predictions and outcomes but it's based"},{"from":620.55,"to":624.09,"location":2,"content":"on a small set of possible features so"},{"from":624.09,"to":626.34,"location":2,"content":"it's not actually capturing the space of"},{"from":626.34,"to":629.22,"location":2,"content":"the correct features for the outcome the"},{"from":629.22,"to":632.75,"location":2,"content":"desired output prediction correctly"},{"from":632.75,"to":635.46,"location":2,"content":"there's also correlation fallacy which"},{"from":635.46,"to":637.14,"location":2,"content":"is confusing correlation with causation"},{"from":637.14,"to":639.75,"location":2,"content":"and this happens a lot again in talking"},{"from":639.75,"to":641.43,"location":2,"content":"about what machine learning models are"},{"from":641.43,"to":642.9,"location":2,"content":"learning and deep learning models are"},{"from":642.9,"to":645,"location":2,"content":"learning in particular where just"},{"from":645,"to":647.46,"location":2,"content":"because things happen together doesn't"},{"from":647.46,"to":649.41,"location":2,"content":"mean that one is causing the other but"},{"from":649.41,"to":651.72,"location":2,"content":"models don't tell you anything deep"},{"from":651.72,"to":653.13,"location":2,"content":"learning models directly don't tell you"},{"from":653.13,"to":655.11,"location":2,"content":"anything about the causal relations and"},{"from":655.11,"to":657.15,"location":2,"content":"so it's easy to think that some output"},{"from":657.15,"to":659.1,"location":2,"content":"that is predicted based on a correlation"},{"from":659.1,"to":661.08,"location":2,"content":"is actually something that's causal and"},{"from":661.08,"to":662.91,"location":2,"content":"I'll talk about some examples of this"},{"from":662.91,"to":663.39,"location":2,"content":"too"},{"from":663.39,"to":667.23,"location":2,"content":"a further issue is automation bias and"},{"from":667.23,"to":669.15,"location":2,"content":"this really affects the machine learning"},{"from":669.15,"to":671.1,"location":2,"content":"models we put out there in the world"},{"from":671.1,"to":673.59,"location":2,"content":"get used by people in systems 
like"},{"from":673.59,"to":676.38,"location":2,"content":"justice systems so that's the tendency"},{"from":676.38,"to":680.4,"location":2,"content":"to favor the suggestions of automatic"},{"from":680.4,"to":682.41,"location":2,"content":"predictions of models that output"},{"from":682.41,"to":686.94,"location":2,"content":"predictions over the over the different"},{"from":686.94,"to":689.31,"location":2,"content":"kinds of suggestions of another human"},{"from":689.31,"to":691.47,"location":2,"content":"and this happens even in the face of"},{"from":691.47,"to":694.2,"location":2,"content":"contradictory evidence so if a system is"},{"from":694.2,"to":696.66,"location":2,"content":"telling you you know this this is the"},{"from":696.66,"to":699.81,"location":2,"content":"score or this is the risk of this"},{"from":699.81,"to":701.76,"location":2,"content":"individual then we're more likely to"},{"from":701.76,"to":703.95,"location":2,"content":"think it's true because it came out of a"},{"from":703.95,"to":706.86,"location":2,"content":"mathematical system and we automatically"},{"from":706.86,"to":708.15,"location":2,"content":"sort of see this as something more"},{"from":708.15,"to":709.98,"location":2,"content":"objective something more mathematical"},{"from":709.98,"to":711.87,"location":2,"content":"that something's going to be more true"},{"from":711.87,"to":714.42,"location":2,"content":"than human some somehow and that's"},{"from":714.42,"to":718.41,"location":2,"content":"automation bias so rather than this kind"},{"from":718.41,"to":720.93,"location":2,"content":"of clean straightforward pipeline that"},{"from":720.93,"to":723.03,"location":2,"content":"we have in machine learning and we have"},{"from":723.03,"to":724.68,"location":2,"content":"human bias coming in at the very start"},{"from":724.68,"to":728.43,"location":2,"content":"in the data and then human bias coming"},{"from":728.43,"to":730.5,"location":2,"content":"in in data collection annotation and"},{"from":730.5,"to":732.3,"location":2,"content":"then further getting propagated through"},{"from":732.3,"to":734.85,"location":2,"content":"the system as we train on that data as"},{"from":734.85,"to":737.01,"location":2,"content":"we start putting outputs based on that"},{"from":737.01,"to":739.5,"location":2,"content":"data as people act on that data and this"},{"from":739.5,"to":742.62,"location":2,"content":"creates a feedback loop where the kinds"},{"from":742.62,"to":744.57,"location":2,"content":"of things that we output for people to"},{"from":744.57,"to":748.29,"location":2,"content":"act on are then are then then serves as"},{"from":748.29,"to":750.69,"location":2,"content":"further training data for input for new"},{"from":750.69,"to":753.12,"location":2,"content":"system so you end up amplifying even"},{"from":753.12,"to":754.71,"location":2,"content":"further these different kinds of"},{"from":754.71,"to":758.16,"location":2,"content":"implicit biases this is known as a bias"},{"from":758.16,"to":760.71,"location":2,"content":"Network effect or bias Laundering I like"},{"from":760.71,"to":765.12,"location":2,"content":"to call it and so the message is that"},{"from":765.12,"to":767.49,"location":2,"content":"human data perpetuates human biases and"},{"from":767.49,"to":769.38,"location":2,"content":"then as as machine learning or deep"},{"from":769.38,"to":771.48,"location":2,"content":"learning learns from human data the"},{"from":771.48,"to":775.08,"location":2,"content":"result is a bias network effect 
so I"},{"from":775.08,"to":777.51,"location":2,"content":"want to steer clear the idea that if I"},{"from":777.51,"to":780.06,"location":2,"content":"say bias or if someone says bias that"},{"from":780.06,"to":782.01,"location":2,"content":"equals bad it's a little bit more"},{"from":782.01,"to":784.68,"location":2,"content":"nuanced than that so there are all kinds"},{"from":784.68,"to":786.42,"location":2,"content":"of things that people mean when they're"},{"from":786.42,"to":788.91,"location":2,"content":"talking about bias and even the same"},{"from":788.91,"to":790.95,"location":2,"content":"bias can be good in some situations and"},{"from":790.95,"to":793.38,"location":2,"content":"bad in some situations so bias in"},{"from":793.38,"to":795.66,"location":2,"content":"statistics on ml we could we talked"},{"from":795.66,"to":797.37,"location":2,"content":"about the bias of an estimator which is"},{"from":797.37,"to":798.78,"location":2,"content":"the difference between the predictions"},{"from":798.78,"to":800.67,"location":2,"content":"and the and the truth for ground truth"},{"from":800.67,"to":803.32,"location":2,"content":"we talked about the bias term in linear"},{"from":803.32,"to":806.71,"location":2,"content":"rushon we also have cognitive biases and"},{"from":806.71,"to":808.27,"location":2,"content":"I talked about that in the beginning and"},{"from":808.27,"to":810.61,"location":2,"content":"not all of those are negative or or have"},{"from":810.61,"to":813.52,"location":2,"content":"to be or have to be seen as negative so"},{"from":813.52,"to":816.16,"location":2,"content":"optimism is another kind of bias that we"},{"from":816.16,"to":818.05,"location":2,"content":"can have that affects our worldview and"},{"from":818.05,"to":820.18,"location":2,"content":"the way we sort of process things and"},{"from":820.18,"to":822.01,"location":2,"content":"even things like recency bias and"},{"from":822.01,"to":824.29,"location":2,"content":"confirmation bias are just ways that our"},{"from":824.29,"to":827.92,"location":2,"content":"minds can like handle the combinatorial"},{"from":827.92,"to":829.6,"location":2,"content":"explosion of all the different things"},{"from":829.6,"to":831.49,"location":2,"content":"that can be true in the world and put it"},{"from":831.49,"to":832.93,"location":2,"content":"down to something tractable that we can"},{"from":832.93,"to":834.55,"location":2,"content":"sort of operate with in the real world"},{"from":834.55,"to":838.33,"location":2,"content":"and so algorithmic bias is what a lot of"},{"from":838.33,"to":840.25,"location":2,"content":"people mean and headlines and whatnot"},{"from":840.25,"to":842.17,"location":2,"content":"when they're talking about bias which is"},{"from":842.17,"to":845.11,"location":2,"content":"more about unjust unfair or prejudicial"},{"from":845.11,"to":847.24,"location":2,"content":"treatment of people that's an output of"},{"from":847.24,"to":850.24,"location":2,"content":"an automated decision system and the"},{"from":850.24,"to":853.75,"location":2,"content":"focus here is really on unjust unfair or"},{"from":853.75,"to":855.85,"location":2,"content":"prejudicial treatment of people so a lot"},{"from":855.85,"to":857.32,"location":2,"content":"of the work in this space right now is"},{"from":857.32,"to":859.57,"location":2,"content":"focusing on trying to understand what"},{"from":859.57,"to":861.67,"location":2,"content":"does it mean to be unjust from 
an"},{"from":861.67,"to":864.1,"location":2,"content":"algorithm what does it mean to be unfair"},{"from":864.1,"to":866.92,"location":2,"content":"from an algorithm and how can we handle"},{"from":866.92,"to":868.36,"location":2,"content":"this how can we sort of mitigate these"},{"from":868.36,"to":870.4,"location":2,"content":"issues in order to be able to keep"},{"from":870.4,"to":872.05,"location":2,"content":"developing technology that's useful for"},{"from":872.05,"to":876.78,"location":2,"content":"people without worsening social divides"},{"from":876.78,"to":879.22,"location":2,"content":"and I thought the Guardian put it really"},{"from":879.22,"to":882.16,"location":2,"content":"well a few years ago they said although"},{"from":882.16,"to":883.93,"location":2,"content":"neural networks might be said to write"},{"from":883.93,"to":886.33,"location":2,"content":"their own programs they do so towards"},{"from":886.33,"to":888.79,"location":2,"content":"goals set by humans using data collected"},{"from":888.79,"to":891.07,"location":2,"content":"for human purposes if the data is skewed"},{"from":891.07,"to":893.17,"location":2,"content":"even by accident the computers will"},{"from":893.17,"to":895.93,"location":2,"content":"amplify injustice and it really keyed in"},{"from":895.93,"to":899.02,"location":2,"content":"on this amplify and justice idea and"},{"from":899.02,"to":902.62,"location":2,"content":"let's talk about what that can mean so"},{"from":902.62,"to":904.57,"location":2,"content":"one of the avenues of deep learning"},{"from":904.57,"to":906.55,"location":2,"content":"research that's taken off in the past"},{"from":906.55,"to":908.23,"location":2,"content":"few years is predicting criminal"},{"from":908.23,"to":912.37,"location":2,"content":"behavior so how many of you are familiar"},{"from":912.37,"to":917.29,"location":2,"content":"with predictive policing okay like half"},{"from":917.29,"to":918.43,"location":2,"content":"of the class okay"},{"from":918.43,"to":921.85,"location":2,"content":"so in predictive policing algorithms are"},{"from":921.85,"to":925.09,"location":2,"content":"predict areas to deploy officers where"},{"from":925.09,"to":927.76,"location":2,"content":"crime is considered to be likely to"},{"from":927.76,"to":932.74,"location":2,"content":"occur but the data that the the models"},{"from":932.74,"to":935.41,"location":2,"content":"are trained off of is based on where"},{"from":935.41,"to":936.2,"location":2,"content":"police off"},{"from":936.2,"to":938.69,"location":2,"content":"SURS have already gone and made arrests"},{"from":938.69,"to":941.27,"location":2,"content":"so the systems are simply learning the"},{"from":941.27,"to":943.4,"location":2,"content":"patterns of bias that humans have in"},{"from":943.4,"to":945.35,"location":2,"content":"where they go and where they are trying"},{"from":945.35,"to":948.92,"location":2,"content":"to decide to defer to find crime and"},{"from":948.92,"to":951.32,"location":2,"content":"then reflecting them back so because the"},{"from":951.32,"to":953.69,"location":2,"content":"system hones in on some of the top spots"},{"from":953.69,"to":956.6,"location":2,"content":"where people have been arrested notice"},{"from":956.6,"to":958.22,"location":2,"content":"that's not the same of nuts the same"},{"from":958.22,"to":959.36,"location":2,"content":"thing as where crimes have been"},{"from":959.36,"to":961.61,"location":2,"content":"committed right it's where arrests 
have"},{"from":961.61,"to":964.52,"location":2,"content":"been made it means that the other areas"},{"from":964.52,"to":966.17,"location":2,"content":"that might be explored for crime don't"},{"from":966.17,"to":968,"location":2,"content":"get explored at all that worsens the"},{"from":968,"to":971.54,"location":2,"content":"situation some neighborhoods get really"},{"from":971.54,"to":974.06,"location":2,"content":"acutely focused attention on them and"},{"from":974.06,"to":975.86,"location":2,"content":"that heightens the chances of serious"},{"from":975.86,"to":978.05,"location":2,"content":"repercussions for even minor infractions"},{"from":978.05,"to":980.57,"location":2,"content":"that means arrests and that means a"},{"from":980.57,"to":982.4,"location":2,"content":"feedback loop of data that you will get"},{"from":982.4,"to":986.86,"location":2,"content":"an arrest in this place if you go there"},{"from":986.86,"to":990.65,"location":2,"content":"another sort of related issue in this"},{"from":990.65,"to":994.43,"location":2,"content":"space is predictive sentencing so there"},{"from":994.43,"to":996.02,"location":2,"content":"was a really nice article that came out"},{"from":996.02,"to":998.18,"location":2,"content":"from Pro Publica a few years ago"},{"from":998.18,"to":1001.09,"location":2,"content":"discussing this but when most defendants"},{"from":1001.09,"to":1002.68,"location":2,"content":"are booked in jail they respond to a"},{"from":1002.68,"to":1005.26,"location":2,"content":"questionnaire called compass and their"},{"from":1005.26,"to":1007.06,"location":2,"content":"answers are fed into this software"},{"from":1007.06,"to":1009.25,"location":2,"content":"system that generates scores that"},{"from":1009.25,"to":1011.2,"location":2,"content":"correspond to the risk of recidivism"},{"from":1011.2,"to":1015.09,"location":2,"content":"that's the risk of making a crime again"},{"from":1015.09,"to":1017.32,"location":2,"content":"and the questions are used to gather"},{"from":1017.32,"to":1019.26,"location":2,"content":"data on the defendants socioeconomic"},{"from":1019.26,"to":1022.42,"location":2,"content":"status family background neighborhood"},{"from":1022.42,"to":1024.55,"location":2,"content":"crime employment status and other"},{"from":1024.55,"to":1025.75,"location":2,"content":"factors in order to reach some"},{"from":1025.75,"to":1029.14,"location":2,"content":"predictive prediction of an individual's"},{"from":1029.14,"to":1033.61,"location":2,"content":"crime or criminal risk but once ends up"},{"from":1033.61,"to":1035.47,"location":2,"content":"happening is that it ends up focusing on"},{"from":1035.47,"to":1038.77,"location":2,"content":"the key bias issues that humans have and"},{"from":1038.77,"to":1041.35,"location":2,"content":"propagating it back with something that"},{"from":1041.35,"to":1044.05,"location":2,"content":"looks like an objective score so you're"},{"from":1044.05,"to":1047.17,"location":2,"content":"a lot more likely to be convicted of a"},{"from":1047.17,"to":1049.42,"location":2,"content":"crime if you're black than if you're"},{"from":1049.42,"to":1051.28,"location":2,"content":"white even if you've made the exact same"},{"from":1051.28,"to":1053.77,"location":2,"content":"crime and the system will pick up on"},{"from":1053.77,"to":1056.14,"location":2,"content":"this and will reflect this back to say"},{"from":1056.14,"to":1057.43,"location":2,"content":"that people who are black are 
more"},{"from":1057.43,"to":1059.38,"location":2,"content":"likely to have received like recidivism"},{"from":1059.38,"to":1061.63,"location":2,"content":"more likely to convict him to make a"},{"from":1061.63,"to":1066.34,"location":2,"content":"crime again so this is an example of"},{"from":1066.34,"to":1068.8,"location":2,"content":"automation bias preferring the output of"},{"from":1068.8,"to":1069.95,"location":2,"content":"a system"},{"from":1069.95,"to":1072.58,"location":2,"content":"in the face of overgeneralization"},{"from":1072.58,"to":1075.58,"location":2,"content":"feedback loops and correlation fallacy"},{"from":1075.58,"to":1077.57,"location":2,"content":"confusing things that are occurring"},{"from":1077.57,"to":1083.66,"location":2,"content":"together as being somehow causal there's"},{"from":1083.66,"to":1086.53,"location":2,"content":"another sort of area of research and"},{"from":1086.53,"to":1088.76,"location":2,"content":"startups looking at predicting"},{"from":1088.76,"to":1090.77,"location":2,"content":"criminality in particular some things"},{"from":1090.77,"to":1092.93,"location":2,"content":"like face images so there's a company"},{"from":1092.93,"to":1095.06,"location":2,"content":"out there called face ception they're"},{"from":1095.06,"to":1097.46,"location":2,"content":"based in Israel and they claim to be"},{"from":1097.46,"to":1101.93,"location":2,"content":"able to use individual images with"},{"from":1101.93,"to":1103.16,"location":2,"content":"computer vision and machine learning"},{"from":1103.16,"to":1105.26,"location":2,"content":"technology for profiling people and"},{"from":1105.26,"to":1107.39,"location":2,"content":"revealing their personality based only"},{"from":1107.39,"to":1110.96,"location":2,"content":"on their facial image recognizing things"},{"from":1110.96,"to":1113.93,"location":2,"content":"like high IQ white-collar offender had a"},{"from":1113.93,"to":1116.48,"location":2,"content":"file and terrorists and their main"},{"from":1116.48,"to":1118.73,"location":2,"content":"clients are homeland security lots of"},{"from":1118.73,"to":1120.95,"location":2,"content":"other lots of other countries dealing"},{"from":1120.95,"to":1122.42,"location":2,"content":"with sort of public safety issues"},{"from":1122.42,"to":1124.88,"location":2,"content":"they've not published any details about"},{"from":1124.88,"to":1126.83,"location":2,"content":"their methods their sources of training"},{"from":1126.83,"to":1129.32,"location":2,"content":"data or their quantitative results we"},{"from":1129.32,"to":1130.91,"location":2,"content":"know that in light of automation bias"},{"from":1130.91,"to":1132.89,"location":2,"content":"people will tend to think it just works"},{"from":1132.89,"to":1135.71,"location":2,"content":"even when it doesn't work well but there"},{"from":1135.71,"to":1137.75,"location":2,"content":"was a paper that came out within a"},{"from":1137.75,"to":1140.53,"location":2,"content":"similar line in predicting criminal"},{"from":1140.53,"to":1143.09,"location":2,"content":"criminality or purporting to predict"},{"from":1143.09,"to":1145.49,"location":2,"content":"criminality from individual face images"},{"from":1145.49,"to":1149.06,"location":2,"content":"and that one had some results and some"},{"from":1149.06,"to":1150.65,"location":2,"content":"more details about the data that we"},{"from":1150.65,"to":1152.3,"location":2,"content":"could kind of dig into to 
understand"},{"from":1152.3,"to":1154.49,"location":2,"content":"where are these kinds of claims coming"},{"from":1154.49,"to":1156.71,"location":2,"content":"from so this was an article that was"},{"from":1156.71,"to":1159.52,"location":2,"content":"posted on archive near the end of 2016"},{"from":1159.52,"to":1162.32,"location":2,"content":"and they said they were using less than"},{"from":1162.32,"to":1165.52,"location":2,"content":"2000 closely cropped images of faces"},{"from":1165.52,"to":1168.71,"location":2,"content":"including wanted suspect ID pictures"},{"from":1168.71,"to":1171.14,"location":2,"content":"from specific regions and they claimed"},{"from":1171.14,"to":1172.84,"location":2,"content":"that even based on this very small"},{"from":1172.84,"to":1176.27,"location":2,"content":"training data set that they were able to"},{"from":1176.27,"to":1178.49,"location":2,"content":"predict whether or not someone was"},{"from":1178.49,"to":1181.46,"location":2,"content":"likely to be a criminal greater than 90"},{"from":1181.46,"to":1182.36,"location":2,"content":"percent accuracy"},{"from":1182.36,"to":1185.51,"location":2,"content":"um and they got so lost in this this"},{"from":1185.51,"to":1188.72,"location":2,"content":"idea that it's sort of funny to read to"},{"from":1188.72,"to":1190.4,"location":2,"content":"just take a step back and realize what's"},{"from":1190.4,"to":1192.86,"location":2,"content":"actually happening so for example one of"},{"from":1192.86,"to":1195.47,"location":2,"content":"their really great exciting claims was"},{"from":1195.47,"to":1197.84,"location":2,"content":"that the angle theta from nose tip 2 to"},{"from":1197.84,"to":1200.15,"location":2,"content":"mouth corners is on average nineteen"},{"from":1200.15,"to":1202.49,"location":2,"content":"point six percent smaller for criminals"},{"from":1202.49,"to":1204.89,"location":2,"content":"for non-criminals this is otherwise"},{"from":1204.89,"to":1209.57,"location":2,"content":"known as smiling and my you know exactly"},{"from":1209.57,"to":1211.52,"location":2,"content":"the kind of images people would use when"},{"from":1211.52,"to":1213.2,"location":2,"content":"trying to put out wanted criminal"},{"from":1213.2,"to":1214.79,"location":2,"content":"pictures probably not really happy"},{"from":1214.79,"to":1216.98,"location":2,"content":"pictures but you get so lost in the"},{"from":1216.98,"to":1219.29,"location":2,"content":"confirmation bias you get so lost in the"},{"from":1219.29,"to":1221.39,"location":2,"content":"correlation and the feedback loops that"},{"from":1221.39,"to":1223.37,"location":2,"content":"you end up overlooking these really"},{"from":1223.37,"to":1227.6,"location":2,"content":"obvious kinds of things so that's an"},{"from":1227.6,"to":1229.7,"location":2,"content":"example of selection bias experimenters"},{"from":1229.7,"to":1232.31,"location":2,"content":"bias confirmation bias correlation"},{"from":1232.31,"to":1234.29,"location":2,"content":"fallacy and feedback loops all coming"},{"from":1234.29,"to":1236.75,"location":2,"content":"together to create a deep learning"},{"from":1236.75,"to":1238.52,"location":2,"content":"system that people think is scary and"},{"from":1238.52,"to":1240.97,"location":2,"content":"can do things that it can't actually do"},{"from":1240.97,"to":1243.44,"location":2,"content":"one of the issues with this was that the"},{"from":1243.44,"to":1245.63,"location":2,"content":"media loved it like it's was all 
over"},{"from":1245.63,"to":1247.28,"location":2,"content":"the news and there's been similar kinds"},{"from":1247.28,"to":1248.57,"location":2,"content":"of things happening again and again"},{"from":1248.57,"to":1251.54,"location":2,"content":"media wants to sell this story and so"},{"from":1251.54,"to":1253.94,"location":2,"content":"it's part of our job as researchers that"},{"from":1253.94,"to":1255.86,"location":2,"content":"people who work on this stuff to be very"},{"from":1255.86,"to":1257.48,"location":2,"content":"clear about what the technology is"},{"from":1257.48,"to":1260.03,"location":2,"content":"actually doing and make a distinction"},{"from":1260.03,"to":1261.68,"location":2,"content":"between what you might think it's doing"},{"from":1261.68,"to":1264.68,"location":2,"content":"and what it's actually doing um so"},{"from":1264.68,"to":1267.01,"location":2,"content":"another issue that has come up recently"},{"from":1267.01,"to":1269.42,"location":2,"content":"is claiming to be able to predict"},{"from":1269.42,"to":1271.67,"location":2,"content":"internal qualities but specifically ones"},{"from":1271.67,"to":1273.41,"location":2,"content":"that are subject to discrimination and"},{"from":1273.41,"to":1276.77,"location":2,"content":"loss of opportunity so in particular"},{"from":1276.77,"to":1278.3,"location":2,"content":"there was this work that came out that"},{"from":1278.3,"to":1279.98,"location":2,"content":"claimed to be able to predict whether or"},{"from":1279.98,"to":1282.44,"location":2,"content":"not someone was homosexual just based on"},{"from":1282.44,"to":1285.23,"location":2,"content":"single face images now it's important to"},{"from":1285.23,"to":1287.42,"location":2,"content":"know that the images that they used in"},{"from":1287.42,"to":1289.49,"location":2,"content":"the study included images that were from"},{"from":1289.49,"to":1291.28,"location":2,"content":"dating websites where people"},{"from":1291.28,"to":1293.33,"location":2,"content":"self-identified as straight or gay and"},{"from":1293.33,"to":1295.1,"location":2,"content":"identified as whether they were looking"},{"from":1295.1,"to":1296.93,"location":2,"content":"for a partner who is straight or gay and"},{"from":1296.93,"to":1299.42,"location":2,"content":"these became the sources of the training"},{"from":1299.42,"to":1302.51,"location":2,"content":"data and still from this Oh before I go"},{"from":1302.51,"to":1304.7,"location":2,"content":"on can you guys just understand just"},{"from":1304.7,"to":1307.3,"location":2,"content":"from that what the issue might have been"},{"from":1307.3,"to":1311.82,"location":2,"content":"imposed"},{"from":1311.82,"to":1313.75,"location":2,"content":"there was actually anything about"},{"from":1313.75,"to":1319.23,"location":2,"content":"rainbows but that's really unfortunate"},{"from":1319.23,"to":1321.97,"location":2,"content":"right yes this has more to do with the"},{"from":1321.97,"to":1323.68,"location":2,"content":"presentation of the self the"},{"from":1323.68,"to":1325.33,"location":2,"content":"presentation of the social self when"},{"from":1325.33,"to":1327.46,"location":2,"content":"you're trying to for example attract a"},{"from":1327.46,"to":1329.74,"location":2,"content":"partner on a website and less to do with"},{"from":1329.74,"to":1332.89,"location":2,"content":"how you look day-to-day and yet they"},{"from":1332.89,"to":1336.37,"location":2,"content":"kind of went to these large 
conclusions"},{"from":1336.37,"to":1338.62,"location":2,"content":"that aren't supported at all by the data"},{"from":1338.62,"to":1340.72,"location":2,"content":"or by their study but things like"},{"from":1340.72,"to":1343.18,"location":2,"content":"consistent with a prenatal formula of"},{"from":1343.18,"to":1345.61,"location":2,"content":"sexual orientation gay men and women"},{"from":1345.61,"to":1347.56,"location":2,"content":"tended to have gender atypical facial"},{"from":1347.56,"to":1349.96,"location":2,"content":"morphology now none of the authors"},{"from":1349.96,"to":1352.51,"location":2,"content":"actually were prenatal hormone Theory"},{"from":1352.51,"to":1355.57,"location":2,"content":"specialists you know they had doctor in"},{"from":1355.57,"to":1356.74,"location":2,"content":"their name so maybe that's the thing"},{"from":1356.74,"to":1359.11,"location":2,"content":"this was a Stanford professor and like"},{"from":1359.11,"to":1361.12,"location":2,"content":"I've presented this a few times at"},{"from":1361.12,"to":1362.47,"location":2,"content":"Stanford and gotten into some like"},{"from":1362.47,"to":1364.96,"location":2,"content":"pretty harsh fights about this so I'm"},{"from":1364.96,"to":1368.34,"location":2,"content":"ready if anyone wants to take me on but"},{"from":1368.34,"to":1371.53,"location":2,"content":"but me and my some of my colleagues"},{"from":1371.53,"to":1373.45,"location":2,"content":"decided we we play around with this a"},{"from":1373.45,"to":1375.67,"location":2,"content":"bit what we found was that a simple"},{"from":1375.67,"to":1377.98,"location":2,"content":"decision tree so I'm kind of assuming"},{"from":1377.98,"to":1379.86,"location":2,"content":"you guys know what a decision tree is"},{"from":1379.86,"to":1383.14,"location":2,"content":"okay cool so based on wearing makeup or"},{"from":1383.14,"to":1385.33,"location":2,"content":"wearing glasses God is pretty close to"},{"from":1385.33,"to":1387.34,"location":2,"content":"the accuracy reported in the paper"},{"from":1387.34,"to":1389.14,"location":2,"content":"that says nothing about internal"},{"from":1389.14,"to":1390.73,"location":2,"content":"hormones that says nothing about any of"},{"from":1390.73,"to":1392.89,"location":2,"content":"that and says a lot about the physical"},{"from":1392.89,"to":1394.72,"location":2,"content":"presentation the things that are on the"},{"from":1394.72,"to":1397.33,"location":2,"content":"surface it says a lot more about how"},{"from":1397.33,"to":1399.04,"location":2,"content":"people are presenting themselves and"},{"from":1399.04,"to":1402.25,"location":2,"content":"what is happening internally so the key"},{"from":1402.25,"to":1403.66,"location":2,"content":"thing that's recently kind of been"},{"from":1403.66,"to":1405.79,"location":2,"content":"overlooked is that deep learning is"},{"from":1405.79,"to":1407.89,"location":2,"content":"somehow it's sort of considered that"},{"from":1407.89,"to":1409.78,"location":2,"content":"it's somehow magically going beyond"},{"from":1409.78,"to":1411.91,"location":2,"content":"surface level but the point is that it's"},{"from":1411.91,"to":1413.65,"location":2,"content":"working on the surface level and working"},{"from":1413.65,"to":1415.9,"location":2,"content":"well and in the face of confirmation"},{"from":1415.9,"to":1417.94,"location":2,"content":"bias and other kinds of bias factors"},{"from":1417.94,"to":1419.98,"location":2,"content":"it's easy to assume that something 
else"},{"from":1419.98,"to":1422.44,"location":2,"content":"is happening that's not without critical"},{"from":1422.44,"to":1425.97,"location":2,"content":"examination for example simple baselines"},{"from":1425.97,"to":1428.83,"location":2,"content":"simple sanity checks and these kinds of"},{"from":1428.83,"to":1430.93,"location":2,"content":"things can just be ignored and and not"},{"from":1430.93,"to":1435.12,"location":2,"content":"noticed at all so that's example of"},{"from":1435.12,"to":1437.86,"location":2,"content":"selection bias and experimenters bias"},{"from":1437.86,"to":1441.37,"location":2,"content":"and correlation fallacy"},{"from":1441.37,"to":1443.89,"location":2,"content":"okay so now I'm going to talk to talk"},{"from":1443.89,"to":1446.08,"location":2,"content":"about measuring algorithmic bias so I"},{"from":1446.08,"to":1448.42,"location":2,"content":"just said a lot about different kinds of"},{"from":1448.42,"to":1450.91,"location":2,"content":"biases that come in in the data in the"},{"from":1450.91,"to":1453.25,"location":2,"content":"collection in the interpretation of the"},{"from":1453.25,"to":1454.87,"location":2,"content":"results let's talk about actually"},{"from":1454.87,"to":1457.03,"location":2,"content":"quantitatively measuring different kinds"},{"from":1457.03,"to":1460.69,"location":2,"content":"of biases so one of the key things"},{"from":1460.69,"to":1463.42,"location":2,"content":"that's emerged in a few different works"},{"from":1463.42,"to":1465.82,"location":2,"content":"and really ties nicely to a lot of"},{"from":1465.82,"to":1467.41,"location":2,"content":"fairness work is this idea of"},{"from":1467.41,"to":1470.26,"location":2,"content":"disaggregated evaluation so in"},{"from":1470.26,"to":1472.6,"location":2,"content":"disaggregated evaluation you evaluate"},{"from":1472.6,"to":1474.76,"location":2,"content":"across different subgroups as opposed to"},{"from":1474.76,"to":1477.43,"location":2,"content":"looking at one single score for your"},{"from":1477.43,"to":1481.78,"location":2,"content":"overall testing data set so okay you"},{"from":1481.78,"to":1482.83,"location":2,"content":"guys are probably familiar with the"},{"from":1482.83,"to":1484.57,"location":2,"content":"training testing data split you kind of"},{"from":1484.57,"to":1486.88,"location":2,"content":"train on there on your given training"},{"from":1486.88,"to":1488.89,"location":2,"content":"data you test on your given testing data"},{"from":1488.89,"to":1490.92,"location":2,"content":"and you point you report like precision"},{"from":1490.92,"to":1494.29,"location":2,"content":"recall F score or things like that but"},{"from":1494.29,"to":1496.81,"location":2,"content":"what that masks is how well the system"},{"from":1496.81,"to":1498.31,"location":2,"content":"is actually working across different"},{"from":1498.31,"to":1499.72,"location":2,"content":"kinds of individuals and across"},{"from":1499.72,"to":1502.75,"location":2,"content":"different different subgroups and so one"},{"from":1502.75,"to":1504.79,"location":2,"content":"just straightforward way to handle this"},{"from":1504.79,"to":1507.07,"location":2,"content":"is to actually evaluate with respect to"},{"from":1507.07,"to":1509.14,"location":2,"content":"those different subgroups so creating"},{"from":1509.14,"to":1510.67,"location":2,"content":"for each sort of subgroup prediction"},{"from":1510.67,"to":1513.97,"location":2,"content":"pair so for an example you might look 
at"},{"from":1513.97,"to":1516.52,"location":2,"content":"women face detection men face detection"},{"from":1516.52,"to":1519.12,"location":2,"content":"and look at how the the error rates are"},{"from":1519.12,"to":1523.9,"location":2,"content":"different or similar um another"},{"from":1523.9,"to":1525.67,"location":2,"content":"important part of this is to look at"},{"from":1525.67,"to":1528.87,"location":2,"content":"things intersectionally combining things"},{"from":1528.87,"to":1531.91,"location":2,"content":"like gender and race at the same time"},{"from":1531.91,"to":1534.79,"location":2,"content":"and seeing how those how the error rates"},{"from":1534.79,"to":1537.31,"location":2,"content":"on those sorts of things change and how"},{"from":1537.31,"to":1538.15,"location":2,"content":"they're different across different"},{"from":1538.15,"to":1541.24,"location":2,"content":"intersections and this is inspired by"},{"from":1541.24,"to":1542.25,"location":2,"content":"kimberlé crenshaw"},{"from":1542.25,"to":1544.75,"location":2,"content":"because she she pioneered intersectional"},{"from":1544.75,"to":1548.89,"location":2,"content":"research in critical race theory and she"},{"from":1548.89,"to":1550.45,"location":2,"content":"discussed the story of emma de Graaff"},{"from":1550.45,"to":1554.64,"location":2,"content":"infeed who was a woman at General Motors"},{"from":1554.64,"to":1556.84,"location":2,"content":"and she claimed that the company is"},{"from":1556.84,"to":1559,"location":2,"content":"hiring practices discriminated against"},{"from":1559,"to":1561.7,"location":2,"content":"black women but in their Court opinion"},{"from":1561.7,"to":1563.74,"location":2,"content":"the judges ruled that General Motors"},{"from":1563.74,"to":1566.83,"location":2,"content":"hired many women for secretarial"},{"from":1566.83,"to":1568.96,"location":2,"content":"positions and many black people her"},{"from":1568.96,"to":1571.54,"location":2,"content":"factory roles and thus they could not"},{"from":1571.54,"to":1574.12,"location":2,"content":"have discriminated against black women"},{"from":1574.12,"to":1576.04,"location":2,"content":"what they failed to do was look at the"},{"from":1576.04,"to":1577.75,"location":2,"content":"intersection of the two and understand"},{"from":1577.75,"to":1579.13,"location":2,"content":"that the experience there might be"},{"from":1579.13,"to":1581.29,"location":2,"content":"fundamentally different than any of the"},{"from":1581.29,"to":1584.08,"location":2,"content":"experiences of either of these sort of"},{"from":1584.08,"to":1587.02,"location":2,"content":"subgroups in isolation and the same"},{"from":1587.02,"to":1589.36,"location":2,"content":"becomes true when you start looking at"},{"from":1589.36,"to":1591.34,"location":2,"content":"errors that are regularly made in deep"},{"from":1591.34,"to":1593.65,"location":2,"content":"learning systems so we've been able to"},{"from":1593.65,"to":1595.33,"location":2,"content":"uncover a lot of different kinds of"},{"from":1595.33,"to":1597.22,"location":2,"content":"unintended errors by looking not only at"},{"from":1597.22,"to":1600.07,"location":2,"content":"the disaggregated evaluation but also at"},{"from":1600.07,"to":1603.78,"location":2,"content":"intersectional disaggregated evaluation"},{"from":1603.78,"to":1605.86,"location":2,"content":"so I'm going to walk through a bit how"},{"from":1605.86,"to":1607.96,"location":2,"content":"this works this is probably going to 
be"},{"from":1607.96,"to":1610.18,"location":2,"content":"review for most of you but I think it's"},{"from":1610.18,"to":1611.56,"location":2,"content":"really important to understand this"},{"from":1611.56,"to":1613.72,"location":2,"content":"because it also ties to how we measure"},{"from":1613.72,"to":1615.66,"location":2,"content":"fairness and when we say like"},{"from":1615.66,"to":1617.74,"location":2,"content":"algorithmic fairness what we're talking"},{"from":1617.74,"to":1621.79,"location":2,"content":"about so um the confusion matrix is a"},{"from":1621.79,"to":1623.59,"location":2,"content":"way you guys okay are you guys familiar"},{"from":1623.59,"to":1625.78,"location":2,"content":"with the Confucian matrix I just want to"},{"from":1625.78,"to":1628.21,"location":2,"content":"know where okay awesome cool so familiar"},{"from":1628.21,"to":1629.29,"location":2,"content":"to take from you is a matrix right so"},{"from":1629.29,"to":1630.46,"location":2,"content":"you have model predictions and"},{"from":1630.46,"to":1632.86,"location":2,"content":"references and you can kind of look at"},{"from":1632.86,"to":1635.26,"location":2,"content":"these as negative and positive binary"},{"from":1635.26,"to":1637.93,"location":2,"content":"classification kind of approach here"},{"from":1637.93,"to":1641.23,"location":2,"content":"where if the ground truth says something"},{"from":1641.23,"to":1643.12,"location":2,"content":"is true and the model predicts it's true"},{"from":1643.12,"to":1645.13,"location":2,"content":"it's a true positive if the ground truth"},{"from":1645.13,"to":1649.06,"location":2,"content":"says it's it's it's false and the model"},{"from":1649.06,"to":1650.49,"location":2,"content":"predicts it's false it's true negative"},{"from":1650.49,"to":1653.26,"location":2,"content":"and the air is the kind of different"},{"from":1653.26,"to":1654.91,"location":2,"content":"issues that arise are false negatives"},{"from":1654.91,"to":1657.52,"location":2,"content":"and false positives so in false"},{"from":1657.52,"to":1660.91,"location":2,"content":"positives the the ground truth says"},{"from":1660.91,"to":1662.74,"location":2,"content":"something is negative but the model"},{"from":1662.74,"to":1665.53,"location":2,"content":"predicts that it's positive and then in"},{"from":1665.53,"to":1667.65,"location":2,"content":"false negatives vice versa"},{"from":1667.65,"to":1671.35,"location":2,"content":"from these you know basic kind of this"},{"from":1671.35,"to":1673.81,"location":2,"content":"basic breakdown of errors you can get a"},{"from":1673.81,"to":1677.2,"location":2,"content":"few different metrics these metrics"},{"from":1677.2,"to":1679.45,"location":2,"content":"actually trivially map to a lot of"},{"from":1679.45,"to":1682.18,"location":2,"content":"different fairness criteria so for"},{"from":1682.18,"to":1684.1,"location":2,"content":"example if we're looking at something"},{"from":1684.1,"to":1686.89,"location":2,"content":"like female versus male patient results"},{"from":1686.89,"to":1689.08,"location":2,"content":"and figuring out things like precision"},{"from":1689.08,"to":1691.06,"location":2,"content":"and recall which is relatively common in"},{"from":1691.06,"to":1695.5,"location":2,"content":"NLP if you have equal recall across your"},{"from":1695.5,"to":1696.16,"location":2,"content":"subgroups"},{"from":1696.16,"to":1698.71,"location":2,"content":"that's the same as the fairness 
criteria"},{"from":1698.71,"to":1702.79,"location":2,"content":"of equality of opportunity I could work"},{"from":1702.79,"to":1704.05,"location":2,"content":"through the math but I mean this is"},{"from":1704.05,"to":1705.88,"location":2,"content":"basically just just the main point that"},{"from":1705.88,"to":1707.57,"location":2,"content":"that"},{"from":1707.57,"to":1709.85,"location":2,"content":"it says that given that something is"},{"from":1709.85,"to":1713.21,"location":2,"content":"true in the ground truth the model"},{"from":1713.21,"to":1716.09,"location":2,"content":"should predict that it's true at equal"},{"from":1716.09,"to":1717.89,"location":2,"content":"rates across different subgroups so this"},{"from":1717.89,"to":1719.63,"location":2,"content":"ends up being equivalent to having the"},{"from":1719.63,"to":1721.84,"location":2,"content":"same recall across different subgroups"},{"from":1721.84,"to":1724.85,"location":2,"content":"similarly having the same precision"},{"from":1724.85,"to":1727.37,"location":2,"content":"across different subgroups is equivalent"},{"from":1727.37,"to":1729.17,"location":2,"content":"to a fairness criterion called"},{"from":1729.17,"to":1732.65,"location":2,"content":"predictive parity and so as fairness has"},{"from":1732.65,"to":1735.86,"location":2,"content":"been defined again and again it was"},{"from":1735.86,"to":1737.48,"location":2,"content":"originally some of these definitions"},{"from":1737.48,"to":1741.17,"location":2,"content":"came in 1966 following the Civil Rights"},{"from":1741.17,"to":1745.16,"location":2,"content":"Act of 1964 they were reinvented a few"},{"from":1745.16,"to":1748.51,"location":2,"content":"times and most recently reinvented in"},{"from":1748.51,"to":1752.51,"location":2,"content":"2016 but they all sort of boiled down to"},{"from":1752.51,"to":1754.79,"location":2,"content":"this disaggregated comparison across"},{"from":1754.79,"to":1757.31,"location":2,"content":"subgroups and the math the metrics end"},{"from":1757.31,"to":1759.29,"location":2,"content":"up being roughly equivalent to what we"},{"from":1759.29,"to":1760.48,"location":2,"content":"get from the confusion matrix"},{"from":1760.48,"to":1766.03,"location":2,"content":"specifically in classification systems"},{"from":1766.03,"to":1768.95,"location":2,"content":"so which kind of fairness metric do you"},{"from":1768.95,"to":1771.56,"location":2,"content":"use what are the different criteria you"},{"from":1771.56,"to":1773.69,"location":2,"content":"want to use to look at the differences"},{"from":1773.69,"to":1775.61,"location":2,"content":"across different subgroups that really"},{"from":1775.61,"to":1777.86,"location":2,"content":"it comes down to the trade-offs between"},{"from":1777.86,"to":1780.08,"location":2,"content":"false positives and false negatives so"},{"from":1780.08,"to":1781.49,"location":2,"content":"this is the same problem that you're"},{"from":1781.49,"to":1782.63,"location":2,"content":"dealing with when you're just figuring"},{"from":1782.63,"to":1785.24,"location":2,"content":"out how to evaluate generally there's no"},{"from":1785.24,"to":1787.01,"location":2,"content":"one fairness criterion that is the"},{"from":1787.01,"to":1789.43,"location":2,"content":"fairness criterion to rule them all"},{"from":1789.43,"to":1791.39,"location":2,"content":"deciding which one is better than the"},{"from":1791.39,"to":1793.19,"location":2,"content":"other is the same as kind of trying 
to"},{"from":1793.19,"to":1794.75,"location":2,"content":"decide which is better precision or"},{"from":1794.75,"to":1796.22,"location":2,"content":"recall right it depends on what the"},{"from":1796.22,"to":1797.66,"location":2,"content":"problem is and what you're interested in"},{"from":1797.66,"to":1800.93,"location":2,"content":"measuring so a case where false"},{"from":1800.93,"to":1803.3,"location":2,"content":"positives might be better than false"},{"from":1803.3,"to":1805.55,"location":2,"content":"negatives and so you want to prioritize"},{"from":1805.55,"to":1807.7,"location":2,"content":"something like a false positive right"},{"from":1807.7,"to":1810.92,"location":2,"content":"across subgroups is privacy in images so"},{"from":1810.92,"to":1812.6,"location":2,"content":"here are false positive is something"},{"from":1812.6,"to":1814.76,"location":2,"content":"that doesn't need to be blurred gets"},{"from":1814.76,"to":1817.37,"location":2,"content":"blurred that's just kind of a bummer but"},{"from":1817.37,"to":1818.66,"location":2,"content":"a false negative would be something that"},{"from":1818.66,"to":1821,"location":2,"content":"needs to be blurred as not learned and"},{"from":1821,"to":1822.8,"location":2,"content":"that can be identity theft it's a much"},{"from":1822.8,"to":1825.29,"location":2,"content":"more serious issue and so it's important"},{"from":1825.29,"to":1827.21,"location":2,"content":"to prioritize the evaluation metrics"},{"from":1827.21,"to":1830.78,"location":2,"content":"that stress the false negative rates an"},{"from":1830.78,"to":1832.91,"location":2,"content":"example where false negatives might be"},{"from":1832.91,"to":1834.65,"location":2,"content":"better than false positives as in spam"},{"from":1834.65,"to":1837.26,"location":2,"content":"filtering so a false negative could be"},{"from":1837.26,"to":1838.87,"location":2,"content":"an email that's spam"},{"from":1838.87,"to":1840.84,"location":2,"content":"not caught so you see it in your inbox"},{"from":1840.84,"to":1843.49,"location":2,"content":"that's usually just annoying it's not a"},{"from":1843.49,"to":1845.86,"location":2,"content":"big deal and but if false positive here"},{"from":1845.86,"to":1847.87,"location":2,"content":"would be email flagged as spam and then"},{"from":1847.87,"to":1850.45,"location":2,"content":"removed from your inbox which you know"},{"from":1850.45,"to":1853.18,"location":2,"content":"if it's from a friend or a loved one it"},{"from":1853.18,"to":1855.01,"location":2,"content":"can be it can be a loss may be a job"},{"from":1855.01,"to":1857.16,"location":2,"content":"offer or something like that"},{"from":1857.16,"to":1861.13,"location":2,"content":"right so I just kind of covered how a I"},{"from":1861.13,"to":1863.32,"location":2,"content":"can unintentionally timejust outcomes"},{"from":1863.32,"to":1865.48,"location":2,"content":"and some of the things to do or some of"},{"from":1865.48,"to":1866.89,"location":2,"content":"the things to be aware of here"},{"from":1866.89,"to":1869.29,"location":2,"content":"are the lack of insight into sources of"},{"from":1869.29,"to":1871.08,"location":2,"content":"bias in the data in the model"},{"from":1871.08,"to":1873.85,"location":2,"content":"lack of insight into the feedback loops"},{"from":1873.85,"to":1876.25,"location":2,"content":"from the original data that's collected"},{"from":1876.25,"to":1879.55,"location":2,"content":"as an example of what humans do to 
the"},{"from":1879.55,"to":1882.52,"location":2,"content":"data that's then repurposed reused acted"},{"from":1882.52,"to":1885.58,"location":2,"content":"on and then further fed in a lack of"},{"from":1885.58,"to":1888.43,"location":2,"content":"careful disaggregated evaluation looking"},{"from":1888.43,"to":1890.2,"location":2,"content":"at the disparities the differences"},{"from":1890.2,"to":1892.33,"location":2,"content":"between different subgroups in order to"},{"from":1892.33,"to":1894.28,"location":2,"content":"understand this bias this difference"},{"from":1894.28,"to":1896.65,"location":2,"content":"across the subgroups and then human"},{"from":1896.65,"to":1898.9,"location":2,"content":"biases in interpreting and accepting and"},{"from":1898.9,"to":1900.94,"location":2,"content":"talking about the results which then"},{"from":1900.94,"to":1903.34,"location":2,"content":"kind of further the media cycles and the"},{"from":1903.34,"to":1908.11,"location":2,"content":"hype around AI right now but it's up to"},{"from":1908.11,"to":1912.34,"location":2,"content":"us to influence how AI evolves so I like"},{"from":1912.34,"to":1914.95,"location":2,"content":"to think of this in terms of short term"},{"from":1914.95,"to":1917.73,"location":2,"content":"middle term and long term objectives so"},{"from":1917.73,"to":1921.76,"location":2,"content":"short term today we might be working on"},{"from":1921.76,"to":1923.68,"location":2,"content":"some specific modal where we're trying"},{"from":1923.68,"to":1925.51,"location":2,"content":"to find some local optimum we have a"},{"from":1925.51,"to":1927.28,"location":2,"content":"task we have data something like that"},{"from":1927.28,"to":1929.97,"location":2,"content":"and that sort of short term objectives"},{"from":1929.97,"to":1932.29,"location":2,"content":"we might have a slightly longer term"},{"from":1932.29,"to":1933.94,"location":2,"content":"objective of getting a paper published"},{"from":1933.94,"to":1936.07,"location":2,"content":"or if you're an industry like adding a"},{"from":1936.07,"to":1939.22,"location":2,"content":"product launched whatever it might be"},{"from":1939.22,"to":1940.66,"location":2,"content":"from there we might see our next"},{"from":1940.66,"to":1943.75,"location":2,"content":"endpoint as getting an award or you know"},{"from":1943.75,"to":1945.37,"location":2,"content":"maybe become sort of famous for"},{"from":1945.37,"to":1946.6,"location":2,"content":"something for a few minutes something"},{"from":1946.6,"to":1949.42,"location":2,"content":"like that and that's cool and but"},{"from":1949.42,"to":1951.28,"location":2,"content":"there's a longer-term objective that we"},{"from":1951.28,"to":1953.32,"location":2,"content":"can work towards as well at the same"},{"from":1953.32,"to":1955.24,"location":2,"content":"time and that's something like a"},{"from":1955.24,"to":1957.19,"location":2,"content":"positive outcome for humans in their"},{"from":1957.19,"to":1959.53,"location":2,"content":"environment so instead of just kind of"},{"from":1959.53,"to":1962.26,"location":2,"content":"focusing on these local decisions these"},{"from":1962.26,"to":1964.36,"location":2,"content":"local optimum and these sort of local"},{"from":1964.36,"to":1967.42,"location":2,"content":"paper by paper based approaches to"},{"from":1967.42,"to":1969.52,"location":2,"content":"solving problems you can also kind of"},{"from":1969.52,"to":1970.78,"location":2,"content":"think about what's the 
long-term"},{"from":1970.78,"to":1972.23,"location":2,"content":"objective where does this"},{"from":1972.23,"to":1974.81,"location":2,"content":"me as they trace out an evolutionary"},{"from":1974.81,"to":1977.03,"location":2,"content":"path for artificial intelligence down"},{"from":1977.03,"to":1981.94,"location":2,"content":"the line in 10 years 15 years 20 years"},{"from":1981.94,"to":1985.16,"location":2,"content":"and one of the ways you can address this"},{"from":1985.16,"to":1987.14,"location":2,"content":"is by thinking you know how can the work"},{"from":1987.14,"to":1989.57,"location":2,"content":"I'm interested in now be best focused to"},{"from":1989.57,"to":1991.46,"location":2,"content":"help others and that involves talking to"},{"from":1991.46,"to":1993.41,"location":2,"content":"experts and kind of going outside your"},{"from":1993.41,"to":1995.9,"location":2,"content":"bubble speaking across interdisciplinary"},{"from":1995.9,"to":1997.67,"location":2,"content":"fields like cognitive science which I've"},{"from":1997.67,"to":2001.45,"location":2,"content":"just talked a bit about so let's talk"},{"from":2001.45,"to":2004.72,"location":2,"content":"about some things we can do so first off"},{"from":2004.72,"to":2010.6,"location":2,"content":"is data so a lot of the issues of bias"},{"from":2010.6,"to":2014.05,"location":2,"content":"and fairness in machine learning models"},{"from":2014.05,"to":2015.79,"location":2,"content":"really come down to the data"},{"from":2015.79,"to":2018.25,"location":2,"content":"unfortunately in machine learning and"},{"from":2018.25,"to":2021.31,"location":2,"content":"deep learning working on data is really"},{"from":2021.31,"to":2024.49,"location":2,"content":"not seen as sexy there's a few data sets"},{"from":2024.49,"to":2027.64,"location":2,"content":"that people use they're out there that's"},{"from":2027.64,"to":2029.47,"location":2,"content":"what people use and there's not a lot of"},{"from":2029.47,"to":2031.78,"location":2,"content":"analysis done on it on how well these"},{"from":2031.78,"to":2034.21,"location":2,"content":"datasets capture different truths about"},{"from":2034.21,"to":2038.49,"location":2,"content":"the world how problematic they might be"},{"from":2038.49,"to":2041.23,"location":2,"content":"but it's a pretty wide area that needs a"},{"from":2041.23,"to":2043.42,"location":2,"content":"lot of future like lead needs a lot of"},{"from":2043.42,"to":2046.18,"location":2,"content":"future additional work so we're gonna"},{"from":2046.18,"to":2047.74,"location":2,"content":"understand the data skews and the"},{"from":2047.74,"to":2049.75,"location":2,"content":"correlations if you understand your data"},{"from":2049.75,"to":2052.63,"location":2,"content":"skews and the correlations that might be"},{"from":2052.63,"to":2054.49,"location":2,"content":"problematic in your data then you can"},{"from":2054.49,"to":2056.32,"location":2,"content":"start working on either models that"},{"from":2056.32,"to":2058.78,"location":2,"content":"address those or data augmentation"},{"from":2058.78,"to":2061.06,"location":2,"content":"approaches in order to sort of make the"},{"from":2061.06,"to":2062.71,"location":2,"content":"data set a little bit better or a little"},{"from":2062.71,"to":2064.69,"location":2,"content":"bit more representative of how you want"},{"from":2064.69,"to":2067.75,"location":2,"content":"the world to be it's also important to"},{"from":2067.75,"to":2070.24,"location":2,"content":"abandon the single 
training set testing"},{"from":2070.24,"to":2072.28,"location":2,"content":"set from similar distribution approach"},{"from":2072.28,"to":2076.84,"location":2,"content":"to advancing deep learning so when we do"},{"from":2076.84,"to":2078.31,"location":2,"content":"projects in deep learning you know we"},{"from":2078.31,"to":2079.78,"location":2,"content":"tend to have the training set and the"},{"from":2079.78,"to":2081.43,"location":2,"content":"testing set and then that's what we sort"},{"from":2081.43,"to":2083.65,"location":2,"content":"of benchmark on and prioritize but the"},{"from":2083.65,"to":2085.3,"location":2,"content":"point is as you move around different"},{"from":2085.3,"to":2086.98,"location":2,"content":"testing sets you're going to get vastly"},{"from":2086.98,"to":2090.37,"location":2,"content":"different results and so by keeping in"},{"from":2090.37,"to":2092.26,"location":2,"content":"this just sort of one training testing"},{"from":2092.26,"to":2095.44,"location":2,"content":"date training testing data set paradigm"},{"from":2095.44,"to":2097.69,"location":2,"content":"you're really likely to not notice"},{"from":2097.69,"to":2100.15,"location":2,"content":"issues that might otherwise be there and"},{"from":2100.15,"to":2102.25,"location":2,"content":"one way to really focus in on them is"},{"from":2102.25,"to":2106.57,"location":2,"content":"having a hard set of test cases"},{"from":2106.57,"to":2108.16,"location":2,"content":"you really want to make sure the model"},{"from":2108.16,"to":2109.87,"location":2,"content":"does well on so these are things that"},{"from":2109.87,"to":2112.57,"location":2,"content":"are particularly problematic things that"},{"from":2112.57,"to":2114.73,"location":2,"content":"would be really harmful to individuals"},{"from":2114.73,"to":2117.16,"location":2,"content":"if they were to experience the output"},{"from":2117.16,"to":2119.71,"location":2,"content":"and you kind of collect those in a small"},{"from":2119.71,"to":2121.66,"location":2,"content":"test set and then it's really easy to"},{"from":2121.66,"to":2124.36,"location":2,"content":"evaluate on that test set as you"},{"from":2124.36,"to":2126.1,"location":2,"content":"benchmark improvements on your model as"},{"from":2126.1,"to":2127.72,"location":2,"content":"you add different kinds of things to"},{"from":2127.72,"to":2130.6,"location":2,"content":"your model in order to see not just how"},{"from":2130.6,"to":2132.67,"location":2,"content":"your model is doing overall in terms of"},{"from":2132.67,"to":2134.56,"location":2,"content":"your testing data set but how well"},{"from":2134.56,"to":2136.48,"location":2,"content":"you're doing in terms of these examples"},{"from":2136.48,"to":2138.73,"location":2,"content":"you really want it to do well on that"},{"from":2138.73,"to":2140.65,"location":2,"content":"you know that it's going to be a problem"},{"from":2140.65,"to":2142.72,"location":2,"content":"if it doesn't do well on and any sort of"},{"from":2142.72,"to":2144.46,"location":2,"content":"degradation in that you might want to"},{"from":2144.46,"to":2147.97,"location":2,"content":"prioritize to fix above degradation"},{"from":2147.97,"to":2151.57,"location":2,"content":"degradation and overall accuracy and"},{"from":2151.57,"to":2153.25,"location":2,"content":"it's also important to talk to experts"},{"from":2153.25,"to":2154.96,"location":2,"content":"about the additional signals that you"},{"from":2154.96,"to":2159.52,"location":2,"content":"can incorporate so we've 
put out a tool"},{"from":2159.52,"to":2161.59,"location":2,"content":"to help with this understanding data"},{"from":2161.59,"to":2163.99,"location":2,"content":"SKUs called facets it's just available"},{"from":2163.99,"to":2166.81,"location":2,"content":"there and it's a really handy kind of"},{"from":2166.81,"to":2170.86,"location":2,"content":"visualizer for slicing understanding you"},{"from":2170.86,"to":2171.97,"location":2,"content":"know what some of the differences are"},{"from":2171.97,"to":2173.38,"location":2,"content":"between different subgroups and"},{"from":2173.38,"to":2174.73,"location":2,"content":"different representations and you can"},{"from":2174.73,"to":2176.83,"location":2,"content":"sort of dig in and explore a bit more so"},{"from":2176.83,"to":2178.93,"location":2,"content":"this is just to sort of help people come"},{"from":2178.93,"to":2180.16,"location":2,"content":"to terms with the data that they're"},{"from":2180.16,"to":2182.08,"location":2,"content":"actually using and and where there might"},{"from":2182.08,"to":2185.53,"location":2,"content":"be unwanted associations or or missing"},{"from":2185.53,"to":2190.54,"location":2,"content":"missing kind of features another"},{"from":2190.54,"to":2192.31,"location":2,"content":"approach that's been put forward"},{"from":2192.31,"to":2195.34,"location":2,"content":"recently specifically on the data side"},{"from":2195.34,"to":2197.8,"location":2,"content":"is this data data sheets for data sets"},{"from":2197.8,"to":2200.65,"location":2,"content":"approach so this is this idea that when"},{"from":2200.65,"to":2202.84,"location":2,"content":"you release a data set it's not enough"},{"from":2202.84,"to":2204.7,"location":2,"content":"to just release the data set with like"},{"from":2204.7,"to":2206.83,"location":2,"content":"some pretty graphs and like talking"},{"from":2206.83,"to":2208.6,"location":2,"content":"about basic distributional information"},{"from":2208.6,"to":2210.22,"location":2,"content":"you need to talk about who the"},{"from":2210.22,"to":2212.65,"location":2,"content":"annotators were where they were what the"},{"from":2212.65,"to":2214.63,"location":2,"content":"inner entertainer agreement was what"},{"from":2214.63,"to":2216.78,"location":2,"content":"their background information was"},{"from":2216.78,"to":2219.28,"location":2,"content":"motivation for the data set all these"},{"from":2219.28,"to":2220.9,"location":2,"content":"other kinds of details so now you"},{"from":2220.9,"to":2222.7,"location":2,"content":"actually know that this isn't just a"},{"from":2222.7,"to":2224.89,"location":2,"content":"data set this is a data set that has"},{"from":2224.89,"to":2227.35,"location":2,"content":"these specific biases there's no such"},{"from":2227.35,"to":2229.36,"location":2,"content":"thing as a data set that isn't biased in"},{"from":2229.36,"to":2231.73,"location":2,"content":"some way a data set by virtue of the"},{"from":2231.73,"to":2233.44,"location":2,"content":"fact that it's collected from the world"},{"from":2233.44,"to":2237.22,"location":2,"content":"as a subset is a is a biased set of the"},{"from":2237.22,"to":2239.23,"location":2,"content":"world in some way the point is to make"},{"from":2239.23,"to":2239.83,"location":2,"content":"it clear"},{"from":2239.83,"to":2241.81,"location":2,"content":"what it is how it is biased what are the"},{"from":2241.81,"to":2244,"location":2,"content":"what are the various biases that's 
that"},{"from":2244,"to":2245.41,"location":2,"content":"important to know about in the data set"},{"from":2245.41,"to":2247.06,"location":2,"content":"so that's one of these ideas between"},{"from":2247.06,"to":2249.28,"location":2,"content":"behind data sheets for data sets"},{"from":2249.28,"to":2253.12,"location":2,"content":"releasing these data sets publicly all"},{"from":2253.12,"to":2254.41,"location":2,"content":"right now let's switch a little bit to"},{"from":2254.41,"to":2257.62,"location":2,"content":"machine learning so there are a couple"},{"from":2257.62,"to":2259.57,"location":2,"content":"techniques that I like to use I'll talk"},{"from":2259.57,"to":2262.99,"location":2,"content":"about two one is bias mitigation which"},{"from":2262.99,"to":2264.79,"location":2,"content":"is removing the signal for a problematic"},{"from":2264.79,"to":2269.14,"location":2,"content":"output so removing stereotyping sexism"},{"from":2269.14,"to":2271.09,"location":2,"content":"racism trying to remove these kind of"},{"from":2271.09,"to":2273.19,"location":2,"content":"effects from the model this is also"},{"from":2273.19,"to":2275.85,"location":2,"content":"sometimes called D biasing or unbiased"},{"from":2275.85,"to":2278.62,"location":2,"content":"thats a little bit of a misnomer because"},{"from":2278.62,"to":2280.48,"location":2,"content":"you're you're generally just kind of"},{"from":2280.48,"to":2283.03,"location":2,"content":"moving around bias based on a specific"},{"from":2283.03,"to":2285.58,"location":2,"content":"set of words for example so to say it's"},{"from":2285.58,"to":2287.95,"location":2,"content":"unbiased is it's not true"},{"from":2287.95,"to":2289.78,"location":2,"content":"but you are kind of mitigating bias with"},{"from":2289.78,"to":2291.4,"location":2,"content":"respect to some certain kinds of"},{"from":2291.4,"to":2295.18,"location":2,"content":"information that you provide it with and"},{"from":2295.18,"to":2296.92,"location":2,"content":"there's inclusion which is then adding"},{"from":2296.92,"to":2299.32,"location":2,"content":"signal for desired variables so that's"},{"from":2299.32,"to":2301.09,"location":2,"content":"kind of the opposite side of bias"},{"from":2301.09,"to":2303.28,"location":2,"content":"mitigation so increasing model"},{"from":2303.28,"to":2305.08,"location":2,"content":"performance with attention to subgroups"},{"from":2305.08,"to":2306.7,"location":2,"content":"or data slices with the worst"},{"from":2306.7,"to":2312.4,"location":2,"content":"performance so in order to address"},{"from":2312.4,"to":2315.31,"location":2,"content":"inclusion kind of adding signal for"},{"from":2315.31,"to":2317.53,"location":2,"content":"underrepresented subgroups one technique"},{"from":2317.53,"to":2318.85,"location":2,"content":"that's worked relatively well is"},{"from":2318.85,"to":2321.22,"location":2,"content":"multitask learning so I've heard that"},{"from":2321.22,"to":2323.17,"location":2,"content":"you guys have studied multitask learning"},{"from":2323.17,"to":2324.85,"location":2,"content":"which is great so I'll tell you a bit"},{"from":2324.85,"to":2327.25,"location":2,"content":"about a case study here and so this is"},{"from":2327.25,"to":2330.07,"location":2,"content":"work I did in collaboration with a UPenn"},{"from":2330.07,"to":2332.74,"location":2,"content":"world well-being project working"},{"from":2332.74,"to":2334.72,"location":2,"content":"directly with clinicians and the 
goal"},{"from":2334.72,"to":2336.43,"location":2,"content":"was to create a system that could alert"},{"from":2336.43,"to":2337.96,"location":2,"content":"clinicians if there was a suicide"},{"from":2337.96,"to":2340.51,"location":2,"content":"attempt that was imminent and they"},{"from":2340.51,"to":2342.31,"location":2,"content":"wanted to understand the feasibility of"},{"from":2342.31,"to":2344.08,"location":2,"content":"these kinds of diagnosis when there were"},{"from":2344.08,"to":2346.9,"location":2,"content":"very few training training instances"},{"from":2346.9,"to":2348.82,"location":2,"content":"available so that's similar to kind of"},{"from":2348.82,"to":2355.72,"location":2,"content":"the minority problem in datasets and in"},{"from":2355.72,"to":2359.26,"location":2,"content":"this work we had two kinds of data one"},{"from":2359.26,"to":2360.88,"location":2,"content":"was the internal data which was the"},{"from":2360.88,"to":2364.12,"location":2,"content":"electronic health records with that was"},{"from":2364.12,"to":2366.07,"location":2,"content":"either provided by the patient or from"},{"from":2366.07,"to":2367.57,"location":2,"content":"the family"},{"from":2367.57,"to":2369.72,"location":2,"content":"it included mental health diagnosis"},{"from":2369.72,"to":2372.46,"location":2,"content":"suicide attempts or completions"},{"from":2372.46,"to":2375.43,"location":2,"content":"if that were the case along with the the"},{"from":2375.43,"to":2377.52,"location":2,"content":"users the person's social media data"},{"from":2377.52,"to":2379.81,"location":2,"content":"that was the internal data that we did"},{"from":2379.81,"to":2381.58,"location":2,"content":"not publish on but that we were able to"},{"from":2381.58,"to":2383.29,"location":2,"content":"work with clinicians on in order to"},{"from":2383.29,"to":2384.79,"location":2,"content":"understand if our methods were actually"},{"from":2384.79,"to":2388.18,"location":2,"content":"working the external data the proxy data"},{"from":2388.18,"to":2389.71,"location":2,"content":"the stuff that we could kind of publish"},{"from":2389.71,"to":2391.86,"location":2,"content":"on and talk about was based on Twitter"},{"from":2391.86,"to":2395.29,"location":2,"content":"and this was using regular expressions"},{"from":2395.29,"to":2399.13,"location":2,"content":"in order to extract phrases and Twitter"},{"from":2399.13,"to":2400.87,"location":2,"content":"feeds that had something that was kind"},{"from":2400.87,"to":2403.39,"location":2,"content":"of like diagnosis so something like I've"},{"from":2403.39,"to":2405.58,"location":2,"content":"been diagnosed with X or I've tried to"},{"from":2405.58,"to":2408.16,"location":2,"content":"commit suicide and that became kind of"},{"from":2408.16,"to":2410.38,"location":2,"content":"the the proxy data set and the"},{"from":2410.38,"to":2412.51,"location":2,"content":"corresponding social media feeds for for"},{"from":2412.51,"to":2414.58,"location":2,"content":"those individuals for the actual"},{"from":2414.58,"to":2420.46,"location":2,"content":"diagnosis and the state of the art in"},{"from":2420.46,"to":2423.58,"location":2,"content":"clinical medicine kind of until this"},{"from":2423.58,"to":2426.04,"location":2,"content":"work there's been more recently but is"},{"from":2426.04,"to":2428.35,"location":2,"content":"sort of this single task logistic"},{"from":2428.35,"to":2430.48,"location":2,"content":"regression logistic regression 
setup"},{"from":2430.48,"to":2432.19,"location":2,"content":"where you have some input features and"},{"from":2432.19,"to":2433.24,"location":2,"content":"then you're making some output"},{"from":2433.24,"to":2436.99,"location":2,"content":"predictions like true or false and you"},{"from":2436.99,"to":2438.91,"location":2,"content":"can add some layers and start making it"},{"from":2438.91,"to":2442.78,"location":2,"content":"deep learning which is much fancier you"},{"from":2442.78,"to":2445.45,"location":2,"content":"can have a bunch of tasks in order to do"},{"from":2445.45,"to":2447.67,"location":2,"content":"a bunch of logistic regression tasks for"},{"from":2447.67,"to":2450.43,"location":2,"content":"a clinical environment or you can use"},{"from":2450.43,"to":2452.89,"location":2,"content":"multi task learning which is taking a"},{"from":2452.89,"to":2454.54,"location":2,"content":"basic deep learning model and adding a"},{"from":2454.54,"to":2456.88,"location":2,"content":"bunch of heads to it predicted jointly"},{"from":2456.88,"to":2460.15,"location":2,"content":"at the same time and here we had a bunch"},{"from":2460.15,"to":2463.39,"location":2,"content":"of diagnosis data so we predicted things"},{"from":2463.39,"to":2465.85,"location":2,"content":"like depression anxiety post-traumatic"},{"from":2465.85,"to":2469.93,"location":2,"content":"stress disorder we also added in gender"},{"from":2469.93,"to":2471.94,"location":2,"content":"because this is something that the"},{"from":2471.94,"to":2474.37,"location":2,"content":"clinicians told us actually had some"},{"from":2474.37,"to":2475.57,"location":2,"content":"correlation with some of these"},{"from":2475.57,"to":2477.04,"location":2,"content":"conditions and that they actually used"},{"from":2477.04,"to":2479.11,"location":2,"content":"it in making decisions themselves for"},{"from":2479.11,"to":2481.26,"location":2,"content":"whether or not someone was likely to"},{"from":2481.26,"to":2485.44,"location":2,"content":"attempt suicide or not and this also"},{"from":2485.44,"to":2487.45,"location":2,"content":"used this idea of comorbidity so"},{"from":2487.45,"to":2489.61,"location":2,"content":"multitask learning is actually kind of"},{"from":2489.61,"to":2492.31,"location":2,"content":"perfect for comorbidity in clinical"},{"from":2492.31,"to":2495.22,"location":2,"content":"domains so comorbidity is and when you"},{"from":2495.22,"to":2496.63,"location":2,"content":"have one condition you're a lot more"},{"from":2496.63,"to":2499.51,"location":2,"content":"likely to have another so people who"},{"from":2499.51,"to":2501.1,"location":2,"content":"have post-traumatic stress disorder are"},{"from":2501.1,"to":2502.78,"location":2,"content":"much more likely to have depression and"},{"from":2502.78,"to":2505.42,"location":2,"content":"anxiety and depression and anxiety tend"},{"from":2505.42,"to":2506.59,"location":2,"content":"to be comorbid so"},{"from":2506.59,"to":2508.17,"location":2,"content":"people who have won often had the other"},{"from":2508.17,"to":2510.88,"location":2,"content":"so this points to the fact this points"},{"from":2510.88,"to":2512.44,"location":2,"content":"to the idea that perhaps there's some"},{"from":2512.44,"to":2514.36,"location":2,"content":"underlying representation that is"},{"from":2514.36,"to":2515.98,"location":2,"content":"similar across them that can be"},{"from":2515.98,"to":2518.2,"location":2,"content":"leveraged in a deep learning model 
with"},{"from":2518.2,"to":2521.5,"location":2,"content":"individual heads further specifying each"},{"from":2521.5,"to":2525.76,"location":2,"content":"of the different kinds of conditions and"},{"from":2525.76,"to":2527.65,"location":2,"content":"so what we found was that as we moved"},{"from":2527.65,"to":2529.99,"location":2,"content":"from logistic regression to the single"},{"from":2529.99,"to":2531.55,"location":2,"content":"task deep learning to the multi task"},{"from":2531.55,"to":2532.87,"location":2,"content":"deep learning we were able to get"},{"from":2532.87,"to":2534.94,"location":2,"content":"significantly better results and this"},{"from":2534.94,"to":2536.95,"location":2,"content":"was true both in the suicide risk case"},{"from":2536.95,"to":2539.47,"location":2,"content":"where we had a lot of data as well as"},{"from":2539.47,"to":2541.18,"location":2,"content":"the post traumatic stress disorder case"},{"from":2541.18,"to":2543.49,"location":2,"content":"where we had very little data the"},{"from":2543.49,"to":2545.23,"location":2,"content":"behavior here was a little bit different"},{"from":2545.23,"to":2548.29,"location":2,"content":"so going from logistic regression to"},{"from":2548.29,"to":2551.68,"location":2,"content":"single task deep learning when we had a"},{"from":2551.68,"to":2555.07,"location":2,"content":"lot of data as we did with the suicide"},{"from":2555.07,"to":2557.71,"location":2,"content":"risk had the single task deep learning"},{"from":2557.71,"to":2559.63,"location":2,"content":"model working better than the logistic"},{"from":2559.63,"to":2561.7,"location":2,"content":"regression model but when we had very"},{"from":2561.7,"to":2563.89,"location":2,"content":"few instances this is where the deep"},{"from":2563.89,"to":2565.48,"location":2,"content":"learning models really struggled a lot"},{"from":2565.48,"to":2568.75,"location":2,"content":"more and so the logistic regression"},{"from":2568.75,"to":2570.55,"location":2,"content":"models were actually much better but"},{"from":2570.55,"to":2572.11,"location":2,"content":"once we started adding heads for the"},{"from":2572.11,"to":2573.76,"location":2,"content":"comorbid different kinds of conditions"},{"from":2573.76,"to":2576.04,"location":2,"content":"the different kinds of tasks and that"},{"from":2576.04,"to":2577.93,"location":2,"content":"related to you know whether or not the"},{"from":2577.93,"to":2579.46,"location":2,"content":"person might be committing suicide and"},{"from":2579.46,"to":2582.1,"location":2,"content":"we were able to bump the accuracy way"},{"from":2582.1,"to":2586.21,"location":2,"content":"back up again and you know it's roughly"},{"from":2586.21,"to":2588.31,"location":2,"content":"120 at-risk individuals that we were"},{"from":2588.31,"to":2590.41,"location":2,"content":"able to collect in the suicide case that"},{"from":2590.41,"to":2591.76,"location":2,"content":"we wouldn't have otherwise been able to"},{"from":2591.76,"to":2597.94,"location":2,"content":"to notice as being at risk one of the"},{"from":2597.94,"to":2599.86,"location":2,"content":"approaches we took in this was to"},{"from":2599.86,"to":2601.87,"location":2,"content":"contextualize and consider the ethical"},{"from":2601.87,"to":2603.82,"location":2,"content":"dimensions of releasing this kind of"},{"from":2603.82,"to":2607.03,"location":2,"content":"technology so it's really common in NLP"},{"from":2607.03,"to":2609.82,"location":2,"content":"papers to give examples but this was 
an"},{"from":2609.82,"to":2611.29,"location":2,"content":"area where we decided that giving"},{"from":2611.29,"to":2613.15,"location":2,"content":"examples of like depressed language"},{"from":2613.15,"to":2615.43,"location":2,"content":"could be used to discriminate against"},{"from":2615.43,"to":2618.01,"location":2,"content":"people like add you know job interviews"},{"from":2618.01,"to":2619.93,"location":2,"content":"or something like that you know the sort"},{"from":2619.93,"to":2622.54,"location":2,"content":"of armchair psychology approach so we"},{"from":2622.54,"to":2624.1,"location":2,"content":"decided that while it was important to"},{"from":2624.1,"to":2625.99,"location":2,"content":"talk about the technique and the utility"},{"from":2625.99,"to":2627.64,"location":2,"content":"of multitask learning in a clinical"},{"from":2627.64,"to":2630.22,"location":2,"content":"domain and for bringing an inclusion of"},{"from":2630.22,"to":2632.47,"location":2,"content":"underrepresented subgroups it had to be"},{"from":2632.47,"to":2633.85,"location":2,"content":"balanced with the fact that there was a"},{"from":2633.85,"to":2636.82,"location":2,"content":"lot of risk in talking about depression"},{"from":2636.82,"to":2638.68,"location":2,"content":"and anxiety and how those kinds of"},{"from":2638.68,"to":2639.91,"location":2,"content":"things could be predicted"},{"from":2639.91,"to":2642.1,"location":2,"content":"so we tried to take a more balanced"},{"from":2642.1,"to":2644.35,"location":2,"content":"approach here and since then I've been"},{"from":2644.35,"to":2646.21,"location":2,"content":"putting ethical considerations in all of"},{"from":2646.21,"to":2648.4,"location":2,"content":"my papers it's becoming more and more"},{"from":2648.4,"to":2653.2,"location":2,"content":"common actually so another kind of"},{"from":2653.2,"to":2655.06,"location":2,"content":"approach that's now turning this on its"},{"from":2655.06,"to":2657.37,"location":2,"content":"head or you're trying to remove some"},{"from":2657.37,"to":2660.24,"location":2,"content":"effect mitigate bias in some way is"},{"from":2660.24,"to":2662.8,"location":2,"content":"adversarial multitask learning so I just"},{"from":2662.8,"to":2664.12,"location":2,"content":"talked about multitask learning now"},{"from":2664.12,"to":2665.64,"location":2,"content":"let's talk about the adversarial case"},{"from":2665.64,"to":2668.68,"location":2,"content":"and the idea in the adversarial case is"},{"from":2668.68,"to":2671.11,"location":2,"content":"that you have a few heads and one is"},{"from":2671.11,"to":2672.85,"location":2,"content":"predicting the main task and the other"},{"from":2672.85,"to":2674.53,"location":2,"content":"one is predicting the thing that you"},{"from":2674.53,"to":2677.38,"location":2,"content":"don't want to be affecting your models"},{"from":2677.38,"to":2679.69,"location":2,"content":"predictions so for example something"},{"from":2679.69,"to":2681.37,"location":2,"content":"like whether or not someone should be"},{"from":2681.37,"to":2683.56,"location":2,"content":"promoted based on you know their"},{"from":2683.56,"to":2685.84,"location":2,"content":"performance reviews and things like that"},{"from":2685.84,"to":2688.57,"location":2,"content":"and you don't want that to be affected"},{"from":2688.57,"to":2690.76,"location":2,"content":"by those gender ideally gender is"},{"from":2690.76,"to":2693.1,"location":2,"content":"independent of a promotion decision 
and"},{"from":2693.1,"to":2696.25,"location":2,"content":"so you can you can create a model for"},{"from":2696.25,"to":2698.17,"location":2,"content":"this that actually puts that"},{"from":2698.17,"to":2701.41,"location":2,"content":"independence criteria in place by saying"},{"from":2701.41,"to":2704.8,"location":2,"content":"I want to minimize my loss on the"},{"from":2704.8,"to":2707.05,"location":2,"content":"promotion while maximizing my loss on"},{"from":2707.05,"to":2709.03,"location":2,"content":"the gender and so how we're doing that"},{"from":2709.03,"to":2710.65,"location":2,"content":"is just predicting gender and then"},{"from":2710.65,"to":2713.44,"location":2,"content":"negating the gradient so removing the"},{"from":2713.44,"to":2716.53,"location":2,"content":"effect of that single it's this is"},{"from":2716.53,"to":2718.45,"location":2,"content":"another adversary approach so you might"},{"from":2718.45,"to":2720.04,"location":2,"content":"have been familiar with like generative"},{"from":2720.04,"to":2721.96,"location":2,"content":"adversarial networks so this is like two"},{"from":2721.96,"to":2724.62,"location":2,"content":"discriminators two different task heads"},{"from":2724.62,"to":2727.27,"location":2,"content":"where one is trying to do the task that"},{"from":2727.27,"to":2729.13,"location":2,"content":"we care about and the other one is"},{"from":2729.13,"to":2731.47,"location":2,"content":"removing the signal that we really don't"},{"from":2731.47,"to":2734.62,"location":2,"content":"want to be coming into play in our"},{"from":2734.62,"to":2737.02,"location":2,"content":"downstream predictions so this is a way"},{"from":2737.02,"to":2739.75,"location":2,"content":"of kind of putting this into practice so"},{"from":2739.75,"to":2741.97,"location":2,"content":"the probability of your output predicted"},{"from":2741.97,"to":2744.31,"location":2,"content":"output given the ground truth and you're"},{"from":2744.31,"to":2747.16,"location":2,"content":"sensitive attribute like gender is equal"},{"from":2747.16,"to":2749.29,"location":2,"content":"across all the different sensitive"},{"from":2749.29,"to":2750.7,"location":2,"content":"attributes or equal across all the"},{"from":2750.7,"to":2753.76,"location":2,"content":"different genders and that's an example"},{"from":2753.76,"to":2755.41,"location":2,"content":"of equality of opportunity and"},{"from":2755.41,"to":2757,"location":2,"content":"supervised learning being put into"},{"from":2757,"to":2758.92,"location":2,"content":"practice so this is one of the key"},{"from":2758.92,"to":2761.31,"location":2,"content":"fairness definitions it's equivalent to"},{"from":2761.31,"to":2763.78,"location":2,"content":"equal recall across different subgroups"},{"from":2763.78,"to":2766.15,"location":2,"content":"as I mentioned earlier and that's a"},{"from":2766.15,"to":2768.46,"location":2,"content":"model that will actually implement that"},{"from":2768.46,"to":2770.89,"location":2,"content":"or help you achieve that where you're"},{"from":2770.89,"to":2772.21,"location":2,"content":"saying that a classifiers output"},{"from":2772.21,"to":2773.62,"location":2,"content":"decision should be the same"},{"from":2773.62,"to":2776.32,"location":2,"content":"sensitive characteristics given what the"},{"from":2776.32,"to":2781.42,"location":2,"content":"what the correct decision should be okay"},{"from":2781.42,"to":2787.96,"location":2,"content":"so how are we on time there 
any"},{"from":2787.96,"to":2788.74,"location":2,"content":"questions so far"},{"from":2788.74,"to":2792.52,"location":2,"content":"we good okay cool so I'm gonna go into a"},{"from":2792.52,"to":2793.57,"location":2,"content":"little bit of a case study now and"},{"from":2793.57,"to":2796.6,"location":2,"content":"end-to-end system that Google has been"},{"from":2796.6,"to":2798.37,"location":2,"content":"working on my colleagues have been"},{"from":2798.37,"to":2800.83,"location":2,"content":"working on that is an NLP domain and"},{"from":2800.83,"to":2804.43,"location":2,"content":"deals with some of these bias issues so"},{"from":2804.43,"to":2806.77,"location":2,"content":"you can find out more about this work in"},{"from":2806.77,"to":2810.01,"location":2,"content":"papers at AI a s and 2018 and fat star"},{"from":2810.01,"to":2812.98,"location":2,"content":"tutorial 2019 called measuring and"},{"from":2812.98,"to":2814.72,"location":2,"content":"mitigating unintended bias and text"},{"from":2814.72,"to":2818.14,"location":2,"content":"classification and this came out of"},{"from":2818.14,"to":2821.89,"location":2,"content":"conversation AI which is a which is a"},{"from":2821.89,"to":2825.82,"location":2,"content":"product that's like it's part of this"},{"from":2825.82,"to":2828.19,"location":2,"content":"it's called a bet at Google it's a kind"},{"from":2828.19,"to":2830.74,"location":2,"content":"of spin-off company called jigsaw that"},{"from":2830.74,"to":2833.44,"location":2,"content":"focuses on trying to like combat abuse"},{"from":2833.44,"to":2836.83,"location":2,"content":"online and the conversation AI team is"},{"from":2836.83,"to":2838.27,"location":2,"content":"trying to use deep learning to improve"},{"from":2838.27,"to":2842.38,"location":2,"content":"online conversations and collaborate"},{"from":2842.38,"to":2844.27,"location":2,"content":"with a ton of different different people"},{"from":2844.27,"to":2847.93,"location":2,"content":"to do that so how this works is oh you"},{"from":2847.93,"to":2849.7,"location":2,"content":"can try it out to on perspective API"},{"from":2849.7,"to":2852.61,"location":2,"content":"comm so given some phrase like you're a"},{"from":2852.61,"to":2855.67,"location":2,"content":"dork it puts out a toxicity score"},{"from":2855.67,"to":2860.7,"location":2,"content":"associated to that like point nine one"},{"from":2860.7,"to":2863.23,"location":2,"content":"and the model starts sort of falsely"},{"from":2863.23,"to":2865.06,"location":2,"content":"associating frequently attacked"},{"from":2865.06,"to":2867.91,"location":2,"content":"identities with toxicity so this is a"},{"from":2867.91,"to":2871,"location":2,"content":"kind of false positive bias so I'm a"},{"from":2871,"to":2873.88,"location":2,"content":"proud tall person gets a model toxicity"},{"from":2873.88,"to":2877.78,"location":2,"content":"score of 0.18 I'm a proud gay person"},{"from":2877.78,"to":2880.81,"location":2,"content":"gets a toxicity model score of 0.69 and"},{"from":2880.81,"to":2883.84,"location":2,"content":"this is because these the term DEA tends"},{"from":2883.84,"to":2886.54,"location":2,"content":"to be used in really toxic situations"},{"from":2886.54,"to":2888.97,"location":2,"content":"and so the model starts to learn that"},{"from":2888.97,"to":2891.49,"location":2,"content":"gay itself is toxic but that's not"},{"from":2891.49,"to":2893.2,"location":2,"content":"actually what we want and we don't 
want"},{"from":2893.2,"to":2894.82,"location":2,"content":"these kinds of predictions coming out of"},{"from":2894.82,"to":2900.61,"location":2,"content":"the model so the bias is largely caused"},{"from":2900.61,"to":2903.01,"location":2,"content":"here by the data set imbalance again"},{"from":2903.01,"to":2904.75,"location":2,"content":"this is data kind of coming and running"},{"from":2904.75,"to":2905.62,"location":2,"content":"it's had again"},{"from":2905.62,"to":2908.62,"location":2,"content":"and so frequently attacks identities are"},{"from":2908.62,"to":2910.72,"location":2,"content":"really over-represented in toxic"},{"from":2910.72,"to":2912.22,"location":2,"content":"comments there's a lot of toxicity"},{"from":2912.22,"to":2915.52,"location":2,"content":"towards lbgtq identities it's really"},{"from":2915.52,"to":2917.17,"location":2,"content":"horrible to work on this stuff that like"},{"from":2917.17,"to":2921.18,"location":2,"content":"really can really affect you personally"},{"from":2921.18,"to":2924.34,"location":2,"content":"and one of the approaches that the team"},{"from":2924.34,"to":2927.31,"location":2,"content":"took was just to add non-toxic data from"},{"from":2927.31,"to":2930.04,"location":2,"content":"Wikipedia so helping to helping the"},{"from":2930.04,"to":2932.38,"location":2,"content":"model to understand that these kinds of"},{"from":2932.38,"to":2935.26,"location":2,"content":"terms can be used in you know more"},{"from":2935.26,"to":2942.07,"location":2,"content":"positive sorts of contexts one of the"},{"from":2942.07,"to":2944.62,"location":2,"content":"challenges with measuring how well the"},{"from":2944.62,"to":2946.24,"location":2,"content":"system was doing is that there's not a"},{"from":2946.24,"to":2949.63,"location":2,"content":"really nice way to have controlled"},{"from":2949.63,"to":2952.78,"location":2,"content":"toxicity evaluation so in real world"},{"from":2952.78,"to":2954.73,"location":2,"content":"conversation it can be kind of anyone's"},{"from":2954.73,"to":2958,"location":2,"content":"guess what the toxicity is of a specific"},{"from":2958,"to":2960.16,"location":2,"content":"sentence if you really want to control"},{"from":2960.16,"to":2962.46,"location":2,"content":"for different kind of subgroups or"},{"from":2962.46,"to":2964.39,"location":2,"content":"intersectional subgroups and it can be"},{"from":2964.39,"to":2967.06,"location":2,"content":"even harder to get real good data to"},{"from":2967.06,"to":2969.61,"location":2,"content":"evaluate properly so what the team ended"},{"from":2969.61,"to":2971.53,"location":2,"content":"up doing was developing a synthetic data"},{"from":2971.53,"to":2974.02,"location":2,"content":"approach so this is kind of like a bias"},{"from":2974.02,"to":2976.63,"location":2,"content":"mad libs where you take template"},{"from":2976.63,"to":2978.79,"location":2,"content":"sentences and you use those for"},{"from":2978.79,"to":2981.79,"location":2,"content":"evaluation this is a kind of evaluation"},{"from":2981.79,"to":2983.98,"location":2,"content":"you'd want to use in addition to your"},{"from":2983.98,"to":2987.88,"location":2,"content":"target downstream kind of data set but"},{"from":2987.88,"to":2990.37,"location":2,"content":"this helps you get at the biases"},{"from":2990.37,"to":2993.64,"location":2,"content":"specifically so some template phrase"},{"from":2993.64,"to":2996.13,"location":2,"content":"like I'm a proud blank person and 
then"},{"from":2996.13,"to":2998.02,"location":2,"content":"filling in different subgroup identities"},{"from":2998.02,"to":3000.45,"location":2,"content":"and you don't want to release a model"},{"from":3000.45,"to":3002.46,"location":2,"content":"unless you see that the scores across"},{"from":3002.46,"to":3005.61,"location":2,"content":"these different kinds of these different"},{"from":3005.61,"to":3007.08,"location":2,"content":"kinds of template sentences with"},{"from":3007.08,"to":3009.72,"location":2,"content":"synthetic the synthetic template"},{"from":3009.72,"to":3012.3,"location":2,"content":"sentences are relatively kind of the"},{"from":3012.3,"to":3015.09,"location":2,"content":"same across yeah all of the different"},{"from":3015.09,"to":3021.57,"location":2,"content":"model runs cool so some assumptions that"},{"from":3021.57,"to":3025.19,"location":2,"content":"they made in this was that the data set"},{"from":3025.19,"to":3027.75,"location":2,"content":"didn't have annotated bias and they"},{"from":3027.75,"to":3029.43,"location":2,"content":"didn't do any causal analysis because"},{"from":3029.43,"to":3030.48,"location":2,"content":"they were just trying to focus in"},{"from":3030.48,"to":3034.76,"location":2,"content":"particular on this toxicity problem and"},{"from":3034.76,"to":3038.88,"location":2,"content":"they used a CNN convolutional yeah you"},{"from":3038.88,"to":3040.14,"location":2,"content":"guys know"},{"from":3040.14,"to":3041.73,"location":2,"content":"with pre-trained glove embeddings this"},{"from":3041.73,"to":3042.84,"location":2,"content":"is probably like your bread and butter"},{"from":3042.84,"to":3044.46,"location":2,"content":"fruit and gloves embeddings I'm sure you"},{"from":3044.46,"to":3046.14,"location":2,"content":"know all about this and we're Tyvek cool"},{"from":3046.14,"to":3050.24,"location":2,"content":"curious implementation of this and and"},{"from":3050.24,"to":3052.5,"location":2,"content":"using these kind of data augmentation"},{"from":3052.5,"to":3055.71,"location":2,"content":"approaches both a Wikipedia kind of"},{"from":3055.71,"to":3057.9,"location":2,"content":"approach as well as actually collecting"},{"from":3057.9,"to":3060.6,"location":2,"content":"positive statements about LGBTQ identity"},{"from":3060.6,"to":3062.58,"location":2,"content":"so there's this project called Project"},{"from":3062.58,"to":3065.1,"location":2,"content":"respected Google where we go out and and"},{"from":3065.1,"to":3067.14,"location":2,"content":"talk to to people who identify as queer"},{"from":3067.14,"to":3069,"location":2,"content":"or people who have friends who do and"},{"from":3069,"to":3070.98,"location":2,"content":"like talk about this in a positive way"},{"from":3070.98,"to":3073.83,"location":2,"content":"and we add this as data so we can"},{"from":3073.83,"to":3075.66,"location":2,"content":"actually know that this is can be a"},{"from":3075.66,"to":3079.47,"location":2,"content":"positive thing and in order to measure"},{"from":3079.47,"to":3082.08,"location":2,"content":"the model performance here again it's"},{"from":3082.08,"to":3083.76,"location":2,"content":"looking at the differences across"},{"from":3083.76,"to":3085.47,"location":2,"content":"different subgroups and trying to"},{"from":3085.47,"to":3087.78,"location":2,"content":"compare also the subgroup performance to"},{"from":3087.78,"to":3089.94,"location":2,"content":"some sort of general distribution 
so"},{"from":3089.94,"to":3092.73,"location":2,"content":"here they use a UC where a UC is"},{"from":3092.73,"to":3094.44,"location":2,"content":"essentially the probability that a model"},{"from":3094.44,"to":3097.17,"location":2,"content":"will give a randomly selected positive"},{"from":3097.17,"to":3099.66,"location":2,"content":"example a higher score than a randomly"},{"from":3099.66,"to":3103.23,"location":2,"content":"selected a negative example so here you"},{"from":3103.23,"to":3105.03,"location":2,"content":"can see some toxic comments and non"},{"from":3105.03,"to":3107.52,"location":2,"content":"toxic comments with that example sort of"},{"from":3107.52,"to":3112.35,"location":2,"content":"low a you see here this is a example"},{"from":3112.35,"to":3114.57,"location":2,"content":"with a high AUC so the model is doing a"},{"from":3114.57,"to":3116.25,"location":2,"content":"relatively good job separating these two"},{"from":3116.25,"to":3119.94,"location":2,"content":"kinds of comments and there are"},{"from":3119.94,"to":3121.23,"location":2,"content":"different kinds of biases that they've"},{"from":3121.23,"to":3124.02,"location":2,"content":"defined in this work so low subgroup"},{"from":3124.02,"to":3125.46,"location":2,"content":"performance means that the model"},{"from":3125.46,"to":3127.38,"location":2,"content":"performs worse on subgroup comments than"},{"from":3127.38,"to":3129.93,"location":2,"content":"it does on comments overall and the"},{"from":3129.93,"to":3131.31,"location":2,"content":"metric they've introduced to measure"},{"from":3131.31,"to":3134.84,"location":2,"content":"this is called subgroup a you see"},{"from":3134.84,"to":3137.67,"location":2,"content":"another one is subgroup shift and that's"},{"from":3137.67,"to":3139.17,"location":2,"content":"when the model systematically scores"},{"from":3139.17,"to":3142.68,"location":2,"content":"comments from some subgroup higher so"},{"from":3142.68,"to":3144.87,"location":2,"content":"this is sort of like to the right and"},{"from":3144.87,"to":3147.3,"location":2,"content":"then there's also this background"},{"from":3147.3,"to":3149.55,"location":2,"content":"positive subgroup negatives shifting to"},{"from":3149.55,"to":3156.33,"location":2,"content":"the left yeah yeah this sort of saying"},{"from":3156.33,"to":3157.77,"location":2,"content":"when I said it can go either way to the"},{"from":3157.77,"to":3159.12,"location":2,"content":"right or the left and there's just kind"},{"from":3159.12,"to":3160.95,"location":2,"content":"of different metrics that can define"},{"from":3160.95,"to":3166.87,"location":2,"content":"each of these and the results in this"},{"from":3166.87,"to":3168.88,"location":2,"content":"sort of going through not only just"},{"from":3168.88,"to":3171.01,"location":2,"content":"looking at you know qualitative examples"},{"from":3171.01,"to":3173.74,"location":2,"content":"and general evaluation metrics but also"},{"from":3173.74,"to":3175.48,"location":2,"content":"focusing in on some of the key metrics"},{"from":3175.48,"to":3177.31,"location":2,"content":"defined for this work these sort of AUC"},{"from":3177.31,"to":3179.32,"location":2,"content":"based approaches and they were able to"},{"from":3179.32,"to":3181.09,"location":2,"content":"see significant differences in the"},{"from":3181.09,"to":3183.22,"location":2,"content":"original release which didn't account"},{"from":3183.22,"to":3185.11,"location":2,"content":"for any of these unintended biases 
and"},{"from":3185.11,"to":3187.63,"location":2,"content":"downstream releases which did which"},{"from":3187.63,"to":3189.85,"location":2,"content":"incorporated this kind of normative data"},{"from":3189.85,"to":3192.01,"location":2,"content":"that said the sort of things that we"},{"from":3192.01,"to":3195.63,"location":2,"content":"thought the model should be learning"},{"from":3195.63,"to":3199.36,"location":2,"content":"cool so the last thing to keep in mind"},{"from":3199.36,"to":3201.58,"location":2,"content":"as you sort of develop and work towards"},{"from":3201.58,"to":3204.43,"location":2,"content":"a creating deeper better models is to"},{"from":3204.43,"to":3207.46,"location":2,"content":"release responsibly so this is a project"},{"from":3207.46,"to":3208.72,"location":2,"content":"I've been working on with a ton of"},{"from":3208.72,"to":3210.49,"location":2,"content":"different people called model cards for"},{"from":3210.49,"to":3212.89,"location":2,"content":"model reporting it's a it's a little bit"},{"from":3212.89,"to":3215.17,"location":2,"content":"of like the next step after data sheets"},{"from":3215.17,"to":3218.47,"location":2,"content":"for datasets where data sheets for"},{"from":3218.47,"to":3220.63,"location":2,"content":"datasets focus on information about the"},{"from":3220.63,"to":3223.27,"location":2,"content":"data model cards for model reporting"},{"from":3223.27,"to":3225.27,"location":2,"content":"focuses on information about the model"},{"from":3225.27,"to":3228.52,"location":2,"content":"so it captures what it does how it works"},{"from":3228.52,"to":3232.27,"location":2,"content":"why it matters and one of the key ideas"},{"from":3232.27,"to":3234.52,"location":2,"content":"here is disaggregated and intersectional"},{"from":3234.52,"to":3237.85,"location":2,"content":"evaluation so it's not enough anymore"},{"from":3237.85,"to":3239.62,"location":2,"content":"to put out human centered technology"},{"from":3239.62,"to":3242.68,"location":2,"content":"that just has some vague overall score"},{"from":3242.68,"to":3244.12,"location":2,"content":"associated to it"},{"from":3244.12,"to":3245.71,"location":2,"content":"you actually need to understand how it"},{"from":3245.71,"to":3247.06,"location":2,"content":"works across different subpopulations"},{"from":3247.06,"to":3249.49,"location":2,"content":"and you have to understand what the data"},{"from":3249.49,"to":3252.94,"location":2,"content":"is telling you that um so here's some"},{"from":3252.94,"to":3255.22,"location":2,"content":"example details that model card would"},{"from":3255.22,"to":3257.65,"location":2,"content":"have who it's developed by what the"},{"from":3257.65,"to":3259.93,"location":2,"content":"intended uses so that it doesn't start"},{"from":3259.93,"to":3261.4,"location":2,"content":"being used in ways that it's not"},{"from":3261.4,"to":3264.04,"location":2,"content":"intended to be used the factors that are"},{"from":3264.04,"to":3265.87,"location":2,"content":"likely to be affected by"},{"from":3265.87,"to":3267.55,"location":2,"content":"disproportionate performance of the"},{"from":3267.55,"to":3270.04,"location":2,"content":"model so different kinds of identity"},{"from":3270.04,"to":3273.16,"location":2,"content":"groups things like that the metrics that"},{"from":3273.16,"to":3275.41,"location":2,"content":"that you're deciding to use in order to"},{"from":3275.41,"to":3277.12,"location":2,"content":"understand the fairness of the model 
or"},{"from":3277.12,"to":3279.43,"location":2,"content":"the different performance of the model"},{"from":3279.43,"to":3281.08,"location":2,"content":"across different kinds of subgroups and"},{"from":3281.08,"to":3283.66,"location":2,"content":"factors information about the evaluation"},{"from":3283.66,"to":3287.23,"location":2,"content":"data and training data as well as"},{"from":3287.23,"to":3289.9,"location":2,"content":"ethical considerations so what were some"},{"from":3289.9,"to":3291.64,"location":2,"content":"of the things you took into account or"},{"from":3291.64,"to":3293.55,"location":2,"content":"what are some of the risks and benefits"},{"from":3293.55,"to":3297.25,"location":2,"content":"that that are relevant to this model and"},{"from":3297.25,"to":3299.2,"location":2,"content":"additional caveats and recommendations"},{"from":3299.2,"to":3300.46,"location":2,"content":"so for example"},{"from":3300.46,"to":3302.89,"location":2,"content":"in the conversation the eye case they're"},{"from":3302.89,"to":3304.72,"location":2,"content":"working with synthetic data so this is"},{"from":3304.72,"to":3306.64,"location":2,"content":"the sort of limitation of the evaluation"},{"from":3306.64,"to":3308.95,"location":2,"content":"that's important to understand because"},{"from":3308.95,"to":3310.87,"location":2,"content":"it can tell you a lot about the biases"},{"from":3310.87,"to":3312.49,"location":2,"content":"but doesn't tell you a lot about how it"},{"from":3312.49,"to":3318.01,"location":2,"content":"works generally and then the key"},{"from":3318.01,"to":3320.56,"location":2,"content":"component in the quantitative section of"},{"from":3320.56,"to":3322.06,"location":2,"content":"the model card is to have this both"},{"from":3322.06,"to":3323.74,"location":2,"content":"intersectional and disaggregated"},{"from":3323.74,"to":3325.81,"location":2,"content":"evaluation and from here you trivially"},{"from":3325.81,"to":3327.34,"location":2,"content":"get two different kinds of fairness"},{"from":3327.34,"to":3329.8,"location":2,"content":"definitions the closer you get to parity"},{"from":3329.8,"to":3331.57,"location":2,"content":"across subgroups the closer you're"},{"from":3331.57,"to":3332.71,"location":2,"content":"getting to something that is"},{"from":3332.71,"to":3336.94,"location":2,"content":"mathematically fair okay so hopefully by"},{"from":3336.94,"to":3338.2,"location":2,"content":"paying attention to these kinds of"},{"from":3338.2,"to":3340.36,"location":2,"content":"approaches taking into account all these"},{"from":3340.36,"to":3341.8,"location":2,"content":"kinds of things we can move from"},{"from":3341.8,"to":3344.14,"location":2,"content":"majority representation of data in our"},{"from":3344.14,"to":3346.24,"location":2,"content":"models to something more like diverse"},{"from":3346.24,"to":3349.99,"location":2,"content":"representation for more ethical AI okay"},{"from":3349.99,"to":3350.77,"location":2,"content":"that's it"},{"from":3350.77,"to":3352.2,"location":2,"content":"Thanks"},{"from":3352.2,"to":3359.89,"location":2,"content":"[Applause]"}]} \ No newline at end of file diff --git a/bcc-en/2.bcc b/bcc-en/2.bcc new file mode 100644 index 0000000000000000000000000000000000000000..6c9d4a0cf440cb336d92fc7ff8111e33ee1a8fc3 --- /dev/null +++ b/bcc-en/2.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.59,"to":8.22,"location":2,"content":"okay hello 
everyone and welcome back to"},{"from":8.22,"to":13.5,"location":2,"content":"the second class of cs2 24in okay so"},{"from":13.5,"to":16.11,"location":2,"content":"right at the end of last time I was just"},{"from":16.11,"to":19.41,"location":2,"content":"showing you a little from this ipython"},{"from":19.41,"to":21.09,"location":2,"content":"notebook of things that you could do"},{"from":21.09,"to":22.95,"location":2,"content":"with word vectors but I kind of ran out"},{"from":22.95,"to":26.46,"location":2,"content":"of time a little bit so I'll just spend"},{"from":26.46,"to":28.35,"location":2,"content":"a couple of more minutes first I'm"},{"from":28.35,"to":30.45,"location":2,"content":"showing the end of this I stuck this"},{"from":30.45,"to":32.88,"location":2,"content":"ipython notebook up on the course page"},{"from":32.88,"to":35.31,"location":2,"content":"so under lecture 1 you can find a copy"},{"from":35.31,"to":38.01,"location":2,"content":"of it and you can download it so I both"},{"from":38.01,"to":40.44,"location":2,"content":"stuck up just an HTML version of it and"},{"from":40.44,"to":43.68,"location":2,"content":"a zip file like HTML file is only good"},{"from":43.68,"to":45.24,"location":2,"content":"to look at you can't do anything with it"},{"from":45.24,"to":46.92,"location":2,"content":"so you want to if you want to play with"},{"from":46.92,"to":49.53,"location":2,"content":"it by yourself download the zip file and"},{"from":49.53,"to":52.53,"location":2,"content":"get the ipython notebook out of that ok"},{"from":52.53,"to":54.45,"location":2,"content":"so we were looking at these glove word"},{"from":54.45,"to":56.19,"location":2,"content":"vectors which I'll talk about a bit more"},{"from":56.19,"to":58.44,"location":2,"content":"today and so there are these sort of"},{"from":58.44,"to":61.26,"location":2,"content":"basic results of similarity in this"},{"from":61.26,"to":65.24,"location":2,"content":"vector space worked very nicely for"},{"from":65.24,"to":69.87,"location":2,"content":"discovering similar words and then going"},{"from":69.87,"to":71.46,"location":2,"content":"on from that there was this idea that"},{"from":71.46,"to":73.47,"location":2,"content":"we'll spend some more time on today"},{"from":73.47,"to":77.67,"location":2,"content":"which was maybe this vector space is not"},{"from":77.67,"to":80.4,"location":2,"content":"only a similarity space we're close"},{"from":80.4,"to":83.19,"location":2,"content":"together things have similar meaning but"},{"from":83.19,"to":85.95,"location":2,"content":"it actually captures meaning and a"},{"from":85.95,"to":87.78,"location":2,"content":"considerably deeper and more profound"},{"from":87.78,"to":90.78,"location":2,"content":"way which is to say that there are"},{"from":90.78,"to":93.84,"location":2,"content":"actually directions in the space that"},{"from":93.84,"to":96.51,"location":2,"content":"you can point which have a certain"},{"from":96.51,"to":98.97,"location":2,"content":"meaning so that if you're pointing in"},{"from":98.97,"to":102.93,"location":2,"content":"one direction it means this is more so"},{"from":102.93,"to":104.85,"location":2,"content":"the case if you're pointing in a"},{"from":104.85,"to":106.71,"location":2,"content":"different direction in the meaning space"},{"from":106.71,"to":109.29,"location":2,"content":"it might be this is the capital of this"},{"from":109.29,"to":111.72,"location":2,"content":"country or all sorts of 
different"},{"from":111.72,"to":113.7,"location":2,"content":"meanings could be in code in the space"},{"from":113.7,"to":117.45,"location":2,"content":"and a way of testing that is to use"},{"from":117.45,"to":120.21,"location":2,"content":"these analogy problems and I quickly"},{"from":120.21,"to":122.58,"location":2,"content":"show this at the end but just to make"},{"from":122.58,"to":124.35,"location":2,"content":"sure everyone got it since it's sort of"},{"from":124.35,"to":127.14,"location":2,"content":"it's sort of a clever thing right so the"},{"from":127.14,"to":130.62,"location":2,"content":"idea is that we're going to start with a"},{"from":130.62,"to":135.18,"location":2,"content":"pair of words like King and man and so"},{"from":135.18,"to":136.71,"location":2,"content":"what we're going to do is we're going to"},{"from":136.71,"to":137.98,"location":2,"content":"say well there's a vector"},{"from":137.98,"to":141.46,"location":2,"content":"King in the space and there's a vector"},{"from":141.46,"to":145.24,"location":2,"content":"for man in the space and but what we're"},{"from":145.24,"to":147.52,"location":2,"content":"going to do is we're going to subtract"},{"from":147.52,"to":149.65,"location":2,"content":"as in just good old vector subtraction"},{"from":149.65,"to":151.62,"location":2,"content":"that you hopefully learned in your"},{"from":151.62,"to":154.27,"location":2,"content":"linear algebra class we're going to"},{"from":154.27,"to":156.34,"location":2,"content":"subtract the man vector from the king"},{"from":156.34,"to":158.8,"location":2,"content":"vector and the idea we have in our head"},{"from":158.8,"to":162.04,"location":2,"content":"then is if we do that what will happens"},{"from":162.04,"to":164.68,"location":2,"content":"we'll be left with the meaning of"},{"from":164.68,"to":170.26,"location":2,"content":"kingship without the madness and so then"},{"from":170.26,"to":173.71,"location":2,"content":"there's also a director - a vector for"},{"from":173.71,"to":176.68,"location":2,"content":"woman so we can add the woman vector to"},{"from":176.68,"to":178.93,"location":2,"content":"that resulting vector and then we could"},{"from":178.93,"to":181.42,"location":2,"content":"say well in the vector we end up at some"},{"from":181.42,"to":183.49,"location":2,"content":"point in the vector space and then we're"},{"from":183.49,"to":185.41,"location":2,"content":"going to say well what's the closest"},{"from":185.41,"to":187.99,"location":2,"content":"word that you can find to here and it's"},{"from":187.99,"to":190.03,"location":2,"content":"going to print out the closest word and"},{"from":190.03,"to":196.48,"location":2,"content":"as we saw last time lo and behold if you"},{"from":196.48,"to":200.71,"location":2,"content":"do that you get the answer and I'm"},{"from":200.71,"to":219.73,"location":2,"content":"saying you get King Nancy no wait I have"},{"from":219.73,"to":222.4,"location":2,"content":"to a first King and I sha sha sha sorry"},{"from":222.4,"to":225.64,"location":2,"content":"whoops yeah okay I kind of do it well"},{"from":225.64,"to":234.76,"location":2,"content":"like man King okay okay yeah that's"},{"from":234.76,"to":237.28,"location":2,"content":"right sorry okay yeah cuz it should be"},{"from":237.28,"to":239.71,"location":2,"content":"man as the King as woman as to something"},{"from":239.71,"to":242.47,"location":2,"content":"sorry yeah I was getting my order of"},{"from":242.47,"to":246.61,"location":2,"content":"components wrong 
okay and you know as I"},{"from":246.61,"to":248.77,"location":2,"content":"was sort of I guess I was showing some"},{"from":248.77,"to":251.65,"location":2,"content":"examples last time with nationality"},{"from":251.65,"to":255.7,"location":2,"content":"words but I mean this in a way that is"},{"from":255.7,"to":258.7,"location":2,"content":"sort of surprising too shocking this"},{"from":258.7,"to":261.49,"location":2,"content":"actually works for all kinds of things"},{"from":261.49,"to":263.77,"location":2,"content":"that you can get meaning in this space"},{"from":263.77,"to":267.76,"location":2,"content":"so I can ask various kinds of analogies"},{"from":267.76,"to":270.58,"location":2,"content":"of looser sort so I can say Australia is"},{"from":270.58,"to":271.51,"location":2,"content":"to be"},{"from":271.51,"to":275.56,"location":2,"content":"as France is to wine you might think why"},{"from":275.56,"to":276.94,"location":2,"content":"and what it gives back is champagne"},{"from":276.94,"to":279.64,"location":2,"content":"which seems a pretty good answer okay"},{"from":279.64,"to":284.02,"location":2,"content":"with that you can do more syntactic"},{"from":284.02,"to":286.93,"location":2,"content":"facts so I can say tall tall tall as the"},{"from":286.93,"to":289.99,"location":2,"content":"tallest as long as to longest and it"},{"from":289.99,"to":294.31,"location":2,"content":"gets that I say good is to fantastic as"},{"from":294.31,"to":297.76,"location":2,"content":"bad is to terrible then it seems to get"},{"from":297.76,"to":300.1,"location":2,"content":"out that there's some kind of notion of"},{"from":300.1,"to":303.76,"location":2,"content":"make more extremes direction and get"},{"from":303.76,"to":305.82,"location":2,"content":"this direction out I skipped over one"},{"from":305.82,"to":310.17,"location":2,"content":"Obama is two Clinton as Reagan is two"},{"from":310.17,"to":312.55,"location":2,"content":"you may or may not like the answer it"},{"from":312.55,"to":315.94,"location":2,"content":"gives for this one as Obama is to this"},{"from":315.94,"to":318.88,"location":2,"content":"Reagan is to Nixon now one thing you"},{"from":318.88,"to":321.4,"location":2,"content":"might notice at this point and this is"},{"from":321.4,"to":322.96,"location":2,"content":"something I actually want to come back"},{"from":322.96,"to":325.69,"location":2,"content":"to at the end well there's this problem"},{"from":325.69,"to":328.03,"location":2,"content":"because Clinton's ambiguous right"},{"from":328.03,"to":331.66,"location":2,"content":"there's Bill or live Hillary and I"},{"from":331.66,"to":335.68,"location":2,"content":"forget you know so this data as I said"},{"from":335.68,"to":337.78,"location":2,"content":"is a few years old so this data was done"},{"from":337.78,"to":341.98,"location":2,"content":"in 2014 so in sort of in it definitely"},{"from":341.98,"to":343.39,"location":2,"content":"doesn't have Trump really in it as a"},{"from":343.39,"to":345.64,"location":2,"content":"politician but you know it would have"},{"from":345.64,"to":348.4,"location":2,"content":"variously both Clinton's but as sort of"},{"from":348.4,"to":351.43,"location":2,"content":"make sense of probably for sort of proof"},{"from":351.43,"to":354.37,"location":2,"content":"for 2014 data that Bill Clinton"},{"from":354.37,"to":356.8,"location":2,"content":"dominated so I think what we're getting"},{"from":356.8,"to":360.97,"location":2,"content":"out of this is that Clinton and 
Nixon"},{"from":360.97,"to":362.92,"location":2,"content":"are sort of similar of people in dangers"},{"from":362.92,"to":368.53,"location":2,"content":"of being impeached and on both sides of"},{"from":368.53,"to":370.75,"location":2,"content":"the aisle and of thinking primarily of"},{"from":370.75,"to":373.57,"location":2,"content":"Bill Clinton but if this sort of brings"},{"from":373.57,"to":375.37,"location":2,"content":"up something that I'll come back to"},{"from":375.37,"to":378.34,"location":2,"content":"right at the end of it sort of looks"},{"from":378.34,"to":380.16,"location":2,"content":"like we've got a sort of a problem here"},{"from":380.16,"to":382.69,"location":2,"content":"because we just have this string"},{"from":382.69,"to":387.06,"location":2,"content":"literally Clinton and that string is any"},{"from":387.06,"to":391.36,"location":2,"content":"possible sense and meaning of the string"},{"from":391.36,"to":396.58,"location":2,"content":"Clinton and so minimally that we have"},{"from":396.58,"to":398.71,"location":2,"content":"Bill Clinton and Hillary Clinton but you"},{"from":398.71,"to":399.88,"location":2,"content":"know maybe you have some friends there"},{"from":399.88,"to":401.59,"location":2,"content":"called Clinton as well right and they're"},{"from":401.59,"to":404.11,"location":2,"content":"all mixed together in this Clinton and"},{"from":404.11,"to":405.1,"location":2,"content":"so that seems kind"},{"from":405.1,"to":406.9,"location":2,"content":"problematic and that sort of been an"},{"from":406.9,"to":408.37,"location":2,"content":"issue that's been discussed some for"},{"from":408.37,"to":410.2,"location":2,"content":"these word vectors and I'll come back to"},{"from":410.2,"to":413.56,"location":2,"content":"that another thing you can do is you can"},{"from":413.56,"to":415.9,"location":2,"content":"give a set of words and say which is the"},{"from":415.9,"to":417.94,"location":2,"content":"odd one out may be used to do puzzles"},{"from":417.94,"to":419.82,"location":2,"content":"like that in middle school or something"},{"from":419.82,"to":422.5,"location":2,"content":"and so you can do that and it decides"},{"from":422.5,"to":424.3,"location":2,"content":"that cereal is the outline out of that"},{"from":424.3,"to":427.42,"location":2,"content":"set mm-hmm seems okay and then one other"},{"from":427.42,"to":430.9,"location":2,"content":"thing I'll just show you is so it sort"},{"from":430.9,"to":432.55,"location":2,"content":"of be nice to look at these words as"},{"from":432.55,"to":435.28,"location":2,"content":"I've drawn them in some of the slide"},{"from":435.28,"to":437.47,"location":2,"content":"pictures so this is saying to put"},{"from":437.47,"to":439.78,"location":2,"content":"together a PCEHR principal components"},{"from":439.78,"to":444.58,"location":2,"content":"analysis scatter plot so I can do that"},{"from":444.58,"to":447.34,"location":2,"content":"and then I can say give it a set of"},{"from":447.34,"to":451.06,"location":2,"content":"words and draw me these as a scatter"},{"from":451.06,"to":455.32,"location":2,"content":"plot and hopefully if I can just about"},{"from":455.32,"to":458.83,"location":2,"content":"fit it in here is my scatter plot and it"},{"from":458.83,"to":460.42,"location":2,"content":"works pretty well right I've got the"},{"from":460.42,"to":462.73,"location":2,"content":"wine champagne beer up here then the"},{"from":462.73,"to":465.58,"location":2,"content":"coffee and tea here are the country 
easy"},{"from":465.58,"to":467.44,"location":2,"content":"as the school's college institute"},{"from":467.44,"to":473.49,"location":2,"content":"universities the animals are down here"},{"from":473.49,"to":476.8,"location":2,"content":"food stuffs there so yeah this sort of"},{"from":476.8,"to":478.42,"location":2,"content":"really does work with this tube"},{"from":478.42,"to":480.4,"location":2,"content":"Direction dimensional display it"},{"from":480.4,"to":484.6,"location":2,"content":"basically shows you similarity now there"},{"from":484.6,"to":487.81,"location":2,"content":"are you know to some extent though you"},{"from":487.81,"to":489.43,"location":2,"content":"want to hold on to your wallet with"},{"from":489.43,"to":491.59,"location":2,"content":"these PCA displays as I've discussed"},{"from":491.59,"to":493.78,"location":2,"content":"before since you are taking something"},{"from":493.78,"to":495.7,"location":2,"content":"that was a hundred dimensional and we're"},{"from":495.7,"to":497.89,"location":2,"content":"just doing this 2d projection there's"},{"from":497.89,"to":500.62,"location":2,"content":"capturing some of the major geometry of"},{"from":500.62,"to":503.53,"location":2,"content":"the space but it just has to be losing a"},{"from":503.53,"to":505.69,"location":2,"content":"huge amount of the information so when"},{"from":505.69,"to":508.48,"location":2,"content":"things end up close together they might"},{"from":508.48,"to":510.19,"location":2,"content":"be really close together in the original"},{"from":510.19,"to":512.32,"location":2,"content":"space or they might just have been words"},{"from":512.32,"to":515.5,"location":2,"content":"that lost in the 2d projection because"},{"from":515.5,"to":517.93,"location":2,"content":"there are other patterns that were more"},{"from":517.93,"to":520.03,"location":2,"content":"dominant and were chosen was the first"},{"from":520.03,"to":522.25,"location":2,"content":"two principal components so you sort of"},{"from":522.25,"to":524.53,"location":2,"content":"don't want to over trust these things"},{"from":524.53,"to":527.02,"location":2,"content":"and something if you like in phobias you"},{"from":527.02,"to":528.79,"location":2,"content":"might think about is how there are other"},{"from":528.79,"to":530.95,"location":2,"content":"ways that I might be able to represent"},{"from":530.95,"to":532.69,"location":2,"content":"the distances in a way that was more"},{"from":532.69,"to":535.75,"location":2,"content":"accurate anyway this is very simple to"},{"from":535.75,"to":537.85,"location":2,"content":"do right I'm just getting a PCA to"},{"from":537.85,"to":538.81,"location":2,"content":"reduce that"},{"from":538.81,"to":540.78,"location":2,"content":"analogy of the matrix and then"},{"from":540.78,"to":542.35,"location":2,"content":"transforming with it"},{"from":542.35,"to":544.71,"location":2,"content":"these word vectors and printing them"},{"from":544.71,"to":547.9,"location":2,"content":"it's mainly easy to do the bit that"},{"from":547.9,"to":550.48,"location":2,"content":"wasn't easy for me to do but if"},{"from":550.48,"to":552.57,"location":2,"content":"someone's got some clever Python"},{"from":552.57,"to":555.16,"location":2,"content":"plotting tips I'd like one if someone"},{"from":555.16,"to":557.26,"location":2,"content":"wants to send me a message after class I"},{"from":557.26,"to":558.76,"location":2,"content":"would have thought there'd be some"},{"from":558.76,"to":560.95,"location":2,"content":"default 
{"from":589.71,"to":591.73,"location":2,"content":"And if you haven't used the IPython"},{"from":591.73,"to":593.77,"location":2,"content":"notebooks before and you don't want your"},{"from":593.77,"to":595.99,"location":2,"content":"computer to run really slowly, it's a"},{"from":595.99,"to":598.09,"location":2,"content":"good idea to halt your IPython notebooks"},{"from":598.09,"to":599.38,"location":2,"content":"when you're not going to be using them"},{"from":599.38,"to":601.66,"location":2,"content":"anymore, especially if they're computing"},{"from":601.66,"to":625.73,"location":2,"content":"something. Okay."},{"from":625.73,"to":632.07,"location":2,"content":"Okay, so now lecture 2. And so for today"},{"from":632.07,"to":633.48,"location":2,"content":"we're going to keep on talking about"},{"from":633.48,"to":635.79,"location":2,"content":"things you can do with word vectors and"},{"from":635.79,"to":638.07,"location":2,"content":"say a little bit at the end about word"},{"from":638.07,"to":641.97,"location":2,"content":"senses. So in more detail, I'm gonna say"},{"from":641.97,"to":646.05,"location":2,"content":"a bit more about word2vec. I'm going"},{"from":646.05,"to":648.03,"location":2,"content":"to have a sort of a very brief excursion"},{"from":648.03,"to":651.6,"location":2,"content":"on optimization, but then I sort of want"},{"from":651.6,"to":654.39,"location":2,"content":"to explain a bit more of the space of"},{"from":654.39,"to":658.14,"location":2,"content":"what people have done and can do with"},{"from":658.14,"to":661.05,"location":2,"content":"dense word representations. So I'm going"},{"from":661.05,"to":663.09,"location":2,"content":"to say something about count-based"},{"from":663.09,"to":666.06,"location":2,"content":"approaches to capturing meaning and how"},{"from":666.06,"to":668.22,"location":2,"content":"they work. I'm going to talk a bit"},{"from":668.22,"to":668.85,"location":2,"content":"about"},{"from":668.85,"to":670.68,"location":2,"content":"a different model of word vectors, which"},{"from":670.68,"to":675.27,"location":2,"content":"was the GloVe model that a postdoc of"},{"from":675.27,"to":679.11,"location":2,"content":"mine, Jeffrey Pennington, and I worked on"},{"from":679.11,"to":681.66,"location":2,"content":"a couple of years ago, talk some about"},{"from":681.66,"to":683.85,"location":2,"content":"evaluation, a really quite dominant theme"},{"from":683.85,"to":685.77,"location":2,"content":"in a lot of what we do in natural"},
natural"},{"from":685.77,"to":688.17,"location":2,"content":"language processing is how do we how do"},{"from":688.17,"to":690.21,"location":2,"content":"we evaluate things and how much do we"},{"from":690.21,"to":692.79,"location":2,"content":"trust our evaluations and then say a"},{"from":692.79,"to":695.46,"location":2,"content":"little bit about word sensors I have a"},{"from":695.46,"to":697.38,"location":2,"content":"sort of a goal here which is that by the"},{"from":697.38,"to":699.69,"location":2,"content":"end of the class you should actually"},{"from":699.69,"to":702.75,"location":2,"content":"sort of understand enough of the lay of"},{"from":702.75,"to":704.94,"location":2,"content":"the land that you could read papers"},{"from":704.94,"to":706.95,"location":2,"content":"about word vectors such as the ones that"},{"from":706.95,"to":708.66,"location":2,"content":"are in the syllabus and actually"},{"from":708.66,"to":710.31,"location":2,"content":"understand them and where they're coming"},{"from":710.31,"to":712.89,"location":2,"content":"from and roughly how they work and so"},{"from":712.89,"to":714.03,"location":2,"content":"you know if you really want to minimize"},{"from":714.03,"to":716.46,"location":2,"content":"work for York this class you could think"},{"from":716.46,"to":718.14,"location":2,"content":"I know everything I need to know after"},{"from":718.14,"to":719.91,"location":2,"content":"the first week and I'm going to do a"},{"from":719.91,"to":722.19,"location":2,"content":"final project on word vectors and I'll"},{"from":722.19,"to":724.83,"location":2,"content":"be ok and you know you could actually do"},{"from":724.83,"to":728.07,"location":2,"content":"that I'll mention during the class a"},{"from":728.07,"to":730.68,"location":2,"content":"couple of recent pieces of work on word"},{"from":730.68,"to":733.56,"location":2,"content":"vectors on the other hand doing things"},{"from":733.56,"to":735.78,"location":2,"content":"with word vectors is a fairly mined out"},{"from":735.78,"to":738.18,"location":2,"content":"area so you're probably better off I'm"},{"from":738.18,"to":739.53,"location":2,"content":"also listening to some of the later"},{"from":739.53,"to":743.16,"location":2,"content":"parts of the class ok so remember we had"},{"from":743.16,"to":745.38,"location":2,"content":"this idea of word to vac so it was an"},{"from":745.38,"to":748.74,"location":2,"content":"iterative updating algorithm that"},{"from":748.74,"to":751.89,"location":2,"content":"learned these vector representations of"},{"from":751.89,"to":753.84,"location":2,"content":"words that in some sense capture their"},{"from":753.84,"to":755.94,"location":2,"content":"meaning and the way it worked was we"},{"from":755.94,"to":757.65,"location":2,"content":"kind of moved position by position"},{"from":757.65,"to":759.24,"location":2,"content":"through a corpus"},{"from":759.24,"to":761.67,"location":2,"content":"each point in time we had a center word"},{"from":761.67,"to":765.18,"location":2,"content":"here into and it's trying to predict the"},{"from":765.18,"to":767.4,"location":2,"content":"words around that by having a"},{"from":767.4,"to":769.83,"location":2,"content":"probability distribution over words will"},{"from":769.83,"to":771.84,"location":2,"content":"occur around that and that probability"},{"from":771.84,"to":774.66,"location":2,"content":"distribution is defined simply in terms"},{"from":774.66,"to":777.93,"location":2,"content":"of the dot product of the word 
vectors"},{"from":777.93,"to":781.08,"location":2,"content":"via the softmax function and so what we"},{"from":781.08,"to":783.66,"location":2,"content":"want to do is change those vectors in a"},{"from":783.66,"to":785.85,"location":2,"content":"way that this gives good probability"},{"from":785.85,"to":788.76,"location":2,"content":"predictions it gives as high probability"},{"from":788.76,"to":790.98,"location":2,"content":"as possible to words that you tend to"},{"from":790.98,"to":793.8,"location":2,"content":"see in the context and so just to drill"},{"from":793.8,"to":795.96,"location":2,"content":"that in a little bit more you know what"},{"from":795.96,"to":800.13,"location":2,"content":"we actually have is we have two matrices"},{"from":800.13,"to":803.88,"location":2,"content":"right we have four Center words we have"},{"from":803.88,"to":805.86,"location":2,"content":"a matrix where for each word now"},{"from":805.86,"to":809.13,"location":2,"content":"vocabulary we have a vector and that"},{"from":809.13,"to":810.99,"location":2,"content":"this this is probably as good a point as"},{"from":810.99,"to":813.57,"location":2,"content":"any to say that it turns out that all"},{"from":813.57,"to":815.7,"location":2,"content":"the major deep learning packages"},{"from":815.7,"to":819.12,"location":2,"content":"tensorflow pi torch etc for their word"},{"from":819.12,"to":821.82,"location":2,"content":"vectors the word vectors are represented"},{"from":821.82,"to":824.16,"location":2,"content":"as rows if you've done a bunch of math"},{"from":824.16,"to":826.26,"location":2,"content":"classes that might not be what you would"},{"from":826.26,"to":827.97,"location":2,"content":"expect you might have expected the other"},{"from":827.97,"to":830.16,"location":2,"content":"way around but they all put them in rows"},{"from":830.16,"to":834.03,"location":2,"content":"so we can have rows for our so we have"},{"from":834.03,"to":836.25,"location":2,"content":"six words and a five dimensional vector"},{"from":836.25,"to":839.72,"location":2,"content":"each okay and then we have this outside"},{"from":839.72,"to":842.31,"location":2,"content":"matrix where we also have a second"},{"from":842.31,"to":844.89,"location":2,"content":"vector for each word which is this"},{"from":844.89,"to":849.3,"location":2,"content":"representation in context so when we"},{"from":849.3,"to":851.58,"location":2,"content":"have a particular Center word here word"},{"from":851.58,"to":853.62,"location":2,"content":"for you know when we're doing our"},{"from":853.62,"to":856.29,"location":2,"content":"computations we're taking a dot product"},{"from":856.29,"to":861.03,"location":2,"content":"between v4 and each row of U and that's"},{"from":861.03,"to":864.51,"location":2,"content":"then giving us a vector of dot products"},{"from":864.51,"to":867.45,"location":2,"content":"scores and so then after that we're"},{"from":867.45,"to":869.55,"location":2,"content":"running soft maxes on each of those"},{"from":869.55,"to":872.61,"location":2,"content":"numbers doing it element wise and that's"},{"from":872.61,"to":873.69,"location":2,"content":"then giving us a probability"},{"from":873.69,"to":876.78,"location":2,"content":"distribution over words in the context"},{"from":876.78,"to":879.84,"location":2,"content":"and there's sort of things to notice"},{"from":879.84,"to":882.96,"location":2,"content":"there which hopefully you noticed last"},{"from":882.96,"to":884.76,"location":2,"content":"time but to make sure you 
notice that"},{"from":884.76,"to":887.94,"location":2,"content":"you know we've just got one probability"},{"from":887.94,"to":890.25,"location":2,"content":"distribution right so in terms of what"},{"from":890.25,"to":891.95,"location":2,"content":"words we predict we're pretty"},{"from":891.95,"to":893.72,"location":2,"content":"acting exactly the same probability"},{"from":893.72,"to":896.3,"location":2,"content":"distribution every position we've sort"},{"from":896.3,"to":898.04,"location":2,"content":"of saying the most likely word one to"},{"from":898.04,"to":900.68,"location":2,"content":"the left is whatever it is house the"},{"from":900.68,"to":902.27,"location":2,"content":"most likely word two to the left is"},{"from":902.27,"to":904.67,"location":2,"content":"house three de left is house one of the"},{"from":904.67,"to":907.01,"location":2,"content":"right should be house too right so it's"},{"from":907.01,"to":908.81,"location":2,"content":"sort of know sort of fineness of"},{"from":908.81,"to":911.17,"location":2,"content":"prediction it's just an overall kind of"},{"from":911.17,"to":913.73,"location":2,"content":"probability distribution of words that"},{"from":913.73,"to":916.49,"location":2,"content":"are likely to occur in my context so all"},{"from":916.49,"to":919.46,"location":2,"content":"we're asking for is a model that gives"},{"from":919.46,"to":922.07,"location":2,"content":"reasonably high probability estimates to"},{"from":922.07,"to":925.64,"location":2,"content":"all words that occur in the context of"},{"from":925.64,"to":928.37,"location":2,"content":"this word relatively often there's"},{"from":928.37,"to":930.32,"location":2,"content":"nothing more to it than that and that's"},{"from":930.32,"to":932.06,"location":2,"content":"part of why it's sort of surprising when"},{"from":932.06,"to":934.4,"location":2,"content":"you've got such a simplistic thing that"},{"from":934.4,"to":936.35,"location":2,"content":"it seems like at the end of the day it"},{"from":936.35,"to":939.14,"location":2,"content":"can end up capturing so much about the"},{"from":939.14,"to":941.51,"location":2,"content":"meanings of words and aspects of the"},{"from":941.51,"to":943.94,"location":2,"content":"meanings of words like in the examples I"},{"from":943.94,"to":945.73,"location":2,"content":"was just showing you in the ipython"},{"from":945.73,"to":948.02,"location":2,"content":"notebook"},{"from":948.02,"to":951.98,"location":2,"content":"and there's one other thing that I was"},{"from":951.98,"to":954.14,"location":2,"content":"gonna say oh yeah one other thing I was"},{"from":954.14,"to":956.24,"location":2,"content":"going to say was the other thing that"},{"from":956.24,"to":959.96,"location":2,"content":"might occur to you from this is well"},{"from":959.96,"to":962.36,"location":2,"content":"wait a minute there was like that and"},{"from":962.36,"to":966.8,"location":2,"content":"and and of that occur all the time so"},{"from":966.8,"to":971.42,"location":2,"content":"that means every word must have a high"},{"from":971.42,"to":975.11,"location":2,"content":"dot product with words like that an oven"},{"from":975.11,"to":978.53,"location":2,"content":"and to get their probabilities right and"},{"from":978.53,"to":982.22,"location":2,"content":"the first answer to that is yup that's"},{"from":982.22,"to":984.59,"location":2,"content":"true and it turns out that all word"},{"from":984.59,"to":988.4,"location":2,"content":"vectors have a very strong 
word"},{"from":988.4,"to":990.59,"location":2,"content":"probability component that reflects that"},{"from":990.59,"to":993.8,"location":2,"content":"and I mean one of the things that some"},{"from":993.8,"to":996.83,"location":2,"content":"workers discussed so on the readings"},{"from":996.83,"to":998.72,"location":2,"content":"there are two papers from Sanjeev"},{"from":998.72,"to":1001.09,"location":2,"content":"Aurora's group in Princeton and one of"},{"from":1001.09,"to":1004.39,"location":2,"content":"those papers sort of discusses this"},{"from":1004.39,"to":1007.15,"location":2,"content":"probability high frequency effect and"},{"from":1007.15,"to":1010,"location":2,"content":"you know a crude way of actually fixing"},{"from":1010,"to":1012.19,"location":2,"content":"this high frequency effect is that"},{"from":1012.19,"to":1017.14,"location":2,"content":"normally the first the first biggest"},{"from":1017.14,"to":1019.39,"location":2,"content":"component in your word vectors is"},{"from":1019.39,"to":1021.46,"location":2,"content":"actually a frequency effect and if you"},{"from":1021.46,"to":1022.93,"location":2,"content":"just lop it off you can make your"},{"from":1022.93,"to":1025.76,"location":2,"content":"semantic similarities better"},{"from":1025.76,"to":1027.83,"location":2,"content":"but there are other things that we do to"},{"from":1027.83,"to":1030.41,"location":2,"content":"sort of deal with high frequencies okay"},{"from":1030.41,"to":1032.84,"location":2,"content":"so we get these lovely spacers that I've"},{"from":1032.84,"to":1035.54,"location":2,"content":"shown some of but I'll make one more"},{"from":1035.54,"to":1040.76,"location":2,"content":"remark yeah so did I say this last time"},{"from":1040.76,"to":1046.52,"location":2,"content":"oh my remark anyway is that we show all"},{"from":1046.52,"to":1048.8,"location":2,"content":"these two-dimensional pictures they're"},{"from":1048.8,"to":1051.56,"location":2,"content":"exceedingly exceedingly misleading"},{"from":1051.56,"to":1054.62,"location":2,"content":"because in these pick two-dimensional"},{"from":1054.62,"to":1057.71,"location":2,"content":"pictures you know you have these effects"},{"from":1057.71,"to":1061.25,"location":2,"content":"that if you know Samsung is close to"},{"from":1061.25,"to":1064.19,"location":2,"content":"Nokia it has to be over here and then it"},{"from":1064.19,"to":1066.44,"location":2,"content":"has to be far away from words that are"},{"from":1066.44,"to":1069.5,"location":2,"content":"over here whereas you might sort of also"},{"from":1069.5,"to":1071.48,"location":2,"content":"want to have the effect that Nokia is"},{"from":1071.48,"to":1073.85,"location":2,"content":"close to Finland for a different reason"},{"from":1073.85,"to":1076.81,"location":2,"content":"and you can't do that in two dimensional"},{"from":1076.81,"to":1080.17,"location":2,"content":"vector spaces but you know one of the"},{"from":1080.17,"to":1082.4,"location":2,"content":"most of the properties of high"},{"from":1082.4,"to":1084.08,"location":2,"content":"dimensional vector spaces are very"},{"from":1084.08,"to":1086.48,"location":2,"content":"unintuitive and one of the ways that"},{"from":1086.48,"to":1088.22,"location":2,"content":"they're unintuitive is in a high"},{"from":1088.22,"to":1090.53,"location":2,"content":"dimensional vector space a word can be"},{"from":1090.53,"to":1093.41,"location":2,"content":"close to lots of other words 
in"},{"from":1093.41,"to":1099.14,"location":2,"content":"different directions okay so we sort of"},{"from":1099.14,"to":1102.34,"location":2,"content":"started to talk about how we went about"},{"from":1102.34,"to":1105.56,"location":2,"content":"learning these word vectors I'm sort of"},{"from":1105.56,"to":1109.52,"location":2,"content":"going to take about a five minute detour"},{"from":1109.52,"to":1112.4,"location":2,"content":"into optimization now this isn't really"},{"from":1112.4,"to":1114.62,"location":2,"content":"an optimization class if you want to"},{"from":1114.62,"to":1116.54,"location":2,"content":"learn a lot about optimization well you"},{"from":1116.54,"to":1118.67,"location":2,"content":"can learn more about optimization if you"},{"from":1118.67,"to":1120.74,"location":2,"content":"do 229 and if you do something like"},{"from":1120.74,"to":1123.05,"location":2,"content":"Stephen Boyd's optimization class you"},{"from":1123.05,"to":1125.63,"location":2,"content":"can learn a lot of optimization but this"},{"from":1125.63,"to":1127.58,"location":2,"content":"is so a really baby optimization but"},{"from":1127.58,"to":1129.02,"location":2,"content":"just to make sure it runs on the same"},{"from":1129.02,"to":1132.8,"location":2,"content":"page here are three slides right so what"},{"from":1132.8,"to":1135.17,"location":2,"content":"we did at the end what we did over there"},{"from":1135.17,"to":1137.6,"location":2,"content":"where I apologize that my writing was"},{"from":1137.6,"to":1140.6,"location":2,"content":"too small but that will give you the"},{"from":1140.6,"to":1142.91,"location":2,"content":"chance to when doing homework two and"},{"from":1142.91,"to":1144.92,"location":2,"content":"you have to write that out to work it"},{"from":1144.92,"to":1147.35,"location":2,"content":"out for yourself and learn more in the"},{"from":1147.35,"to":1150.68,"location":2,"content":"process right so what we had was a cost"},{"from":1150.68,"to":1152.75,"location":2,"content":"function when we wanted to minimize and"},{"from":1152.75,"to":1155.33,"location":2,"content":"so what we did was we did a bit of"},{"from":1155.33,"to":1158.06,"location":2,"content":"calculus to count calculate the gradient"},{"from":1158.06,"to":1159.35,"location":2,"content":"of the cost function"},{"from":1159.35,"to":1162.47,"location":2,"content":"with respect to our word vectors which"},{"from":1162.47,"to":1165.17,"location":2,"content":"were our variables theta and then what"},{"from":1165.17,"to":1168.11,"location":2,"content":"we want to do is say well if we take a"},{"from":1168.11,"to":1171.53,"location":2,"content":"small step in the direction of the"},{"from":1171.53,"to":1173.06,"location":2,"content":"negative of the gradient"},{"from":1173.06,"to":1176.36,"location":2,"content":"that'll be taking us down down hill in"},{"from":1176.36,"to":1178.58,"location":2,"content":"this space and we want to keep on doing"},{"from":1178.58,"to":1181.43,"location":2,"content":"that and sort of head to the minimum of"},{"from":1181.43,"to":1183.74,"location":2,"content":"our space I mean of course in our high"},{"from":1183.74,"to":1185.93,"location":2,"content":"multi-dimensional space you know it"},{"from":1185.93,"to":1187.82,"location":2,"content":"might not be a nice smooth curve like"},{"from":1187.82,"to":1189.77,"location":2,"content":"this it might be a horrible and non"},{"from":1189.77,"to":1193.19,"location":2,"content":"convex curve but that's just the idea 
so"},{"from":1193.19,"to":1194.75,"location":2,"content":"essentially we're saying we've got the"},{"from":1194.75,"to":1198.47,"location":2,"content":"old parameters we work out the gradient"},{"from":1198.47,"to":1200.12,"location":2,"content":"of the objective function using those"},{"from":1200.12,"to":1202.85,"location":2,"content":"old parameters we multiply that by a"},{"from":1202.85,"to":1206.9,"location":2,"content":"small alpha which is our step size or"},{"from":1206.9,"to":1208.34,"location":2,"content":"learning rate because we only want to"},{"from":1208.34,"to":1210.89,"location":2,"content":"move a little bit each time because if"},{"from":1210.89,"to":1213.86,"location":2,"content":"back here if we sort of said downhill is"},{"from":1213.86,"to":1216.14,"location":2,"content":"this way and said great let's go a long"},{"from":1216.14,"to":1218.03,"location":2,"content":"way that way you could kind of complete"},{"from":1218.03,"to":1219.62,"location":2,"content":"the overshoot so we only want to go a"},{"from":1219.62,"to":1222.44,"location":2,"content":"little bit each time so we normally have"},{"from":1222.44,"to":1224.87,"location":2,"content":"a small learning rate alpha and so we"},{"from":1224.87,"to":1227.21,"location":2,"content":"subtract a small multiple of the"},{"from":1227.21,"to":1230,"location":2,"content":"gradient and we from the old parameters"},{"from":1230,"to":1232.91,"location":2,"content":"and we get our new parameters and that's"},{"from":1232.91,"to":1234.44,"location":2,"content":"sort of effectively being worked out"},{"from":1234.44,"to":1237.32,"location":2,"content":"component wise as is shown below that"},{"from":1237.32,"to":1238.7,"location":2,"content":"we're just doing that for each of the"},{"from":1238.7,"to":1241.4,"location":2,"content":"partial derivatives and then that our"},{"from":1241.4,"to":1243.35,"location":2,"content":"hope is that that will let us gradually"},{"from":1243.35,"to":1246.44,"location":2,"content":"walk down this surface now if you"},{"from":1246.44,"to":1248.72,"location":2,"content":"actually did this it would be"},{"from":1248.72,"to":1251.27,"location":2,"content":"unbelievably bad for the kind of systems"},{"from":1251.27,"to":1253.79,"location":2,"content":"that we built and there's a lot of work"},{"from":1253.79,"to":1256.28,"location":2,"content":"on clever optimization but the most"},{"from":1256.28,"to":1259.22,"location":2,"content":"basic thing which you definitely need to"},{"from":1259.22,"to":1263.93,"location":2,"content":"know is that well our objective function"},{"from":1263.93,"to":1267.62,"location":2,"content":"here J of theta was a function of our"},{"from":1267.62,"to":1270.26,"location":2,"content":"entire corpus right and to get this to"},{"from":1270.26,"to":1272.24,"location":2,"content":"work well the first thing you want to do"},{"from":1272.24,"to":1274.79,"location":2,"content":"is you know collect a few billion words"},{"from":1274.79,"to":1277.43,"location":2,"content":"of your favorite language and then say"},{"from":1277.43,"to":1279.65,"location":2,"content":"go and build a word to back model for me"},{"from":1279.65,"to":1284.9,"location":2,"content":"and so if you have to evaluate a billion"},{"from":1284.9,"to":1288.29,"location":2,"content":"Center words and maybe then for each of"},{"from":1288.29,"to":1290.42,"location":2,"content":"ten billion context words if you have"},{"from":1290.42,"to":1292.7,"location":2,"content":"the window size of five 
and"},{"from":1292.7,"to":1294.68,"location":2,"content":"so you have to do these sort of 10"},{"from":1294.68,"to":1298.61,"location":2,"content":"billion softmax calculations before you"},{"from":1298.61,"to":1300.98,"location":2,"content":"work out what your gradient is that"},{"from":1300.98,"to":1302.18,"location":2,"content":"you're going to be having your computer"},{"from":1302.18,"to":1305.15,"location":2,"content":"computer for quite a long time before"},{"from":1305.15,"to":1307.22,"location":2,"content":"you make one little step in the gradient"},{"from":1307.22,"to":1309.14,"location":2,"content":"and so things are going to go so so"},{"from":1309.14,"to":1312.2,"location":2,"content":"slowly so no one does that in deep"},{"from":1312.2,"to":1315.17,"location":2,"content":"learning systems so what people everyone"},{"from":1315.17,"to":1317.78,"location":2,"content":"does is use the Casta gradient descent"},{"from":1317.78,"to":1320.9,"location":2,"content":"and in stochastic gradient descent we"},{"from":1320.9,"to":1324.73,"location":2,"content":"sample our window in the simplest case"},{"from":1324.73,"to":1329.75,"location":2,"content":"we just for this one window work out an"},{"from":1329.75,"to":1332.24,"location":2,"content":"estimate of the gradient and we use it"},{"from":1332.24,"to":1334.85,"location":2,"content":"as a parameter update so this is sort of"},{"from":1334.85,"to":1338.96,"location":2,"content":"an amazingly amazingly noisy estimate of"},{"from":1338.96,"to":1341.45,"location":2,"content":"the gradient but it sort of doesn't"},{"from":1341.45,"to":1343.19,"location":2,"content":"matter too much because as soon as we've"},{"from":1343.19,"to":1344.57,"location":2,"content":"done it we're gonna choose a different"},{"from":1344.57,"to":1346.97,"location":2,"content":"Center word and do it again and again so"},{"from":1346.97,"to":1348.98,"location":2,"content":"that gradually we sort of approach what"},{"from":1348.98,"to":1350.9,"location":2,"content":"we would have gotten if we'd sort of"},{"from":1350.9,"to":1353.42,"location":2,"content":"looked at all of the Center words before"},{"from":1353.42,"to":1355.91,"location":2,"content":"we took any steps but because we take"},{"from":1355.91,"to":1359.09,"location":2,"content":"steps as we go we get to the minimum of"},{"from":1359.09,"to":1361.85,"location":2,"content":"the function orders of magnitude more"},{"from":1361.85,"to":1366.8,"location":2,"content":"quickly so this shows the simplest case"},{"from":1366.8,"to":1368.99,"location":2,"content":"where we just sampling one window in"},{"from":1368.99,"to":1371.57,"location":2,"content":"practice that's not what we normally do"},{"from":1371.57,"to":1375.98,"location":2,"content":"we normally sample us a small bunch you"},{"from":1375.98,"to":1380.3,"location":2,"content":"know order approximately 32 or 64 so if"},{"from":1380.3,"to":1383.03,"location":2,"content":"we have a sample that's bigger"},{"from":1383.03,"to":1384.59,"location":2,"content":"that's generally referred to as a mini"},{"from":1384.59,"to":1387.2,"location":2,"content":"batch and we calculate a gradient"},{"from":1387.2,"to":1390.56,"location":2,"content":"estimate from the mini batch so that has"},{"from":1390.56,"to":1394.16,"location":2,"content":"two advantages one advantage is that you"},{"from":1394.16,"to":1396.62,"location":2,"content":"kind of get less noisy estimates of the"},{"from":1396.62,"to":1398.45,"location":2,"content":"gradient because you've kind of 
averaged"},{"from":1398.45,"to":1400.64,"location":2,"content":"over a bunch of examples rather than"},{"from":1400.64,"to":1403.46,"location":2,"content":"just using one but the second advantage"},{"from":1403.46,"to":1405.95,"location":2,"content":"which is the one way we really care is"},{"from":1405.95,"to":1408.35,"location":2,"content":"if we want our computations to go fast"},{"from":1408.35,"to":1412.64,"location":2,"content":"when we're using a GPU that you need to"},{"from":1412.64,"to":1414.74,"location":2,"content":"get parallelization of doing the same"},{"from":1414.74,"to":1416.84,"location":2,"content":"operation a whole bunch of times and"},{"from":1416.84,"to":1419.21,"location":2,"content":"then you gain a lot by using a mini"},{"from":1419.21,"to":1421.55,"location":2,"content":"batch of 64 examples or something like"},{"from":1421.55,"to":1424.4,"location":2,"content":"that and you don't have to but you know"},{"from":1424.4,"to":1426.29,"location":2,"content":"it turns out the details of the guts of"},{"from":1426.29,"to":1426.63,"location":2,"content":"the"},{"from":1426.63,"to":1428.37,"location":2,"content":"I'd wear that you know there's nvidia"},{"from":1428.37,"to":1431.07,"location":2,"content":"gpus you know have these whatever they"},{"from":1431.07,"to":1432.84,"location":2,"content":"have is inside them their own powers of"},{"from":1432.84,"to":1435.24,"location":2,"content":"two so you get better speed ups if you"},{"from":1435.24,"to":1438.39,"location":2,"content":"use batches like 32 or 64 rather than"},{"from":1438.39,"to":1440.25,"location":2,"content":"just deciding that 42 is still your"},{"from":1440.25,"to":1441.84,"location":2,"content":"favorite number from high school and"},{"from":1441.84,"to":1444.18,"location":2,"content":"you're gonna use that as the size of"},{"from":1444.18,"to":1450.87,"location":2,"content":"your mini batch okay yeah here's one"},{"from":1450.87,"to":1453.87,"location":2,"content":"other interesting thing which actually"},{"from":1453.87,"to":1456.21,"location":2,"content":"has some optimization details in it it"},{"from":1456.21,"to":1459.93,"location":2,"content":"turns out if you think of these doing"},{"from":1459.93,"to":1462.45,"location":2,"content":"stochastic gradients with word vectors"},{"from":1462.45,"to":1464.7,"location":2,"content":"it's actually very different to some"},{"from":1464.7,"to":1466.59,"location":2,"content":"other deep learning problems like vision"},{"from":1466.59,"to":1469.35,"location":2,"content":"deep learning problems because for"},{"from":1469.35,"to":1471.66,"location":2,"content":"either a single window or even a sort of"},{"from":1471.66,"to":1474.27,"location":2,"content":"a reasonably sized mini batch it'll turn"},{"from":1474.27,"to":1477.21,"location":2,"content":"out that those mini batches the mini"},{"from":1477.21,"to":1479.88,"location":2,"content":"batch only has you know relatively"},{"from":1479.88,"to":1481.89,"location":2,"content":"speaking a handful of words in it right"},{"from":1481.89,"to":1483.66,"location":2,"content":"so if you have a mini batch of size 32"},{"from":1483.66,"to":1486.48,"location":2,"content":"in a window size of 10 you know probably"},{"from":1486.48,"to":1488.97,"location":2,"content":"there are only about 100 hundred 50"},{"from":1488.97,"to":1491.31,"location":2,"content":"different words in it but yet we're"},{"from":1491.31,"to":1493.8,"location":2,"content":"building this model over a vocabulary 
of"},{"from":1493.8,"to":1495.51,"location":2,"content":"quarter of a million words or something"},{"from":1495.51,"to":1497.64,"location":2,"content":"like that so just about all of the"},{"from":1497.64,"to":1502.91,"location":2,"content":"elements in this vector are zero and so"},{"from":1502.91,"to":1506.84,"location":2,"content":"we sort of really have this very sparse"},{"from":1506.84,"to":1511.44,"location":2,"content":"parameter update and so that sort of"},{"from":1511.44,"to":1514.65,"location":2,"content":"suggests that we actually probably want"},{"from":1514.65,"to":1517.17,"location":2,"content":"to sort of only update the word vectors"},{"from":1517.17,"to":1519.84,"location":2,"content":"that appear and then the question is"},{"from":1519.84,"to":1521.46,"location":2,"content":"whether you can achieve that right the"},{"from":1521.46,"to":1522.99,"location":2,"content":"dumb way to do it is you just have this"},{"from":1522.99,"to":1526.11,"location":2,"content":"matrix that's normally nearly all zeros"},{"from":1526.11,"to":1529.08,"location":2,"content":"and you say add those two matrices"},{"from":1529.08,"to":1532.23,"location":2,"content":"together and there you go and then the"},{"from":1532.23,"to":1534.66,"location":2,"content":"question is can you actually have a"},{"from":1534.66,"to":1537.66,"location":2,"content":"sparse matrix update which only updates"},{"from":1537.66,"to":1540.57,"location":2,"content":"the certain rows of the matrix that"},{"from":1540.57,"to":1542.25,"location":2,"content":"contain the words that you've entered"},{"from":1542.25,"to":1545.37,"location":2,"content":"and do things much faster and if you're"},{"from":1545.37,"to":1547.23,"location":2,"content":"doing something even cleverer like doing"},{"from":1547.23,"to":1549.74,"location":2,"content":"distributed computation over multiple"},{"from":1549.74,"to":1552.09,"location":2,"content":"computers and sharing your parameters"},{"from":1552.09,"to":1553.38,"location":2,"content":"well then definitely you just sort of"},{"from":1553.38,"to":1555.45,"location":2,"content":"only want to update the word vectors"},{"from":1555.45,"to":1557.1,"location":2,"content":"that you've actually been getting a"},{"from":1557.1,"to":1559.23,"location":2,"content":"parameter estimate for so there's sort"},{"from":1559.23,"to":1560.61,"location":2,"content":"of some details there"},{"from":1560.61,"to":1562.26,"location":2,"content":"but I'm gonna skip past them more"},{"from":1562.26,"to":1566.4,"location":2,"content":"details right so a couple of people"},{"from":1566.4,"to":1569.25,"location":2,"content":"asked afterwards yeah why are there"},{"from":1569.25,"to":1571.8,"location":2,"content":"these two word vectors they're sort of"},{"from":1571.8,"to":1575.16,"location":2,"content":"Center and the outside one and I mean"},{"from":1575.16,"to":1577.68,"location":2,"content":"the answer to that is it makes that mat"},{"from":1577.68,"to":1581.91,"location":2,"content":"I showed you easy right so that if if"},{"from":1581.91,"to":1585.75,"location":2,"content":"you do it as I showed you well you know"},{"from":1585.75,"to":1589.74,"location":2,"content":"for working out the partial derivatives"},{"from":1589.74,"to":1592.98,"location":2,"content":"for the center word it's just as I"},{"from":1592.98,"to":1597.15,"location":2,"content":"showed you it's easy but if you use only"},{"from":1597.15,"to":1601.05,"location":2,"content":"one set of word vectors well then 
the"},{"from":1601.05,"to":1603.42,"location":2,"content":"same word that's the center of word will"},{"from":1603.42,"to":1606.18,"location":2,"content":"be one of the choices for the context"},{"from":1606.18,"to":1607.65,"location":2,"content":"word when you're working out that"},{"from":1607.65,"to":1610.26,"location":2,"content":"softmax for the context word and then"},{"from":1610.26,"to":1612.48,"location":2,"content":"you'll get these terms that are then"},{"from":1612.48,"to":1615.12,"location":2,"content":"squared terms in terms of the two"},{"from":1615.12,"to":1618.15,"location":2,"content":"references of that same word and that"},{"from":1618.15,"to":1621.63,"location":2,"content":"makes your math more difficult so it's"},{"from":1621.63,"to":1624.6,"location":2,"content":"sort of just a practical thing in the"},{"from":1624.6,"to":1627.12,"location":2,"content":"end I mean it sort of doesn't make very"},{"from":1627.12,"to":1629.46,"location":2,"content":"much difference because if you sort of"},{"from":1629.46,"to":1630.96,"location":2,"content":"think about it since you're going along"},{"from":1630.96,"to":1633.81,"location":2,"content":"through all the positions you know what"},{"from":1633.81,"to":1635.82,"location":2,"content":"was a Center word at one point is"},{"from":1635.82,"to":1637.89,"location":2,"content":"immediately afterwards the context word"},{"from":1637.89,"to":1640.71,"location":2,"content":"of what used to be a context word which"},{"from":1640.71,"to":1642.42,"location":2,"content":"is now the center words they sort of"},{"from":1642.42,"to":1646.02,"location":2,"content":"doing the same computations because you"},{"from":1646.02,"to":1647.58,"location":2,"content":"know the dot product is symmetric"},{"from":1647.58,"to":1650.64,"location":2,"content":"actually all over again"},{"from":1650.64,"to":1652.98,"location":2,"content":"so you've they get pretty similar vector"},{"from":1652.98,"to":1655.2,"location":2,"content":"representations so it seems like in"},{"from":1655.2,"to":1656.61,"location":2,"content":"general you can get the best results by"},{"from":1656.61,"to":1658.56,"location":2,"content":"averaging what comes out for your two"},{"from":1658.56,"to":1660.36,"location":2,"content":"vectors and you end up with just one"},{"from":1660.36,"to":1664.73,"location":2,"content":"vector per word okay more substantively"},{"from":1664.73,"to":1668.79,"location":2,"content":"if you go to the word two vector you"},{"from":1668.79,"to":1670.26,"location":2,"content":"will discover that they're sort of more"},{"from":1670.26,"to":1672.66,"location":2,"content":"two-word to Veck that they define the"},{"from":1672.66,"to":1675.09,"location":2,"content":"sort of a family of word to Veck muddles"},{"from":1675.09,"to":1677.67,"location":2,"content":"and there are so two main parts of that"},{"from":1677.67,"to":1680.82,"location":2,"content":"family firstly there's a choice between"},{"from":1680.82,"to":1683.43,"location":2,"content":"the continuous bag of words model and"},{"from":1683.43,"to":1685.8,"location":2,"content":"the skip grams model and what I"},{"from":1685.8,"to":1688.08,"location":2,"content":"presented was the Skip Graham's model so"},{"from":1688.08,"to":1689.94,"location":2,"content":"in the skip Graham's model you've got"},{"from":1689.94,"to":1692.13,"location":2,"content":"one Center word and you're trying to"},{"from":1692.13,"to":1693.78,"location":2,"content":"predict all the words 
in"},{"from":1693.78,"to":1696.9,"location":2,"content":"takes one at a time for the continuous"},{"from":1696.9,"to":1698.94,"location":2,"content":"bag of words model it's the opposite"},{"from":1698.94,"to":1701.67,"location":2,"content":"you've got all of the outside words and"},{"from":1701.67,"to":1704.25,"location":2,"content":"you're trying to use all of them though"},{"from":1704.25,"to":1706.29,"location":2,"content":"considered independently like a naive"},{"from":1706.29,"to":1709.97,"location":2,"content":"Bayes model to predict the center word"},{"from":1709.97,"to":1715.02,"location":2,"content":"and then the second one is the way I"},{"from":1715.02,"to":1718.29,"location":2,"content":"presented learning this was the method"},{"from":1718.29,"to":1719.64,"location":2,"content":"that's using the so called"},{"from":1719.64,"to":1722.52,"location":2,"content":"naive softmax so therefore when we were"},{"from":1722.52,"to":1725.04,"location":2,"content":"wanting to work things out we were sort"},{"from":1725.04,"to":1727.17,"location":2,"content":"of saying okay we want probability"},{"from":1727.17,"to":1729.78,"location":2,"content":"estimates for the context words and so"},{"from":1729.78,"to":1731.25,"location":2,"content":"we're just going to sum over the whole"},{"from":1731.25,"to":1733.89,"location":2,"content":"vocabulary and we'll come up with these"},{"from":1733.89,"to":1737.7,"location":2,"content":"probability estimates in practice that"},{"from":1737.7,"to":1740.25,"location":2,"content":"turns out to be a sort of a bad idea"},{"from":1740.25,"to":1742.98,"location":2,"content":"because that would also make things mega"},{"from":1742.98,"to":1746.79,"location":2,"content":"slow so in homework 2 coming up next"},{"from":1746.79,"to":1749.85,"location":2,"content":"week you will get to implement a much"},{"from":1749.85,"to":1752.85,"location":2,"content":"more practical way of doing this which"},{"from":1752.85,"to":1755.07,"location":2,"content":"they present in the word to vectors"},{"from":1755.07,"to":1757.17,"location":2,"content":"right so the problem is if we're using"},{"from":1757.17,"to":1759.96,"location":2,"content":"this equation that we used to do the"},{"from":1759.96,"to":1762.45,"location":2,"content":"calculus that down in this denominator"},{"from":1762.45,"to":1765.51,"location":2,"content":"here we're doing the sum over the entire"},{"from":1765.51,"to":1767.43,"location":2,"content":"vocabulary so if you have a vocabulary"},{"from":1767.43,"to":1769.62,"location":2,"content":"quarter million words we're sort of"},{"from":1769.62,"to":1771,"location":2,"content":"doing a quarter of a million dot"},{"from":1771,"to":1773.34,"location":2,"content":"products and Exponential's and adding"},{"from":1773.34,"to":1774.93,"location":2,"content":"them all to and work out that"},{"from":1774.93,"to":1777.93,"location":2,"content":"denominator and that sort of seems a"},{"from":1777.93,"to":1779.97,"location":2,"content":"sort of a really bad idea if you want"},{"from":1779.97,"to":1781.14,"location":2,"content":"things to be fast"},{"from":1781.14,"to":1785.82,"location":2,"content":"so Thomas Miko often colleagues came up"},{"from":1785.82,"to":1787.74,"location":2,"content":"with this idea of negative sampling"},{"from":1787.74,"to":1789.72,"location":2,"content":"would be near enough and so the idea of"},{"from":1789.72,"to":1791.82,"location":2,"content":"negative sampling is we're going 
to"},{"from":1791.82,"to":1794.76,"location":2,"content":"Train binary logistic regressions"},{"from":1794.76,"to":1797.49,"location":2,"content":"instead and so we're going to train one"},{"from":1797.49,"to":1800.34,"location":2,"content":"binary logistic regression for the"},{"from":1800.34,"to":1802.71,"location":2,"content":"actual word observed what's in the"},{"from":1802.71,"to":1805.26,"location":2,"content":"numerator and you want to give high"},{"from":1805.26,"to":1807.45,"location":2,"content":"probability to the word that was"},{"from":1807.45,"to":1810.93,"location":2,"content":"actually observed and then what we're"},{"from":1810.93,"to":1812.79,"location":2,"content":"going to do is we're going to sort of"},{"from":1812.79,"to":1815.37,"location":2,"content":"randomly sample a bunch of other words"},{"from":1815.37,"to":1818.1,"location":2,"content":"they're the negative samples and say"},{"from":1818.1,"to":1820.71,"location":2,"content":"they weren't the ones that were actually"},{"from":1820.71,"to":1823.29,"location":2,"content":"seen so you should be trying to give"},{"from":1823.29,"to":1826.97,"location":2,"content":"them as lower probability as possible"},{"from":1826.97,"to":1831.05,"location":2,"content":"okay so the sort of notation that they"},{"from":1831.05,"to":1833.3,"location":2,"content":"use in the paper is so slightly"},{"from":1833.3,"to":1835.52,"location":2,"content":"different to the one I've used and they"},{"from":1835.52,"to":1837.08,"location":2,"content":"actually do maximization not"},{"from":1837.08,"to":1840.23,"location":2,"content":"minimization and that's their equation"},{"from":1840.23,"to":1844.64,"location":2,"content":"which I'll come back to there before we"},{"from":1844.64,"to":1846.86,"location":2,"content":"do that here's the sigmoid function so"},{"from":1846.86,"to":1848.78,"location":2,"content":"the sigmoid functions normally written"},{"from":1848.78,"to":1851.51,"location":2,"content":"like this 1 over 1 plus e to the minus X"},{"from":1851.51,"to":1855.74,"location":2,"content":"but essentially the sigmoid function is"},{"from":1855.74,"to":1858.38,"location":2,"content":"like a binary case of the softmax"},{"from":1858.38,"to":1860.87,"location":2,"content":"function right that we have two possible"},{"from":1860.87,"to":1863.93,"location":2,"content":"outcomes yes and no and that you're sort"},{"from":1863.93,"to":1866.84,"location":2,"content":"of again got an import that is any real"},{"from":1866.84,"to":1869.3,"location":2,"content":"number and it's mapping it onto a"},{"from":1869.3,"to":1872.03,"location":2,"content":"probability distribution between 0 and 1"},{"from":1872.03,"to":1873.98,"location":2,"content":"which represents these two binary"},{"from":1873.98,"to":1876.05,"location":2,"content":"outcomes and the extent that the numbers"},{"from":1876.05,"to":1878.63,"location":2,"content":"positive it kind of ceilings 2 1 and"},{"from":1878.63,"to":1882.38,"location":2,"content":"negative goes down to 0 ok so with this"},{"from":1882.38,"to":1885.02,"location":2,"content":"time we're going to take the dot prefer"},{"from":1885.02,"to":1887.09,"location":2,"content":"the good word we're going to take the"},{"from":1887.09,"to":1889.88,"location":2,"content":"dot product of the two vectors shove it"},{"from":1889.88,"to":1892.19,"location":2,"content":"through a sigmoid function and then"},{"from":1892.19,"to":1893.42,"location":2,"content":"we're going to want that 
probability"},{"from":1893.42,"to":1897.5,"location":2,"content":"estimate to be as high as possible so if"},{"from":1897.5,"to":1899.33,"location":2,"content":"I show you this version which is just"},{"from":1899.33,"to":1902.3,"location":2,"content":"written slightly differently to look as"},{"from":1902.3,"to":1904.91,"location":2,"content":"much as possible like the notation that"},{"from":1904.91,"to":1906.65,"location":2,"content":"we use last time"},{"from":1906.65,"to":1908.99,"location":2,"content":"here is our new objective function for"},{"from":1908.99,"to":1910.97,"location":2,"content":"using negative sound playing and we've"},{"from":1910.97,"to":1915.17,"location":2,"content":"got two terms the first one is the log"},{"from":1915.17,"to":1918.68,"location":2,"content":"of the sigmoid of the observed context"},{"from":1918.68,"to":1921.08,"location":2,"content":"word the outside words dot producted"},{"from":1921.08,"to":1923.33,"location":2,"content":"with the center word and we're going to"},{"from":1923.33,"to":1927.02,"location":2,"content":"want that to be big and then on the"},{"from":1927.02,"to":1934.51,"location":2,"content":"other hand we've got the randomly chosen"},{"from":1934.51,"to":1938.6,"location":2,"content":"K words which are just other words and"},{"from":1938.6,"to":1939.98,"location":2,"content":"we're going to work out dot products"},{"from":1939.98,"to":1941.87,"location":2,"content":"between them and the Center word and"},{"from":1941.87,"to":1943.91,"location":2,"content":"we're going to want those to be as small"},{"from":1943.91,"to":1946.58,"location":2,"content":"as possible like that extra minus sign"},{"from":1946.58,"to":1948.98,"location":2,"content":"in there which is causing the sign of"},{"from":1948.98,"to":1951.44,"location":2,"content":"the two things to be different right"},{"from":1951.44,"to":1954.95,"location":2,"content":"today's our negative samples and for big"},{"from":1954.95,"to":1957.29,"location":2,"content":"K it can be reasonably modest number you"},{"from":1957.29,"to":1959.87,"location":2,"content":"can just take kind of 1015 negative"},{"from":1959.87,"to":1960.8,"location":2,"content":"samples"},{"from":1960.8,"to":1964.28,"location":2,"content":"and that works pretty fine I said we"},{"from":1964.28,"to":1967.34,"location":2,"content":"sort of sampled some words to be the"},{"from":1967.34,"to":1969.73,"location":2,"content":"negative samples they in particular"},{"from":1969.73,"to":1973.34,"location":2,"content":"proposed a sampling distribution that"},{"from":1973.34,"to":1976.25,"location":2,"content":"helps them along a little in partly"},{"from":1976.25,"to":1978.47,"location":2,"content":"dealing with this problem of very"},{"from":1978.47,"to":1982.55,"location":2,"content":"frequent words so the starting point of"},{"from":1982.55,"to":1984.77,"location":2,"content":"how you sample words is you use what we"},{"from":1984.77,"to":1987.74,"location":2,"content":"call the a unigram distribution so that"},{"from":1987.74,"to":1989.63,"location":2,"content":"just means you take words in a large"},{"from":1989.63,"to":1992.69,"location":2,"content":"corpus and count up how often each one"},{"from":1992.69,"to":1995.42,"location":2,"content":"occurs just as a count of independent"},{"from":1995.42,"to":1997.25,"location":2,"content":"word so there's the current unigram"},{"from":1997.25,"to":1999.65,"location":2,"content":"counts and so you start off with 
unigram"},{"from":1999.65,"to":2001.9,"location":2,"content":"counts but then you raise them to the"},{"from":2001.9,"to":2005.08,"location":2,"content":"3/4 power and raising to the 3/4 power"},{"from":2005.08,"to":2008.74,"location":2,"content":"has the effect of decreasing how often"},{"from":2008.74,"to":2010.74,"location":2,"content":"you sample very common words and"},{"from":2010.74,"to":2013.39,"location":2,"content":"increasing how often you sample rarer"},{"from":2013.39,"to":2020.08,"location":2,"content":"words ok and that's that okay so that's"},{"from":2020.08,"to":2023.02,"location":2,"content":"everything about word to Vic I'm going"},{"from":2023.02,"to":2034.57,"location":2,"content":"to say anyone have any lasting yes sorry"},{"from":2034.57,"to":2037.03,"location":2,"content":"see that capital Z is often used as a"},{"from":2037.03,"to":2040.51,"location":2,"content":"normalization term and so this is saying"},{"from":2040.51,"to":2042.22,"location":2,"content":"well if you want the probability"},{"from":2042.22,"to":2044.53,"location":2,"content":"distribution of words is you work out"},{"from":2044.53,"to":2047.32,"location":2,"content":"this 3/4 power of the count of the word"},{"from":2047.32,"to":2049.36,"location":2,"content":"for every word in the vocabulary and"},{"from":2049.36,"to":2051.94,"location":2,"content":"then these numbers you just sum them up"},{"from":2051.94,"to":2053.74,"location":2,"content":"over the vocabulary and it'll be sum"},{"from":2053.74,"to":2056.11,"location":2,"content":"total and we're dividing by that so we"},{"from":2056.11,"to":2057.88,"location":2,"content":"get a probability distribution good"},{"from":2057.88,"to":2059.47,"location":2,"content":"question because I hadn't explained that"},{"from":2059.47,"to":2061.96,"location":2,"content":"in this class when you see the letter Z"},{"from":2061.96,"to":2064.9,"location":2,"content":"with no explanation it normally means I"},{"from":2064.9,"to":2068.47,"location":2,"content":"am a normalization term to turn things"},{"from":2068.47,"to":2070.54,"location":2,"content":"into probabilities and you sort of"},{"from":2070.54,"to":2072.37,"location":2,"content":"iterate over the numerator term and"},{"from":2072.37,"to":2075.37,"location":2,"content":"summing them and divide through any"},{"from":2075.37,"to":2076.96,"location":2,"content":"other questions of things I haven't"},{"from":2076.96,"to":2087.07,"location":2,"content":"explained or otherwise yes yes so it's"},{"from":2087.07,"to":2089.02,"location":2,"content":"nice window do you so I'll actually come"},{"from":2089.02,"to":2091.06,"location":2,"content":"back to that in a bit and show a little"},{"from":2091.06,"to":2093.13,"location":2,"content":"bit of data on that but yeah we haven't"},{"from":2093.13,"to":2094.21,"location":2,"content":"done anything about"},{"from":2094.21,"to":2095.56,"location":2,"content":"that at the moment we're guessing a"},{"from":2095.56,"to":2097.84,"location":2,"content":"window size like five which isn't a bad"},{"from":2097.84,"to":2098.8,"location":2,"content":"one"},{"from":2098.8,"to":2100.99,"location":2,"content":"but you know there isn't there hasn't"},{"from":2100.99,"to":2104.53,"location":2,"content":"really been any science behind that that"},{"from":2104.53,"to":2106.3,"location":2,"content":"people treat that as what's then called"},{"from":2106.3,"to":2109.69,"location":2,"content":"a hyper parameter which means that 
you"},{"from":2109.69,"to":2111.88,"location":2,"content":"try a few different numbers and see"},{"from":2111.88,"to":2114.13,"location":2,"content":"which one seems best and that's the one"},{"from":2114.13,"to":2118.26,"location":2,"content":"that you use in your future work yeah"},{"from":2118.26,"to":2125.49,"location":2,"content":"our chosen for any theoretical reason no"},{"from":2125.49,"to":2130.24,"location":2,"content":"that that was also chosen as a hyper"},{"from":2130.24,"to":2133.06,"location":2,"content":"parameter that improved performance I"},{"from":2133.06,"to":2136.78,"location":2,"content":"mean actually you know for this word to"},{"from":2136.78,"to":2140.35,"location":2,"content":"vector I mean you know it turns out that"},{"from":2140.35,"to":2145.21,"location":2,"content":"in the actual paper the model looks very"},{"from":2145.21,"to":2148.51,"location":2,"content":"fairly clean but what people discovered"},{"from":2148.51,"to":2150.73,"location":2,"content":"when they started digging through the"},{"from":2150.73,"to":2153.46,"location":2,"content":"code which to to their credit they did"},{"from":2153.46,"to":2155.62,"location":2,"content":"make available reproducible research"},{"from":2155.62,"to":2158.44,"location":2,"content":"that they're actually a whole bunch of"},{"from":2158.44,"to":2161.26,"location":2,"content":"tricks of different things like these"},{"from":2161.26,"to":2164.59,"location":2,"content":"hyper parameters of how you sample and"},{"from":2164.59,"to":2166.9,"location":2,"content":"how you wait windows and various things"},{"from":2166.9,"to":2169.12,"location":2,"content":"to make the numbers better so you know"},{"from":2169.12,"to":2171.13,"location":2,"content":"people play quite a few tricks to make"},{"from":2171.13,"to":2172.72,"location":2,"content":"the numbers go up which aren't"},{"from":2172.72,"to":2201.06,"location":2,"content":"particularly theoretical good sometimes"},{"from":2201.06,"to":2207.28,"location":2,"content":"I so I you so in general for a lot of"},{"from":2207.28,"to":2209.31,"location":2,"content":"these sampling things that's a bad idea"},{"from":2209.31,"to":2211.96,"location":2,"content":"you're going to be doing multiple passes"},{"from":2211.96,"to":2214.18,"location":2,"content":"if you just go boom boom full and then"},{"from":2214.18,"to":2215.95,"location":2,"content":"bloom bloom bloom again that's a bad"},{"from":2215.95,"to":2218.47,"location":2,"content":"idea but a common technique a lot of the"},{"from":2218.47,"to":2220.81,"location":2,"content":"packages use is that they do use a"},{"from":2220.81,"to":2223.06,"location":2,"content":"shuffling operation at the beginning so"},{"from":2223.06,"to":2225.22,"location":2,"content":"for each epoch they'll shuffle the data"},{"from":2225.22,"to":2227.38,"location":2,"content":"randomly and then they'll go through it"},{"from":2227.38,"to":2227.65,"location":2,"content":"and"},{"from":2227.65,"to":2229.78,"location":2,"content":"sequence and that has the benefits of"},{"from":2229.78,"to":2233.47,"location":2,"content":"faster computation from locality etc"},{"from":2233.47,"to":2235.84,"location":2,"content":"while meaning that when you do a"},{"from":2235.84,"to":2237.31,"location":2,"content":"differently pocket will work out"},{"from":2237.31,"to":2249.49,"location":2,"content":"differently yeah that last question I"},{"from":2249.49,"to":2251.55,"location":2,"content":"think was talking about taking 
the"},{"from":2251.55,"to":2254.04,"location":2,"content":"mini-batches from the corpus and"},{"from":2254.04,"to":2256.21,"location":2,"content":"contrasting whether you actually say"},{"from":2256.21,"to":2258.55,"location":2,"content":"sample 20 randomly from the whole corpus"},{"from":2258.55,"to":2260.47,"location":2,"content":"versus just sort of working from left to"},{"from":2260.47,"to":2278.59,"location":2,"content":"right you have a question you could"},{"from":2278.59,"to":2280.84,"location":2,"content":"argue whether or not this was written in"},{"from":2280.84,"to":2283.24,"location":2,"content":"the clearest way but right so we're"},{"from":2283.24,"to":2286,"location":2,"content":"making this dot product and then when"},{"from":2286,"to":2288.57,"location":2,"content":"the gating eart which is then flipping"},{"from":2288.57,"to":2291.34,"location":2,"content":"which side of the space we're on right"},{"from":2291.34,"to":2294.94,"location":2,"content":"because the sigmoid is symmetric around"},{"from":2294.94,"to":2298.2,"location":2,"content":"zero so if we've got some dot product"},{"from":2298.2,"to":2301.39,"location":2,"content":"and then we negate it we're sort of"},{"from":2301.39,"to":2304.84,"location":2,"content":"working out a 1 minus probability and so"},{"from":2304.84,"to":2307.96,"location":2,"content":"that's the way in which we're actually"},{"from":2307.96,"to":2311.38,"location":2,"content":"for the first term for the first term"},{"from":2311.38,"to":2313.15,"location":2,"content":"we're wanting the probability to be high"},{"from":2313.15,"to":2315.76,"location":2,"content":"and then for the negative samples we're"},{"from":2315.76,"to":2318.81,"location":2,"content":"wanting their probabilities you be low"},{"from":2318.81,"to":2324.85,"location":2,"content":"okay oh maybe run ahead now so this was"},{"from":2324.85,"to":2329.17,"location":2,"content":"an algorithm which sort of you're going"},{"from":2329.17,"to":2331.39,"location":2,"content":"through this corpus position by position"},{"from":2331.39,"to":2334.81,"location":2,"content":"and you're sort of doing this prediction"},{"from":2334.81,"to":2337.36,"location":2,"content":"of words and then you're updating some"},{"from":2337.36,"to":2338.98,"location":2,"content":"parameters and you're learning something"},{"from":2338.98,"to":2341.32,"location":2,"content":"and you know by Jove it seemed to work"},{"from":2341.32,"to":2344.53,"location":2,"content":"based on what we saw in the examples but"},{"from":2344.53,"to":2346.69,"location":2,"content":"you know you might have thought that"},{"from":2346.69,"to":2349.36,"location":2,"content":"that was kind of weird right look we"},{"from":2349.36,"to":2351.79,"location":2,"content":"have this whole big pile of data you"},{"from":2351.79,"to":2355.27,"location":2,"content":"know sort of traditional I thinking of"},{"from":2355.27,"to":2357.46,"location":2,"content":"Statistics right if you have a big pile"},{"from":2357.46,"to":2360.19,"location":2,"content":"of data you a granade it and it sort of"},{"from":2360.19,"to":2361.51,"location":2,"content":"seems like there obviously"},{"from":2361.51,"to":2363.19,"location":2,"content":"you could do here you could say well"},{"from":2363.19,"to":2365.77,"location":2,"content":"there's a word like whatever word we're"},{"from":2365.77,"to":2368.23,"location":2,"content":"using banana let's just see what words"},{"from":2368.23,"to":2370.72,"location":2,"content":"occur in the context of a gut banana 
and"},{"from":2370.72,"to":2372.67,"location":2,"content":"count them all up and then we'll be able"},{"from":2372.67,"to":2375.22,"location":2,"content":"to use those to predict somehow and you"},{"from":2375.22,"to":2377.58,"location":2,"content":"know those kinds of methods were"},{"from":2377.58,"to":2380.95,"location":2,"content":"traditionally used including even with"},{"from":2380.95,"to":2383.8,"location":2,"content":"distributed representation techniques so"},{"from":2383.8,"to":2385.51,"location":2,"content":"I want to say a bit about that so you're"},{"from":2385.51,"to":2388.21,"location":2,"content":"fully educated and don't sound like one"},{"from":2388.21,"to":2390.85,"location":2,"content":"of those people who were aware of no"},{"from":2390.85,"to":2394.03,"location":2,"content":"work that happened before 2013 when your"},{"from":2394.03,"to":2397.63,"location":2,"content":"networks took off okay so what we could"},{"from":2397.63,"to":2400.12,"location":2,"content":"do is we can essentially do the same"},{"from":2400.12,"to":2403.6,"location":2,"content":"thing as sort of word to Veck we could"},{"from":2403.6,"to":2406.93,"location":2,"content":"say there's a five word window around"},{"from":2406.93,"to":2409.21,"location":2,"content":"each word instance that's often referred"},{"from":2409.21,"to":2411.91,"location":2,"content":"to as a word token right so in NLP we"},{"from":2411.91,"to":2413.68,"location":2,"content":"often want to distinguish between a"},{"from":2413.68,"to":2417.63,"location":2,"content":"particular kind of type like banana or"},{"from":2417.63,"to":2420.64,"location":2,"content":"Apple versus particular instances of an"},{"from":2420.64,"to":2422.35,"location":2,"content":"in the text and that's referred to sort"},{"from":2422.35,"to":2425.58,"location":2,"content":"of a type token distinction so we could"},{"from":2425.58,"to":2429.82,"location":2,"content":"look at each token of a word and the"},{"from":2429.82,"to":2431.68,"location":2,"content":"words five around that and then we"},{"from":2431.68,"to":2433.93,"location":2,"content":"should so start counting up which words"},{"from":2433.93,"to":2436.84,"location":2,"content":"occur occur with it and so we can then"},{"from":2436.84,"to":2441.84,"location":2,"content":"have a matrix of co-occurrence counts"},{"from":2441.84,"to":2445,"location":2,"content":"okay so we'll have again and I'm going"},{"from":2445,"to":2446.62,"location":2,"content":"to give an example of this so normally"},{"from":2446.62,"to":2448.87,"location":2,"content":"again use a five to ten but you know I"},{"from":2448.87,"to":2451.03,"location":2,"content":"can just use a window of one to keep my"},{"from":2451.03,"to":2453.61,"location":2,"content":"counts very simple and small I ignore"},{"from":2453.61,"to":2455.98,"location":2,"content":"left or right just like word defected"},{"from":2455.98,"to":2458.47,"location":2,"content":"and so if I have a teeny baby corpus"},{"from":2458.47,"to":2460.93,"location":2,"content":"like this you know what I could do is"},{"from":2460.93,"to":2463.42,"location":2,"content":"just say here's a matrix of word"},{"from":2463.42,"to":2466.69,"location":2,"content":"co-occurrence accounts so within my"},{"from":2466.69,"to":2469.54,"location":2,"content":"window size of one eye occurs next to"},{"from":2469.54,"to":2471.88,"location":2,"content":"like twice and that means that like"},{"from":2471.88,"to":2474.7,"location":2,"content":"occurs next why twice it's symmetric 
and"},{"from":2474.7,"to":2476.8,"location":2,"content":"all my other counts here are single"},{"from":2476.8,"to":2481.44,"location":2,"content":"turns and so this gives me a big huge"},{"from":2481.44,"to":2484.66,"location":2,"content":"sparse matrix of word co-occurrence"},{"from":2484.66,"to":2486.88,"location":2,"content":"accounts and so one thing that you could"},{"from":2486.88,"to":2489.31,"location":2,"content":"do is just use this matrix directly"},{"from":2489.31,"to":2492.04,"location":2,"content":"because I haven't really got enough data"},{"from":2492.04,"to":2495.6,"location":2,"content":"here but you know if you sort of"},{"from":2495.6,"to":2499.18,"location":2,"content":"decided that you know the word like is"},{"from":2499.18,"to":2501.76,"location":2,"content":"like the word learning what you do is"},{"from":2501.76,"to":2503.77,"location":2,"content":"you'd expect that these two vectors"},{"from":2503.77,"to":2506.17,"location":2,"content":"would end up kind of similar to each"},{"from":2506.17,"to":2509.85,"location":2,"content":"other they do so you could just measure"},{"from":2509.85,"to":2513.07,"location":2,"content":"similarity of the vectors directly in"},{"from":2513.07,"to":2515.8,"location":2,"content":"terms of these co-occurrence counts but"},{"from":2515.8,"to":2517.96,"location":2,"content":"you know it's a little bit unappealing"},{"from":2517.96,"to":2520.36,"location":2,"content":"doing things this way right if you have"},{"from":2520.36,"to":2522.73,"location":2,"content":"a quarter million word vocabulary that's"},{"from":2522.73,"to":2525.85,"location":2,"content":"where you're in this space where my math"},{"from":2525.85,"to":2527.83,"location":2,"content":"is bad but it's in the trillions of the"},{"from":2527.83,"to":2530.17,"location":2,"content":"number of cells of this matrix might"},{"from":2530.17,"to":2532.21,"location":2,"content":"require a lot of storage though if"},{"from":2532.21,"to":2533.65,"location":2,"content":"you're clever and notice that most of"},{"from":2533.65,"to":2535.66,"location":2,"content":"the cells were zero and could do some"},{"from":2535.66,"to":2538.27,"location":2,"content":"clever sparse matrix representation"},{"from":2538.27,"to":2540.46,"location":2,"content":"might take a little bit less your"},{"from":2540.46,"to":2542.47,"location":2,"content":"classification models might have sparse"},{"from":2542.47,"to":2543.97,"location":2,"content":"the issues because you know a lot of"},{"from":2543.97,"to":2545.92,"location":2,"content":"those cells aren't present and so it"},{"from":2545.92,"to":2548.08,"location":2,"content":"might not be very robust and so there"},{"from":2548.08,"to":2549.97,"location":2,"content":"was a traditional answer to all of these"},{"from":2549.97,"to":2552.97,"location":2,"content":"things which is well maybe we can have"},{"from":2552.97,"to":2556.03,"location":2,"content":"that big co-occurrence count matrix and"},{"from":2556.03,"to":2559.24,"location":2,"content":"somehow reduce its dimensionality I've"},{"from":2559.24,"to":2563.08,"location":2,"content":"just find a corresponding low"},{"from":2563.08,"to":2566.23,"location":2,"content":"dimensional matrix which preserves most"},{"from":2566.23,"to":2569.14,"location":2,"content":"of the information in the original"},{"from":2569.14,"to":2571.87,"location":2,"content":"matrix and you know maybe all reduce"},{"from":2571.87,"to":2574.36,"location":2,"content":"things to a dimensionality of 
somewhere"},{"from":2574.36,"to":2577.27,"location":2,"content":"around the size 25 to a thousand has"},{"from":2577.27,"to":2580.09,"location":2,"content":"done with word Davich so there's sort of"},{"from":2580.09,"to":2582.7,"location":2,"content":"a standard most common way of doing this"},{"from":2582.7,"to":2585.7,"location":2,"content":"dimensionality reduction and you don't"},{"from":2585.7,"to":2587.11,"location":2,"content":"really have to understand all the math"},{"from":2587.11,"to":2588.49,"location":2,"content":"but you get to play with this in"},{"from":2588.49,"to":2592.36,"location":2,"content":"homework 1 which is for any matrix you"},{"from":2592.36,"to":2594.1,"location":2,"content":"can do what's called the singular value"},{"from":2594.1,"to":2596.8,"location":2,"content":"decomposition which is a way you can"},{"from":2596.8,"to":2599.59,"location":2,"content":"take an arbitrary matrix and decompose"},{"from":2599.59,"to":2604.63,"location":2,"content":"it into 3 matrices where the center one"},{"from":2604.63,"to":2607.27,"location":2,"content":"is diagonal and has what in it what are"},{"from":2607.27,"to":2609.34,"location":2,"content":"called singular vectors which are"},{"from":2609.34,"to":2611.14,"location":2,"content":"weightings of the different dimensions"},{"from":2611.14,"to":2613.84,"location":2,"content":"so they decrease in size as you go"},{"from":2613.84,"to":2617.05,"location":2,"content":"downwards and then these two U and V and"},{"from":2617.05,"to":2620.32,"location":2,"content":"then orthogonal basis corresponding to"},{"from":2620.32,"to":2622.57,"location":2,"content":"the rows and columns and so in"},{"from":2622.57,"to":2625.03,"location":2,"content":"particular it's even simpler in the case"},{"from":2625.03,"to":2626.65,"location":2,"content":"where we just have these word word"},{"from":2626.65,"to":2628.99,"location":2,"content":"vectors because you have a square matrix"},{"from":2628.99,"to":2631.36,"location":2,"content":"and so they're effectively the same but"},{"from":2631.36,"to":2633.73,"location":2,"content":"you know for the general case although"},{"from":2633.73,"to":2636.37,"location":2,"content":"you get these sort of full orthogonal"},{"from":2636.37,"to":2639.46,"location":2,"content":"basis you then have these bits sort of"},{"from":2639.46,"to":2641.08,"location":2,"content":"don't really matter cuz they end up"},{"from":2641.08,"to":2642.7,"location":2,"content":"being used for nothing when you work out"},{"from":2642.7,"to":2645.88,"location":2,"content":"the product and then if you want to"},{"from":2645.88,"to":2648.64,"location":2,"content":"reduce the dimensionality what you say"},{"from":2648.64,"to":2651.31,"location":2,"content":"is throw away the smallest singular"},{"from":2651.31,"to":2654.04,"location":2,"content":"values which remember they're in"},{"from":2654.04,"to":2656.65,"location":2,"content":"decreasing size and that means you're"},{"from":2656.65,"to":2659.26,"location":2,"content":"then effectively throwing away rows and"},{"from":2659.26,"to":2662.47,"location":2,"content":"columns of these other matrices and then"},{"from":2662.47,"to":2665.05,"location":2,"content":"it says behold I've now reduced these"},{"from":2665.05,"to":2666.27,"location":2,"content":"things to a two dimensional"},{"from":2666.27,"to":2669.13,"location":2,"content":"representation from the original three"},{"from":2669.13,"to":2671.17,"location":2,"content":"dimensional representation and 
that's"},{"from":2671.17,"to":2674.59,"location":2,"content":"referred to as the reduced SVD and the"},{"from":2674.59,"to":2677.44,"location":2,"content":"classic result is in terms of least"},{"from":2677.44,"to":2680.98,"location":2,"content":"squares error in estimation that this"},{"from":2680.98,"to":2684.25,"location":2,"content":"the product of these three things will"},{"from":2684.25,"to":2688.84,"location":2,"content":"give XK which is the best Kate rank K"},{"from":2688.84,"to":2691.81,"location":2,"content":"approximation to the original X in terms"},{"from":2691.81,"to":2694.72,"location":2,"content":"of a x squared least squares criterion"},{"from":2694.72,"to":2697.57,"location":2,"content":"so we could do this and we could build"},{"from":2697.57,"to":2703.06,"location":2,"content":"word vectors so I can make use of num"},{"from":2703.06,"to":2706.48,"location":2,"content":"PI's SVD function and I can throw into"},{"from":2706.48,"to":2708.21,"location":2,"content":"it"},{"from":2708.21,"to":2713.92,"location":2,"content":"matrices and I can make word vectors and"},{"from":2713.92,"to":2716.14,"location":2,"content":"these ones look really bad but hey I"},{"from":2716.14,"to":2718.65,"location":2,"content":"give it a data set of three sentences"},{"from":2718.65,"to":2721.6,"location":2,"content":"exactly a fair comparison but so this"},{"from":2721.6,"to":2726.49,"location":2,"content":"technique was in popularized around the"},{"from":2726.49,"to":2729.01,"location":2,"content":"term the turn of the millennium it"},{"from":2729.01,"to":2731.47,"location":2,"content":"generally went for some word"},{"from":2731.47,"to":2733.48,"location":2,"content":"applications under the name of latent"},{"from":2733.48,"to":2735.82,"location":2,"content":"semantic analysis or latent semantic"},{"from":2735.82,"to":2738.31,"location":2,"content":"indexing and the idea was that you would"},{"from":2738.31,"to":2741.34,"location":2,"content":"have these semantic directions that you"},{"from":2741.34,"to":2743.02,"location":2,"content":"were finding in this low dimensional"},{"from":2743.02,"to":2745.39,"location":2,"content":"space that had meaning and people worked"},{"from":2745.39,"to":2747.55,"location":2,"content":"with it quite a bit for techniques like"},{"from":2747.55,"to":2750.37,"location":2,"content":"do trying to do information retrieval"},{"from":2750.37,"to":2754.69,"location":2,"content":"using these LS a approximations and it's"},{"from":2754.69,"to":2757.99,"location":2,"content":"sort of worked a bit it kind of never"},{"from":2757.99,"to":2762.95,"location":2,"content":"really worked very well I think and so"},{"from":2762.95,"to":2767.27,"location":2,"content":"never sort of hugely caught on but it's"},{"from":2767.27,"to":2769.49,"location":2,"content":"so the myth it's kind of continued to be"},{"from":2769.49,"to":2771.74,"location":2,"content":"explored actually mainly in the sort of"},{"from":2771.74,"to":2774.5,"location":2,"content":"cog psyche of cogs site community where"},{"from":2774.5,"to":2776.27,"location":2,"content":"people were doing things with word"},{"from":2776.27,"to":2778.55,"location":2,"content":"meaning and there's this sort of kind of"},{"from":2778.55,"to":2779.96,"location":2,"content":"interesting"},{"from":2779.96,"to":2783.08,"location":2,"content":"lacunae to the literature that there was"},{"from":2783.08,"to":2787.43,"location":2,"content":"this guy Doug roadie who did a PhD 
at"},{"from":2787.43,"to":2792.74,"location":2,"content":"CMU in 2005 and basically what he"},{"from":2792.74,"to":2795.92,"location":2,"content":"discovered was look if rather than just"},{"from":2795.92,"to":2799.82,"location":2,"content":"using law counts I start doing quite a"},{"from":2799.82,"to":2803.24,"location":2,"content":"bit more in terms of you know fiddling"},{"from":2803.24,"to":2805.58,"location":2,"content":"with the counts I can start to produce"},{"from":2805.58,"to":2808.04,"location":2,"content":"results that are much better so rather"},{"from":2808.04,"to":2810.2,"location":2,"content":"than using war counts you have to do"},{"from":2810.2,"to":2811.76,"location":2,"content":"something to deal with those very"},{"from":2811.76,"to":2814.58,"location":2,"content":"high-frequency words so one idea is you"},{"from":2814.58,"to":2816.2,"location":2,"content":"could log scale them which is also"},{"from":2816.2,"to":2817.9,"location":2,"content":"commonly used in information retrieval"},{"from":2817.9,"to":2820.76,"location":2,"content":"another idea is you could just use"},{"from":2820.76,"to":2823.64,"location":2,"content":"something like a sealing function so you"},{"from":2823.64,"to":2826.76,"location":2,"content":"take the minimum of X comma T for T set"},{"from":2826.76,"to":2829.54,"location":2,"content":"and a some number like around a hundred"},{"from":2829.54,"to":2833.48,"location":2,"content":"he had he used the idea which was also"},{"from":2833.48,"to":2835.49,"location":2,"content":"another of the hex that was put into the"},{"from":2835.49,"to":2837.65,"location":2,"content":"word Tyvek was rather than just you"},{"from":2837.65,"to":2840.38,"location":2,"content":"treating the whole window the same that"},{"from":2840.38,"to":2842.84,"location":2,"content":"you should count words that are closer"},{"from":2842.84,"to":2845.48,"location":2,"content":"more so in word to vaca"},{"from":2845.48,"to":2847.88,"location":2,"content":"they sample closer words more commonly"},{"from":2847.88,"to":2850.7,"location":2,"content":"than further away words in his system"},{"from":2850.7,"to":2851.66,"location":2,"content":"you're sort of having to have a"},{"from":2851.66,"to":2854.54,"location":2,"content":"differential count for closer words etc"},{"from":2854.54,"to":2858.53,"location":2,"content":"and then compared to any of that rather"},{"from":2858.53,"to":2861.23,"location":2,"content":"than using counts at all he then started"},{"from":2861.23,"to":2864.23,"location":2,"content":"using Pearson correlations which helped"},{"from":2864.23,"to":2866.63,"location":2,"content":"except they're sometimes negative and he"},{"from":2866.63,"to":2870.77,"location":2,"content":"decided that it helped if you then got"},{"from":2870.77,"to":2873.74,"location":2,"content":"rid of the negative values so in in some"},{"from":2873.74,"to":2875.56,"location":2,"content":"sense this sounds like a bag of hacks"},{"from":2875.56,"to":2879.14,"location":2,"content":"but on the other hand he was able to"},{"from":2879.14,"to":2881.84,"location":2,"content":"show that you know these transformed"},{"from":2881.84,"to":2884.27,"location":2,"content":"counts could actually then give you very"},{"from":2884.27,"to":2886.67,"location":2,"content":"useful word vectors as I'm about to show"},{"from":2886.67,"to":2890.09,"location":2,"content":"and well we have to realize that"},{"from":2890.09,"to":2893.03,"location":2,"content":"actually in slightly different 
forms"},{"from":2893.03,"to":2895.1,"location":2,"content":"several of these exact same counts are"},{"from":2895.1,"to":2896.6,"location":2,"content":"actually being used in word today as"},{"from":2896.6,"to":2906.1,"location":2,"content":"well"},{"from":2906.1,"to":2909.71,"location":2,"content":"yeah so so that's Annie I'm about to"},{"from":2909.71,"to":2912.44,"location":2,"content":"show exactly that that's actually a"},{"from":2912.44,"to":2915.35,"location":2,"content":"really interesting little bit of the"},{"from":2915.35,"to":2920.57,"location":2,"content":"data so yeah what yeah so the the thing"},{"from":2920.57,"to":2923,"location":2,"content":"if you do that you not only get word"},{"from":2923,"to":2925.64,"location":2,"content":"similarities are pretty good let me show"},{"from":2925.64,"to":2929.3,"location":2,"content":"you this example which is cleaner so"},{"from":2929.3,"to":2933.65,"location":2,"content":"this the precise idea of evaluating with"},{"from":2933.65,"to":2936.23,"location":2,"content":"analogies was not something that had"},{"from":2936.23,"to":2937.73,"location":2,"content":"really been developed so that was"},{"from":2937.73,"to":2940.03,"location":2,"content":"actually something that marsh mica love"},{"from":2940.03,"to":2944.93,"location":2,"content":"suggested but actually dug roadie made"},{"from":2944.93,"to":2948.77,"location":2,"content":"this really interesting observation"},{"from":2948.77,"to":2952.49,"location":2,"content":"which was he said look once I do these"},{"from":2952.49,"to":2955.01,"location":2,"content":"kind of transformations to improve the"},{"from":2955.01,"to":2956.87,"location":2,"content":"semantic representation of my word"},{"from":2956.87,"to":2960.05,"location":2,"content":"vectors look this really interesting"},{"from":2960.05,"to":2963.8,"location":2,"content":"property emerges that what you find is"},{"from":2963.8,"to":2967.64,"location":2,"content":"that there are semantic vectors which"},{"from":2967.64,"to":2970.85,"location":2,"content":"are basically linear components in my"},{"from":2970.85,"to":2973.58,"location":2,"content":"carefully constructed space so here we"},{"from":2973.58,"to":2976.73,"location":2,"content":"have the sort of verb to the doer of the"},{"from":2976.73,"to":2980.6,"location":2,"content":"verb Direction drive driver clean"},{"from":2980.6,"to":2984.44,"location":2,"content":"janitor swim swimmer learn teach or"},{"from":2984.44,"to":2988.61,"location":2,"content":"teach teacher doctor trade priest pray I"},{"from":2988.61,"to":2990.53,"location":2,"content":"mean you know it's not exactly perfect"},{"from":2990.53,"to":2992.18,"location":2,"content":"you know there's a little bit of wiggle"},{"from":2992.18,"to":2994.91,"location":2,"content":"there right but you know roughly it's"},{"from":2994.91,"to":2996.92,"location":2,"content":"completely clear that there's sort of a"},{"from":2996.92,"to":2999.71,"location":2,"content":"direction in the space that corresponds"},{"from":2999.71,"to":3003.97,"location":2,"content":"to from a verb to the doer of a verb and"},{"from":3003.97,"to":3007.18,"location":2,"content":"yeah so he hem to scott he no one has"},{"from":3007.18,"to":3008.56,"location":2,"content":"thought of this idea of doing the"},{"from":3008.56,"to":3012.43,"location":2,"content":"analogies its tests but the thing in"},{"from":3012.43,"to":3015.43,"location":2,"content":"retrospect that's obvious is if you 
can"},{"from":3015.43,"to":3018.22,"location":2,"content":"construct a vector space that has this"},{"from":3018.22,"to":3021.82,"location":2,"content":"linearity property then you're"},{"from":3021.82,"to":3023.59,"location":2,"content":"definitely going to do well in analogies"},{"from":3023.59,"to":3025.87,"location":2,"content":"so effectively he had invented a vector"},{"from":3025.87,"to":3027.79,"location":2,"content":"space that do well in analogies because"},{"from":3027.79,"to":3030.33,"location":2,"content":"this means that you've got there"},{"from":3030.33,"to":3033.12,"location":2,"content":"direction which is the doer and then you"},{"from":3033.12,"to":3034.77,"location":2,"content":"can immediately say that's the doer"},{"from":3034.77,"to":3036.12,"location":2,"content":"vector which you can get from"},{"from":3036.12,"to":3038.19,"location":2,"content":"subtracting clean from Schwimmer and"},{"from":3038.19,"to":3040.77,"location":2,"content":"that's right so clean from janitor and"},{"from":3040.77,"to":3043.38,"location":2,"content":"then we can add it on to swim and we'll"},{"from":3043.38,"to":3046.2,"location":2,"content":"get somewhere close to swimmer so his"},{"from":3046.2,"to":3048.03,"location":2,"content":"space actually did do that"},{"from":3048.03,"to":3052.8,"location":2,"content":"and so this isn't so the mole in some"},{"from":3052.8,"to":3056.16,"location":2,"content":"sense is if you have if you kind of do"},{"from":3056.16,"to":3058.17,"location":2,"content":"carefully control accounts and so on"},{"from":3058.17,"to":3061.23,"location":2,"content":"that conventional methods can also give"},{"from":3061.23,"to":3064.68,"location":2,"content":"you good word vector spaces and I mean"},{"from":3064.68,"to":3066.18,"location":2,"content":"so that was actually the starting off"},{"from":3066.18,"to":3070.2,"location":2,"content":"point for our work on glove so that"},{"from":3070.2,"to":3071.64,"location":2,"content":"essentially there'd been these two"},{"from":3071.64,"to":3075.51,"location":2,"content":"schools of work there had been this"},{"from":3075.51,"to":3077.82,"location":2,"content":"school of work that been explored more"},{"from":3077.82,"to":3080.52,"location":2,"content":"in cog side than anywhere else which had"},{"from":3080.52,"to":3082.89,"location":2,"content":"been based on counting and transforming"},{"from":3082.89,"to":3085.77,"location":2,"content":"counts and you know it had some"},{"from":3085.77,"to":3088.11,"location":2,"content":"advantages or it seemed it had some"},{"from":3088.11,"to":3091.44,"location":2,"content":"advantages right that you're making sort"},{"from":3091.44,"to":3093.33,"location":2,"content":"of efficient use of statistics as you're"},{"from":3093.33,"to":3095.16,"location":2,"content":"using the global statistics of the whole"},{"from":3095.16,"to":3098.58,"location":2,"content":"matrix directly to estimate things and"},{"from":3098.58,"to":3101.88,"location":2,"content":"at that point up until then had really"},{"from":3101.88,"to":3103.56,"location":2,"content":"only been used to capture words"},{"from":3103.56,"to":3106.89,"location":2,"content":"similarity and a lot of it had suffered"},{"from":3106.89,"to":3109.65,"location":2,"content":"from disproportionate input importance"},{"from":3109.65,"to":3112.71,"location":2,"content":"given to large counts but Doug rody it"},{"from":3112.71,"to":3114.45,"location":2,"content":"sort of started to show how to 
solve"},{"from":3114.45,"to":3116.94,"location":2,"content":"both of these problems and so on the"},{"from":3116.94,"to":3118.62,"location":2,"content":"other hand there'd been these neural"},{"from":3118.62,"to":3120.75,"location":2,"content":"network methods which are kind of direct"},{"from":3120.75,"to":3122.85,"location":2,"content":"prediction methods that we were defining"},{"from":3122.85,"to":3125.1,"location":2,"content":"that probability distribution and trying"},{"from":3125.1,"to":3127.5,"location":2,"content":"to predict the words that occur and they"},{"from":3127.5,"to":3130.5,"location":2,"content":"had some advantages right the fact that"},{"from":3130.5,"to":3132.72,"location":2,"content":"you're sampling means that you're not"},{"from":3132.72,"to":3133.86,"location":2,"content":"going to run out of memory"},{"from":3133.86,"to":3136.08,"location":2,"content":"hopefully I know we've had some memory"},{"from":3136.08,"to":3137.58,"location":2,"content":"problems with homework one"},{"from":3137.58,"to":3139.2,"location":2,"content":"but in principle you're not as bad a"},{"from":3139.2,"to":3141.45,"location":2,"content":"memory position and if you have to"},{"from":3141.45,"to":3143.4,"location":2,"content":"construct a huge matrix because you're"},{"from":3143.4,"to":3146.25,"location":2,"content":"going linearly but you know since you're"},{"from":3146.25,"to":3148.59,"location":2,"content":"doing it sample by sample as inefficient"},{"from":3148.59,"to":3153.96,"location":2,"content":"use of Statistics okay and so but on the"},{"from":3153.96,"to":3155.55,"location":2,"content":"other hand Michael loves work it"},{"from":3155.55,"to":3158.16,"location":2,"content":"performed perfectly off not perfectly"},{"from":3158.16,"to":3161.31,"location":2,"content":"that really well so this sort of led"},{"from":3161.31,"to":3163.07,"location":2,"content":"into this work"},{"from":3163.07,"to":3165.77,"location":2,"content":"that Jeffrey Pennington which is social"},{"from":3165.77,"to":3168.83,"location":2,"content":"media dove can we sort of combine these"},{"from":3168.83,"to":3172.39,"location":2,"content":"ideas and sort of have some of the"},{"from":3172.39,"to":3176.05,"location":2,"content":"goodness of the neural net methods while"},{"from":3176.05,"to":3179.18,"location":2,"content":"trying to do things with some kind of"},{"from":3179.18,"to":3182.87,"location":2,"content":"count matrix and so in particular we"},{"from":3182.87,"to":3185,"location":2,"content":"wanted to get the result in a slightly"},{"from":3185,"to":3188.36,"location":2,"content":"less hacky way that you want to have"},{"from":3188.36,"to":3191.03,"location":2,"content":"components of meaning being linear"},{"from":3191.03,"to":3192.02,"location":2,"content":"operative"},{"from":3192.02,"to":3193.97,"location":2,"content":"linear operations in the vector space"},{"from":3193.97,"to":3196.25,"location":2,"content":"that they're just some effector you're"},{"from":3196.25,"to":3198.59,"location":2,"content":"adding or something like this and so the"},{"from":3198.59,"to":3200.93,"location":2,"content":"crucial observation of this model was"},{"from":3200.93,"to":3203.72,"location":2,"content":"that we could use ratios of"},{"from":3203.72,"to":3206.03,"location":2,"content":"co-occurrence probabilities to encode"},{"from":3206.03,"to":3208.7,"location":2,"content":"meaning components and so the idea here"},{"from":3208.7,"to":3212.24,"location":2,"content":"is if you have a word like ice and 
you"},{"from":3212.24,"to":3213.92,"location":2,"content":"say how often the things going to"},{"from":3213.92,"to":3215.18,"location":2,"content":"co-occur with that"},{"from":3215.18,"to":3218.03,"location":2,"content":"well solid should co-occur a lot and gas"},{"from":3218.03,"to":3221,"location":2,"content":"should end but well water is also going"},{"from":3221,"to":3223.85,"location":2,"content":"to co-occur a lot and some random word"},{"from":3223.85,"to":3229.91,"location":2,"content":"won't occur much if you have oops if you"},{"from":3229.91,"to":3233.72,"location":2,"content":"have steam you get the opposite pattern"},{"from":3233.72,"to":3237.08,"location":2,"content":"with solid and gas right but so the"},{"from":3237.08,"to":3240.05,"location":2,"content":"thing to notice is it's not enough to"},{"from":3240.05,"to":3242.15,"location":2,"content":"just have large by itself because large"},{"from":3242.15,"to":3244.82,"location":2,"content":"appears both here and here or small"},{"from":3244.82,"to":3246.92,"location":2,"content":"appears there and there the thing that's"},{"from":3246.92,"to":3248.75,"location":2,"content":"interesting is sort of the difference"},{"from":3248.75,"to":3250.91,"location":2,"content":"between these components and they're"},{"from":3250.91,"to":3253.46,"location":2,"content":"indicating a meaning component and so we"},{"from":3253.46,"to":3257.21,"location":2,"content":"can get at that if we look at the ratio"},{"from":3257.21,"to":3260.66,"location":2,"content":"of co-occurrence probabilities and so"},{"from":3260.66,"to":3262.54,"location":2,"content":"for the ratio co-occurrence"},{"from":3262.54,"to":3265.76,"location":2,"content":"probabilities this is a dimension of"},{"from":3265.76,"to":3269.96,"location":2,"content":"meaning and where for other words this"},{"from":3269.96,"to":3272.66,"location":2,"content":"sort of ratio cancels out to about one"},{"from":3272.66,"to":3276.02,"location":2,"content":"and so in this slide I've moved so it's"},{"from":3276.02,"to":3278.81,"location":2,"content":"not my small and large but these are"},{"from":3278.81,"to":3280.94,"location":2,"content":"actually actual counts from a corpus so"},{"from":3280.94,"to":3283.22,"location":2,"content":"we roughly get dimension of meaning"},{"from":3283.22,"to":3285.56,"location":2,"content":"between solid and gas are the ones"},{"from":3285.56,"to":3287.57,"location":2,"content":"coming out is about one because they're"},{"from":3287.57,"to":3290.81,"location":2,"content":"not the dimension of meaning and so it"},{"from":3290.81,"to":3292.73,"location":2,"content":"seems like what we want is we want to"},{"from":3292.73,"to":3294.86,"location":2,"content":"have ratio of co-occurrence"},{"from":3294.86,"to":3296.54,"location":2,"content":"probabilities"},{"from":3296.54,"to":3299.21,"location":2,"content":"come linear in our space and then we're"},{"from":3299.21,"to":3301.82,"location":2,"content":"in a good business and so that's what we"},{"from":3301.82,"to":3304.37,"location":2,"content":"want to set about doing well how can you"},{"from":3304.37,"to":3307.48,"location":2,"content":"do that well the way you can do that is"},{"from":3307.48,"to":3311.06,"location":2,"content":"by if you can make the dot products"},{"from":3311.06,"to":3314.39,"location":2,"content":"equal to the log of the co-occurrence"},{"from":3314.39,"to":3318.02,"location":2,"content":"probability then immediately you get the"},{"from":3318.02,"to":3319.97,"location":2,"content":"fact 
that when you have a vector"},{"from":3319.97,"to":3324.02,"location":2,"content":"difference it turns into a ratio of the"},{"from":3324.02,"to":3327.59,"location":2,"content":"co-occurrence probabilities and so"},{"from":3327.59,"to":3330.02,"location":2,"content":"essentially the whole of the model is"},{"from":3330.02,"to":3332.03,"location":2,"content":"that we want to have dot products the"},{"from":3332.03,"to":3334.31,"location":2,"content":"logs of co-occurrence probabilities and"},{"from":3334.31,"to":3337.73,"location":2,"content":"so that's what we do so here is our"},{"from":3337.73,"to":3341.21,"location":2,"content":"objective function here and it's made to"},{"from":3341.21,"to":3343.55,"location":2,"content":"look a little bit more complicated but"},{"from":3343.55,"to":3346.16,"location":2,"content":"essentially we've got the squared loss"},{"from":3346.16,"to":3349.67,"location":2,"content":"here and then we're wanting to say the"},{"from":3349.67,"to":3351.8,"location":2,"content":"dot product should be as similar as"},{"from":3351.8,"to":3354.95,"location":2,"content":"possible to the log of the co-occurrence"},{"from":3354.95,"to":3357.35,"location":2,"content":"probability and so you'll they'll be"},{"from":3357.35,"to":3359.87,"location":2,"content":"lost to the extent that they're not the"},{"from":3359.87,"to":3362.6,"location":2,"content":"same but we kind of complexified a"},{"from":3362.6,"to":3365.48,"location":2,"content":"little by putting in biased terms for"},{"from":3365.48,"to":3368.03,"location":2,"content":"both of the two words because maybe the"},{"from":3368.03,"to":3370.16,"location":2,"content":"word is just overall common and likes to"},{"from":3370.16,"to":3373.28,"location":2,"content":"co-occur things or uncommon or doesn't"},{"from":3373.28,"to":3375.74,"location":2,"content":"and then we do one more little trick"},{"from":3375.74,"to":3377.48,"location":2,"content":"because every pun does tricks to make"},{"from":3377.48,"to":3379.76,"location":2,"content":"the performance better is that we also"},{"from":3379.76,"to":3383.48,"location":2,"content":"use this F function in front so that"},{"from":3383.48,"to":3385.31,"location":2,"content":"we're sort of capping the effect that"},{"from":3385.31,"to":3388.34,"location":2,"content":"very common word pairs can have on the"},{"from":3388.34,"to":3391.76,"location":2,"content":"performance of the system okay and so"},{"from":3391.76,"to":3394.04,"location":2,"content":"that gave us the glove model of word"},{"from":3394.04,"to":3398.96,"location":2,"content":"vectors and theoretically the interest"},{"from":3398.96,"to":3401.51,"location":2,"content":"of this was you know a lot of the"},{"from":3401.51,"to":3403.34,"location":2,"content":"preceding literature had been there been"},{"from":3403.34,"to":3404.87,"location":2,"content":"these count methods and there'd been"},{"from":3404.87,"to":3407.39,"location":2,"content":"these prediction methods and the hope"},{"from":3407.39,"to":3409.67,"location":2,"content":"was that this could sort of unify the"},{"from":3409.67,"to":3412.19,"location":2,"content":"two by showing you how you could have a"},{"from":3412.19,"to":3416,"location":2,"content":"method that is estimated simply off a"},{"from":3416,"to":3418.16,"location":2,"content":"count matrix but it's done in the same"},{"from":3418.16,"to":3420.95,"location":2,"content":"kind of iterative loss based estimation"},{"from":3420.95,"to":3422.75,"location":2,"content":"method that's used for the 
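The objective being walked through here is, in the notation of the GloVe paper, J = sum over i,j of f(X_ij) (w_i . w~_j + b_i + b~_j - log X_ij)^2, where f caps the influence of very common pairs. A sketch of that loss in NumPy, assuming X holds co-occurrence counts and using the published paper's f with x_max = 100 and exponent 3/4:

```python
import numpy as np

def glove_loss(X, W, W_tilde, b, b_tilde, x_max=100.0, alpha=0.75):
    """Weighted squared loss between dot products and log co-occurrences."""
    i, j = np.nonzero(X)                        # only observed pairs
    x = X[i, j]
    f = np.minimum((x / x_max) ** alpha, 1.0)   # caps common word pairs
    diff = (W[i] * W_tilde[j]).sum(axis=1) + b[i] + b_tilde[j] - np.log(x)
    return np.sum(f * diff ** 2)
```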
newer methods"},{"from":3422.75,"to":3425.78,"location":2,"content":"to get good word vectors and this also"},{"from":3425.78,"to":3427.49,"location":2,"content":"worked to give good word vectors so"},{"from":3427.49,"to":3430.31,"location":2,"content":"here's glove results for the word fraud"},{"from":3430.31,"to":3434.33,"location":2,"content":"and fog fogs and todor obvious but you"},{"from":3434.33,"to":3436.33,"location":2,"content":"know these different kinds of words"},{"from":3436.33,"to":3439.1,"location":2,"content":"various kinds of pretty tree frogs and"},{"from":3439.1,"to":3441.16,"location":2,"content":"things like that"},{"from":3441.16,"to":3445.91,"location":2,"content":"okay so I'll then go from here and say a"},{"from":3445.91,"to":3448.58,"location":2,"content":"little bit more about some of the work"},{"from":3448.58,"to":3451.67,"location":2,"content":"on evaluating word vectors and this is"},{"from":3451.67,"to":3453.44,"location":2,"content":"maybe also a chance just talking a"},{"from":3453.44,"to":3455.83,"location":2,"content":"little bit about evaluation all together"},{"from":3455.83,"to":3459.89,"location":2,"content":"so normally an NLP when we do evaluation"},{"from":3459.89,"to":3461.81,"location":2,"content":"the first thing that comes up is"},{"from":3461.81,"to":3465.56,"location":2,"content":"intrinsic versus extrinsic evaluation so"},{"from":3465.56,"to":3466.94,"location":2,"content":"normally if there's something we're"},{"from":3466.94,"to":3471.38,"location":2,"content":"trying to do like model words similarity"},{"from":3471.38,"to":3475.34,"location":2,"content":"with word vectors or we're trying to put"},{"from":3475.34,"to":3478.07,"location":2,"content":"parts of speech on words or something we"},{"from":3478.07,"to":3480.56,"location":2,"content":"can just have an intrinsic evaluation of"},{"from":3480.56,"to":3482.93,"location":2,"content":"saying how good a job did you get how"},{"from":3482.93,"to":3484.7,"location":2,"content":"you guessing the right part of speech"},{"from":3484.7,"to":3487.01,"location":2,"content":"are you putting synonyms close together"},{"from":3487.01,"to":3490.46,"location":2,"content":"and that's sort of normally very easy to"},{"from":3490.46,"to":3493.22,"location":2,"content":"do and faster compute and it's useful to"},{"from":3493.22,"to":3495.59,"location":2,"content":"do because it helps us understand the"},{"from":3495.59,"to":3498.05,"location":2,"content":"system on the other hand a lot of the"},{"from":3498.05,"to":3500.57,"location":2,"content":"time those intrinsic evaluations it's"},{"from":3500.57,"to":3503.9,"location":2,"content":"not very clear where whether having done"},{"from":3503.9,"to":3506.21,"location":2,"content":"well on that task is really going to"},{"from":3506.21,"to":3508.85,"location":2,"content":"help us build the amazing natural"},{"from":3508.85,"to":3511.01,"location":2,"content":"language understanding robots that we so"},{"from":3511.01,"to":3514.91,"location":2,"content":"ardently desire so people are also very"},{"from":3514.91,"to":3517.58,"location":2,"content":"interested in extrinsic evaluations and"},{"from":3517.58,"to":3520.28,"location":2,"content":"so extrinsic Lee is then saying well"},{"from":3520.28,"to":3523.88,"location":2,"content":"suppose you use this new stuff in a real"},{"from":3523.88,"to":3527.03,"location":2,"content":"system doesn't make performance go up"},{"from":3527.03,"to":3529.79,"location":2,"content":"and it's then sort of definitional 
what"},{"from":3529.79,"to":3532.13,"location":2,"content":"counts to you as a real system but"},{"from":3532.13,"to":3533.99,"location":2,"content":"normally that's meaning it's some"},{"from":3533.99,"to":3536.33,"location":2,"content":"application that human beings actually"},{"from":3536.33,"to":3539.24,"location":2,"content":"care about and like to use so that's"},{"from":3539.24,"to":3541.48,"location":2,"content":"something like web search or"},{"from":3541.48,"to":3542.63,"location":2,"content":"question-answering"},{"from":3542.63,"to":3545.45,"location":2,"content":"or a phone dialogue system or something"},{"from":3545.45,"to":3548.03,"location":2,"content":"like that that you can put it into that"},{"from":3548.03,"to":3551.87,"location":2,"content":"system and the numbers get go up so that"},{"from":3551.87,"to":3553.7,"location":2,"content":"seems what you want to do you want to"},{"from":3553.7,"to":3555.53,"location":2,"content":"have stuff that works in real tasks of"},{"from":3555.53,"to":3557.9,"location":2,"content":"course there are sort of on the other"},{"from":3557.9,"to":3559.67,"location":2,"content":"hand a lot of things are a lot harder"},{"from":3559.67,"to":3563.18,"location":2,"content":"then so it's much more work to do such"},{"from":3563.18,"to":3564.1,"location":2,"content":"an evaluate"},{"from":3564.1,"to":3566.47,"location":2,"content":"and to run different variants of a"},{"from":3566.47,"to":3570.16,"location":2,"content":"system and even when the results are"},{"from":3570.16,"to":3573.58,"location":2,"content":"poor or great sometimes it's hard to"},{"from":3573.58,"to":3574.51,"location":2,"content":"diagnose"},{"from":3574.51,"to":3576.88,"location":2,"content":"you know if your great new word vectors"},{"from":3576.88,"to":3579.04,"location":2,"content":"don't work better in the system you know"},{"from":3579.04,"to":3580.81,"location":2,"content":"it might be for sort of some extraneous"},{"from":3580.81,"to":3582.82,"location":2,"content":"reason about how the system was built"},{"from":3582.82,"to":3584.86,"location":2,"content":"it's sort of hiding all your magic and"},{"from":3584.86,"to":3586.36,"location":2,"content":"if you just change the rest of the"},{"from":3586.36,"to":3588.82,"location":2,"content":"system and suddenly show it's good"},{"from":3588.82,"to":3591.61,"location":2,"content":"effect so it's kind of hard to do sort"},{"from":3591.61,"to":3597,"location":2,"content":"of apportionment of goodness and badness"},{"from":3597,"to":3600.52,"location":2,"content":"okay so um so today I'm mainly going to"},{"from":3600.52,"to":3601.87,"location":2,"content":"say a little bit more about these"},{"from":3601.87,"to":3604.78,"location":2,"content":"intrinsic word vector evaluations that"},{"from":3604.78,"to":3607,"location":2,"content":"we've talked about so we've talked quite"},{"from":3607,"to":3610.3,"location":2,"content":"a bit about these analogies so if we're"},{"from":3610.3,"to":3612.55,"location":2,"content":"actually working out the analogies it"},{"from":3612.55,"to":3614.62,"location":2,"content":"turns out that normally what people are"},{"from":3614.62,"to":3617.56,"location":2,"content":"doing is working out a cosine distance"},{"from":3617.56,"to":3621.61,"location":2,"content":"and angle between different word"},{"from":3621.61,"to":3625.27,"location":2,"content":"candidates to work out which is the word"},{"from":3625.27,"to":3627.58,"location":2,"content":"that solves the analogy which is 
another"},{"from":3627.58,"to":3629.92,"location":2,"content":"little tiny wrinkle of difference there"},{"from":3629.92,"to":3632.38,"location":2,"content":"and there's also one other trick that"},{"from":3632.38,"to":3634.66,"location":2,"content":"people commonly use they forbid the"},{"from":3634.66,"to":3636.88,"location":2,"content":"system from returning one of the three"},{"from":3636.88,"to":3641.86,"location":2,"content":"words you put into the analogy okay but"},{"from":3641.86,"to":3644.05,"location":2,"content":"nevertheless so this is something that"},{"from":3644.05,"to":3645.97,"location":2,"content":"you can evaluate here are now some"},{"from":3645.97,"to":3648.91,"location":2,"content":"gloves of visualizations and so these"},{"from":3648.91,"to":3651.34,"location":2,"content":"glove visualizations show exactly the"},{"from":3651.34,"to":3654.19,"location":2,"content":"same kind of linearity property that"},{"from":3654.19,"to":3656.35,"location":2,"content":"Doug rody had discovered which means"},{"from":3656.35,"to":3658.51,"location":2,"content":"that analogies work sort of by"},{"from":3658.51,"to":3660.16,"location":2,"content":"construction because our vector space"},{"from":3660.16,"to":3662.95,"location":2,"content":"wanted to make meaning components linear"},{"from":3662.95,"to":3667.81,"location":2,"content":"so this is then showing a gender display"},{"from":3667.81,"to":3670.36,"location":2,"content":"this is showing one between companies"},{"from":3670.36,"to":3674.29,"location":2,"content":"and their CEOs kind of cool and you can"},{"from":3674.29,"to":3676.54,"location":2,"content":"also do more syntactic facts so this is"},{"from":3676.54,"to":3679.09,"location":2,"content":"showing positive comparative and"},{"from":3679.09,"to":3682.12,"location":2,"content":"superlative of adjectives yes so"},{"from":3682.12,"to":3684.19,"location":2,"content":"tomorrow Michael off came up with this"},{"from":3684.19,"to":3689.11,"location":2,"content":"idea of doing these analogy tasks and so"},{"from":3689.11,"to":3691.48,"location":2,"content":"he built a data set with a lot of"},{"from":3691.48,"to":3694.09,"location":2,"content":"analogies in it it's sort of a it's a"},{"from":3694.09,"to":3696.16,"location":2,"content":"bit of a weirdo data set because it sort"},{"from":3696.16,"to":3697.08,"location":2,"content":"of tests"},{"from":3697.08,"to":3699.12,"location":2,"content":"few random different things which may"},{"from":3699.12,"to":3700.89,"location":2,"content":"have been things that his system worked"},{"from":3700.89,"to":3704.52,"location":2,"content":"well on but you know it test countries"},{"from":3704.52,"to":3710.25,"location":2,"content":"and capitals country you know cities and"},{"from":3710.25,"to":3712.92,"location":2,"content":"states countries and currencies so"},{"from":3712.92,"to":3714.78,"location":2,"content":"they're a bunch of semantic things that"},{"from":3714.78,"to":3719.13,"location":2,"content":"tests and then there are some syntactic"},{"from":3719.13,"to":3721.76,"location":2,"content":"things that tastes so bad worst fast"},{"from":3721.76,"to":3724.98,"location":2,"content":"fastest for superlatives but you know"},{"from":3724.98,"to":3727.23,"location":2,"content":"even some of the ones are showing before"},{"from":3727.23,"to":3729.99,"location":2,"content":"you know there's no there's no Obama is"},{"from":3729.99,"to":3732.69,"location":2,"content":"too Clinton kind of ones that 
are"},{"from":3732.69,"to":3736.8,"location":2,"content":"actually in this evaluation set here's a"},{"from":3736.8,"to":3740.01,"location":2,"content":"big table of results that comes from our"},{"from":3740.01,"to":3742.08,"location":2,"content":"glove paper so not surprisingly the"},{"from":3742.08,"to":3744.15,"location":2,"content":"glove paper perform best in this"},{"from":3744.15,"to":3748.31,"location":2,"content":"evaluation because there was our paper"},{"from":3748.31,"to":3751.89,"location":2,"content":"but I mean perhaps you know perhaps the"},{"from":3751.89,"to":3754.89,"location":2,"content":"things to start to notice is yeah if you"},{"from":3754.89,"to":3757.86,"location":2,"content":"just do a plain SPD on counts you know"},{"from":3757.86,"to":3761.73,"location":2,"content":"that that works abominably badly for"},{"from":3761.73,"to":3765.15,"location":2,"content":"these analogy tasks but you know kind of"},{"from":3765.15,"to":3767.88,"location":2,"content":"as Doug rody showed if you start then"},{"from":3767.88,"to":3771.66,"location":2,"content":"doing manipulations of the count matrix"},{"from":3771.66,"to":3774.54,"location":2,"content":"before you do an SVD you can actually"},{"from":3774.54,"to":3777.24,"location":2,"content":"start to produce an SVD based system"},{"from":3777.24,"to":3779.85,"location":2,"content":"that actually performs quite well on"},{"from":3779.85,"to":3783.51,"location":2,"content":"these tasks you know not badly against"},{"from":3783.51,"to":3787.29,"location":2,"content":"other things other things that you will"},{"from":3787.29,"to":3789.03,"location":2,"content":"discover right at the top there are a"},{"from":3789.03,"to":3790.77,"location":2,"content":"hundred dimensional ones and at the"},{"from":3790.77,"to":3792.57,"location":2,"content":"bottom there are some thousand"},{"from":3792.57,"to":3794.16,"location":2,"content":"dimensional ones and other three hundred"},{"from":3794.16,"to":3796.08,"location":2,"content":"dimensional ones at least when you're"},{"from":3796.08,"to":3797.91,"location":2,"content":"training on a big amount of text bigger"},{"from":3797.91,"to":3800.25,"location":2,"content":"dimensionality definitely works better"},{"from":3800.25,"to":3801.78,"location":2,"content":"and I'll come back to that in a minute"},{"from":3801.78,"to":3804.93,"location":2,"content":"the amount of text makes a difference as"},{"from":3804.93,"to":3807.57,"location":2,"content":"well right so we're going up from sort"},{"from":3807.57,"to":3809.85,"location":2,"content":"of Wonder 1.5 billion words the"},{"from":3809.85,"to":3812.07,"location":2,"content":"beginning to these ones down here being"},{"from":3812.07,"to":3814.2,"location":2,"content":"trained over 42 billion words of text"},{"from":3814.2,"to":3817.14,"location":2,"content":"and perhaps unsurprisingly the 42"},{"from":3817.14,"to":3820.58,"location":2,"content":"billion words of text ones work better"},{"from":3820.58,"to":3823.86,"location":2,"content":"so it's big data here are a couple more"},{"from":3823.86,"to":3826.56,"location":2,"content":"steps from this paper so this is a graph"},{"from":3826.56,"to":3828.99,"location":2,"content":"of dimensionality and what the"},{"from":3828.99,"to":3830.5,"location":2,"content":"performance is so if"},{"from":3830.5,"to":3832.87,"location":2,"content":"the three lines the green ones semantics"},{"from":3832.87,"to":3834.94,"location":2,"content":"the blue ones the syntactic 
analogies"},{"from":3834.94,"to":3838.3,"location":2,"content":"and so Reds the overall score so sort of"},{"from":3838.3,"to":3840.73,"location":2,"content":"what you see is up to dimensionality"},{"from":3840.73,"to":3842.83,"location":2,"content":"three hundred things are clearly"},{"from":3842.83,"to":3845.02,"location":2,"content":"increasing quite a bit and then it gets"},{"from":3845.02,"to":3847.33,"location":2,"content":"fairly flat which is precisely why you"},{"from":3847.33,"to":3849.88,"location":2,"content":"find a lot of word vectors that are have"},{"from":3849.88,"to":3852.73,"location":2,"content":"to mention how these three hundred this"},{"from":3852.73,"to":3855.88,"location":2,"content":"one's showing what window size so this"},{"from":3855.88,"to":3857.14,"location":2,"content":"is sort of what we talked about"},{"from":3857.14,"to":3860.74,"location":2,"content":"symmetric on both sides window size and"},{"from":3860.74,"to":3864.13,"location":2,"content":"as it goes from 2 4 6 8 10 and so what"},{"from":3864.13,"to":3866.38,"location":2,"content":"you see is if you use a very small"},{"from":3866.38,"to":3869.86,"location":2,"content":"window like - that actually works that"},{"from":3869.86,"to":3873.1,"location":2,"content":"the the syntactic prediction is stronger"},{"from":3873.1,"to":3874.69,"location":2,"content":"because a lot of syntactic effects are"},{"from":3874.69,"to":3877.51,"location":2,"content":"very local whereas as you go out the"},{"from":3877.51,"to":3879.4,"location":2,"content":"semantic prediction gets better and"},{"from":3879.4,"to":3879.73,"location":2,"content":"better"},{"from":3879.73,"to":3881.59,"location":2,"content":"actually this syntactic gets a bit"},{"from":3881.59,"to":3883.27,"location":2,"content":"better as well but it's especially the"},{"from":3883.27,"to":3886.21,"location":2,"content":"semantics the gains the right graph"},{"from":3886.21,"to":3888.73,"location":2,"content":"shows that if you only use context on"},{"from":3888.73,"to":3894.13,"location":2,"content":"one side your numbers aren't as good ok"},{"from":3894.13,"to":3896.74,"location":2,"content":"so I sort of just wanted to sort of"},{"from":3896.74,"to":3900.81,"location":2,"content":"sneak in a little cameos of a couple of"},{"from":3900.81,"to":3903.85,"location":2,"content":"recent bits of work how sort of a first"},{"from":3903.85,"to":3906.19,"location":2,"content":"of what things people are doing with"},{"from":3906.19,"to":3910.6,"location":2,"content":"word vectors so this one was actually by"},{"from":3910.6,"to":3914.02,"location":2,"content":"two standard people now the best this"},{"from":3914.02,"to":3915.58,"location":2,"content":"would be the best story if I could say"},{"from":3915.58,"to":3918.55,"location":2,"content":"that this was a final project in this"},{"from":3918.55,"to":3920.32,"location":2,"content":"class last year and I for sure that's"},{"from":3920.32,"to":3921.01,"location":2,"content":"not true"},{"from":3921.01,"to":3922.72,"location":2,"content":"this paper has nothing to do with this"},{"from":3922.72,"to":3931.05,"location":2,"content":"class but in here a Z in your neuron"},{"from":3931.05,"to":3935.44,"location":2,"content":"actually heads some sort of clever and"},{"from":3935.44,"to":3938.59,"location":2,"content":"very messy ideas where they are using"},{"from":3938.59,"to":3942.37,"location":2,"content":"matrix perturbation theory and sort of"},{"from":3942.37,"to":3945.4,"location":2,"content":"showing how 
dimensionality and word"},{"from":3945.4,"to":3947.5,"location":2,"content":"vectors actually sort of feeds into the"},{"from":3947.5,"to":3949.6,"location":2,"content":"bias-variance tradeoff if you've seen"},{"from":3949.6,"to":3952.51,"location":2,"content":"that in other parts of machine learning"},{"from":3952.51,"to":3954.25,"location":2,"content":"and I'm not even going to attempt to"},{"from":3954.25,"to":3957.7,"location":2,"content":"explain their paper but here it is that"},{"from":3957.7,"to":3959.08,"location":2,"content":"they did really well with this paper"},{"from":3959.08,"to":3961.36,"location":2,"content":"they got all talk at Europe's from it"},{"from":3961.36,"to":3962.81,"location":2,"content":"and"},{"from":3962.81,"to":3964.07,"location":2,"content":"but there's sort of an interesting"},{"from":3964.07,"to":3966.98,"location":2,"content":"result of what you see with these word"},{"from":3966.98,"to":3969.05,"location":2,"content":"vectors which is in the way kind of"},{"from":3969.05,"to":3971.93,"location":2,"content":"surprising so this is showing doing word"},{"from":3971.93,"to":3976.1,"location":2,"content":"vector dimensions from 0 up to 10,000"},{"from":3976.1,"to":3978.08,"location":2,"content":"sorry going way higher than we talked"},{"from":3978.08,"to":3980.57,"location":2,"content":"about before and so what you discover"},{"from":3980.57,"to":3983.63,"location":2,"content":"which people are known for ages is that"},{"from":3983.63,"to":3985.16,"location":2,"content":"there's sort of a little blip that"},{"from":3985.16,"to":3987.2,"location":2,"content":"somewhere around two or three hundred"},{"from":3987.2,"to":3989.98,"location":2,"content":"which seems to optimize performance"},{"from":3989.98,"to":3993.41,"location":2,"content":"abuse those sizes but the thing that"},{"from":3993.41,"to":3995.03,"location":2,"content":"they were sort of doing a lot of their"},{"from":3995.03,"to":3996.77,"location":2,"content":"theory about and it's kind of surprising"},{"from":3996.77,"to":4000.4,"location":2,"content":"is well surely if you have a humongous"},{"from":4000.4,"to":4002.83,"location":2,"content":"humongous number likes if you're using"},{"from":4002.83,"to":4006.52,"location":2,"content":"ten thousand dimensional vectors you"},{"from":4006.52,"to":4009.1,"location":2,"content":"know you're trying to estimate another"},{"from":4009.1,"to":4012.07,"location":2,"content":"two orders of magnitude more numbers for"},{"from":4012.07,"to":4014.74,"location":2,"content":"every word surely things should just"},{"from":4014.74,"to":4017.74,"location":2,"content":"fall apart because you've got hopelessly"},{"from":4017.74,"to":4020.32,"location":2,"content":"many parameters relative to the amount"},{"from":4020.32,"to":4022.18,"location":2,"content":"of training data that you're trying to"},{"from":4022.18,"to":4024.58,"location":2,"content":"estimate these numbers from and so the"},{"from":4024.58,"to":4026.71,"location":2,"content":"interesting result that they show is"},{"from":4026.71,"to":4031.06,"location":2,"content":"that things don't fall apart and that"},{"from":4031.06,"to":4033.9,"location":2,"content":"you can essentially go out to these huge"},{"from":4033.9,"to":4036.13,"location":2,"content":"dimensionalities and the performance"},{"from":4036.13,"to":4038.44,"location":2,"content":"stays flat and that they've got a lot of"},{"from":4038.44,"to":4041.68,"location":2,"content":"theory sort of for predicting why 
that's"},{"from":4041.68,"to":4043.09,"location":2,"content":"actually going to end up to being the"},{"from":4043.09,"to":4048.04,"location":2,"content":"case yeah so for training these models"},{"from":4048.04,"to":4050.83,"location":2,"content":"iteratively this is quick orange is"},{"from":4050.83,"to":4054.58,"location":2,"content":"showing Glove training you know they"},{"from":4054.58,"to":4056.56,"location":2,"content":"keep on getting better for a while so"},{"from":4056.56,"to":4059.53,"location":2,"content":"you know just go out go sleep see in the"},{"from":4059.53,"to":4061.42,"location":2,"content":"morning how it's doing right so that if"},{"from":4061.42,"to":4063.82,"location":2,"content":"you're running it for 24 hours your"},{"from":4063.82,"to":4065.8,"location":2,"content":"numbers are better than if you only ran"},{"from":4065.8,"to":4068.68,"location":2,"content":"it for six hours and that's true for a"},{"from":4068.68,"to":4072.1,"location":2,"content":"lot of deep learning model sorry so this"},{"from":4072.1,"to":4074.83,"location":2,"content":"is the key reason why you don't want to"},{"from":4074.83,"to":4076.84,"location":2,"content":"start your assignment the night before"},{"from":4076.84,"to":4080.05,"location":2,"content":"it's due because even if you program it"},{"from":4080.05,"to":4082.15,"location":2,"content":"perfectly you might just not have enough"},{"from":4082.15,"to":4084.85,"location":2,"content":"time for it to run so that you produce"},{"from":4084.85,"to":4090.34,"location":2,"content":"good numbers at the end of it okay"},{"from":4090.34,"to":4097.23,"location":2,"content":"yeah so so a couple more things on that"},{"from":4097.23,"to":4101.83,"location":2,"content":"yeah so what are we showing here so"},{"from":4101.83,"to":4104.53,"location":2,"content":"these are again semantic syntactic and"},{"from":4104.53,"to":4106.99,"location":2,"content":"overall numbers so there's sort of two"},{"from":4106.99,"to":4109,"location":2,"content":"things that are so being mixed together"},{"from":4109,"to":4111.76,"location":2,"content":"here one is if we just look at the"},{"from":4111.76,"to":4114.69,"location":2,"content":"overall numbers their highest over here"},{"from":4114.69,"to":4117.91,"location":2,"content":"which is this forty two billion Common"},{"from":4117.91,"to":4120.52,"location":2,"content":"Core web pages corpus that gives us the"},{"from":4120.52,"to":4122.71,"location":2,"content":"highest overall number but there's sort"},{"from":4122.71,"to":4124.72,"location":2,"content":"of something else that interesting in"},{"from":4124.72,"to":4129.07,"location":2,"content":"this graph which is that using Wikipedia"},{"from":4129.07,"to":4132.67,"location":2,"content":"works freakily well so that you actually"},{"from":4132.67,"to":4135.43,"location":2,"content":"find that 1.6 billion tokens of"},{"from":4135.43,"to":4139.3,"location":2,"content":"Wikipedia works better than 4.3 billion"},{"from":4139.3,"to":4142.84,"location":2,"content":"tokens of newswire newspaper article"},{"from":4142.84,"to":4146.41,"location":2,"content":"data and so I think that sort of"},{"from":4146.41,"to":4149.11,"location":2,"content":"actually makes sense which is well you"},{"from":4149.11,"to":4151.54,"location":2,"content":"know the job of encyclopedias is to sort"},{"from":4151.54,"to":4153.49,"location":2,"content":"of explain concepts and how they relate"},{"from":4153.49,"to":4155.73,"location":2,"content":"to each other right so 
that"},{"from":4155.73,"to":4157.99,"location":2,"content":"encyclopedias are just much more exposed"},{"from":4157.99,"to":4160.81,"location":2,"content":"tree texts that show all the connections"},{"from":4160.81,"to":4163.78,"location":2,"content":"between things whereas newspapers in"},{"from":4163.78,"to":4166.69,"location":2,"content":"general aren't trying to expose it how"},{"from":4166.69,"to":4168.31,"location":2,"content":"things fit together they're just telling"},{"from":4168.31,"to":4170.38,"location":2,"content":"you about you know who got shot dead"},{"from":4170.38,"to":4172.48,"location":2,"content":"last night or something like that"},{"from":4172.48,"to":4175.69,"location":2,"content":"right so so there's this sort of"},{"from":4175.69,"to":4177.04,"location":2,"content":"interesting fact"},{"from":4177.04,"to":4179.71,"location":2,"content":"but this Wikipedia data kind of really"},{"from":4179.71,"to":4183.87,"location":2,"content":"it sort of is differentially useful for"},{"from":4183.87,"to":4187.33,"location":2,"content":"making word vectors and you know in fact"},{"from":4187.33,"to":4190.69,"location":2,"content":"you know when we did very well with our"},{"from":4190.69,"to":4193,"location":2,"content":"glove word vectors and lots of people"},{"from":4193,"to":4195.16,"location":2,"content":"use those you know I think actually one"},{"from":4195.16,"to":4197.47,"location":2,"content":"of the reasons why they work so well is"},{"from":4197.47,"to":4199.87,"location":2,"content":"that the original word to vech vech does"},{"from":4199.87,"to":4202.42,"location":2,"content":"the google distributes built only on"},{"from":4202.42,"to":4205.12,"location":2,"content":"Google News data where Al's sort of have"},{"from":4205.12,"to":4209.97,"location":2,"content":"this Wikipedia data inside them okay"},{"from":4209.97,"to":4213.22,"location":2,"content":"rushing ahead yeah so that there's all"},{"from":4213.22,"to":4215.05,"location":2,"content":"the work on analogy but the other more"},{"from":4215.05,"to":4217.75,"location":2,"content":"basic evaluation is this one of"},{"from":4217.75,"to":4220.51,"location":2,"content":"capturing similarity judgment and I"},{"from":4220.51,"to":4222.85,"location":2,"content":"haven't said much about this but you"},{"from":4222.85,"to":4224.23,"location":2,"content":"know there's this sort of"},{"from":4224.23,"to":4227.2,"location":2,"content":"of sub literature in the psychology"},{"from":4227.2,"to":4230.08,"location":2,"content":"community where people have wanted to"},{"from":4230.08,"to":4233.38,"location":2,"content":"model humans judgments of similarity so"},{"from":4233.38,"to":4236.23,"location":2,"content":"like a good psyche person what you do is"},{"from":4236.23,"to":4238.54,"location":2,"content":"you find your classroom of Psych one"},{"from":4238.54,"to":4240.94,"location":2,"content":"under grads and you show them pairs of"},{"from":4240.94,"to":4243.28,"location":2,"content":"words and say rate these things for"},{"from":4243.28,"to":4245.71,"location":2,"content":"similarity on a scale of 1 to 10 and"},{"from":4245.71,"to":4248.23,"location":2,"content":"lots of that data has been collected and"},{"from":4248.23,"to":4250.54,"location":2,"content":"you work out the mean over human beings"},{"from":4250.54,"to":4253.51,"location":2,"content":"and they give numbers like this of Tiger"},{"from":4253.51,"to":4257.38,"location":2,"content":"and cat 7.35 Tigers similar to Tiger 
10"},{"from":4257.38,"to":4260.56,"location":2,"content":"book and paper plane and car stock and"},{"from":4260.56,"to":4263.56,"location":2,"content":"phone stock and CD and you get numbers"},{"from":4263.56,"to":4266.53,"location":2,"content":"so then what we're doing is wanting to"},{"from":4266.53,"to":4269.38,"location":2,"content":"say well let's use distance in the space"},{"from":4269.38,"to":4272.08,"location":2,"content":"to map directly onto these similarity"},{"from":4272.08,"to":4275.68,"location":2,"content":"judgments and how well does it map and"},{"from":4275.68,"to":4278.44,"location":2,"content":"so that sort of similarity judging has"},{"from":4278.44,"to":4281.56,"location":2,"content":"also then been used for evaluating these"},{"from":4281.56,"to":4283.84,"location":2,"content":"systems so again here are a lot of"},{"from":4283.84,"to":4285.7,"location":2,"content":"models this is again from our glove"},{"from":4285.7,"to":4287.95,"location":2,"content":"paper but so there are these various"},{"from":4287.95,"to":4290.41,"location":2,"content":"similarity data sets so one of the best"},{"from":4290.41,"to":4291.91,"location":2,"content":"known ones that I had on the slide"},{"from":4291.91,"to":4296.14,"location":2,"content":"before is this words in 3 5 3"},{"from":4296.14,"to":4300.64,"location":2,"content":"it has 353 different ones in it and so"},{"from":4300.64,"to":4303.1,"location":2,"content":"you're sort of then modeling a"},{"from":4303.1,"to":4305.56,"location":2,"content":"correlation between your judgments of"},{"from":4305.56,"to":4307.66,"location":2,"content":"similarity and the ones that came from"},{"from":4307.66,"to":4311.2,"location":2,"content":"the human beings ok two more things I"},{"from":4311.2,"to":4314.5,"location":2,"content":"want to say yeah so we had that problem"},{"from":4314.5,"to":4318.55,"location":2,"content":"right at the beginning of Clinton and"},{"from":4318.55,"to":4321.67,"location":2,"content":"how that could be various people and"},{"from":4321.67,"to":4323.86,"location":2,"content":"that's perhaps in some sense the"},{"from":4323.86,"to":4326.59,"location":2,"content":"simplest case of words being ambiguous"},{"from":4326.59,"to":4328.78,"location":2,"content":"when you have names which have reference"},{"from":4328.78,"to":4331.87,"location":2,"content":"to different people but it's not only"},{"from":4331.87,"to":4336.43,"location":2,"content":"true of names so by and large words in"},{"from":4336.43,"to":4340.39,"location":2,"content":"human languages ambiguous and have lots"},{"from":4340.39,"to":4343.54,"location":2,"content":"of meanings that's especially true of"},{"from":4343.54,"to":4345.82,"location":2,"content":"common words they always have lots of"},{"from":4345.82,"to":4348.52,"location":2,"content":"meaning it's especially true of words"},{"from":4348.52,"to":4350.89,"location":2,"content":"that have existed for a long time it's"},{"from":4350.89,"to":4353.23,"location":2,"content":"not true of new very technical words you"},{"from":4353.23,"to":4355.3,"location":2,"content":"know carcinoma I think that only has one"},{"from":4355.3,"to":4357.9,"location":2,"content":"meaning but you know if you think"},{"from":4357.9,"to":4363.35,"location":2,"content":"of any relatively common word and starts"},{"from":4363.35,"to":4366.15,"location":2,"content":"scratching your head for a moment you'll"},{"from":4366.15,"to":4369.12,"location":2,"content":"find it has lots of meanings I 
maybe"},{"from":4369.12,"to":4370.8,"location":2,"content":"this isn't even such a common word but"},{"from":4370.8,"to":4372.89,"location":2,"content":"my random word I've got here is pike"},{"from":4372.89,"to":4375.51,"location":2,"content":"pike has lots of meanings it has"},{"from":4375.51,"to":4380.34,"location":2,"content":"meanings like a fish it's a kind of fish"},{"from":4380.34,"to":4381.93,"location":2,"content":"yeah so there's a fish that's a pike"},{"from":4381.93,"to":4386.64,"location":2,"content":"what else is a pike a large spear yes a"},{"from":4386.64,"to":4389.04,"location":2,"content":"large spear as a pike other kinds of"},{"from":4389.04,"to":4393.03,"location":2,"content":"Pike's gymnastics move or in diving move"},{"from":4393.03,"to":4397.2,"location":2,"content":"it's a road yeah so there are lots of"},{"from":4397.2,"to":4399.72,"location":2,"content":"meanings there are other meanings"},{"from":4399.72,"to":4401.88,"location":2,"content":"I mean Australian English Pike has also"},{"from":4401.88,"to":4404.61,"location":2,"content":"used as a verb to mean to pull out of"},{"from":4404.61,"to":4407.58,"location":2,"content":"doing something like we were all going"},{"from":4407.58,"to":4410.85,"location":2,"content":"to go out to a nightclub later but Joe"},{"from":4410.85,"to":4414.48,"location":2,"content":"piked I don't think that usage is common"},{"from":4414.48,"to":4416.34,"location":2,"content":"in this country but um you can try it"},{"from":4416.34,"to":4421.02,"location":2,"content":"out right but lots of meanings and you"},{"from":4421.02,"to":4423.69,"location":2,"content":"know this isn't only true of the word"},{"from":4423.69,"to":4426.63,"location":2,"content":"Pike you might pick any other simple"},{"from":4426.63,"to":4428.46,"location":2,"content":"word right you can pick a word like"},{"from":4428.46,"to":4432.39,"location":2,"content":"shale or field or house or make you know"},{"from":4432.39,"to":4434.25,"location":2,"content":"they have lots of meanings when it comes"},{"from":4434.25,"to":4436.98,"location":2,"content":"down to it so you know but so how can"},{"from":4436.98,"to":4439.35,"location":2,"content":"this work if we just have one meaning"},{"from":4439.35,"to":4442.44,"location":2,"content":"for words and that's an interesting"},{"from":4442.44,"to":4445.5,"location":2,"content":"question and I was something that we"},{"from":4445.5,"to":4448.4,"location":2,"content":"were actually interested in early on so"},{"from":4448.4,"to":4451.74,"location":2,"content":"um even before the word to vector K mout"},{"from":4451.74,"to":4456.03,"location":2,"content":"back in 2012 we were playing around with"},{"from":4456.03,"to":4460.56,"location":2,"content":"neural word vectors and we thought boy"},{"from":4460.56,"to":4464.7,"location":2,"content":"this is so broken having only one sense"},{"from":4464.7,"to":4467.88,"location":2,"content":"for a word why don't we come up with the"},{"from":4467.88,"to":4469.74,"location":2,"content":"model that has multiple sensors for a"},{"from":4469.74,"to":4472.35,"location":2,"content":"word and so we did that and we did it in"},{"from":4472.35,"to":4475.47,"location":2,"content":"a pretty crude way I guess the way we"},{"from":4475.47,"to":4479.28,"location":2,"content":"did it is say well let's for each common"},{"from":4479.28,"to":4482.7,"location":2,"content":"word let's cluster all the context in"},{"from":4482.7,"to":4486.18,"location":2,"content":"which it occurs and then we'll see 
if"},{"from":4486.18,"to":4489.06,"location":2,"content":"there seem to be multiple clear clusters"},{"from":4489.06,"to":4491.52,"location":2,"content":"by some criterion for that"},{"from":4491.52,"to":4494.76,"location":2,"content":"and if so we'll just sort of split the"},{"from":4494.76,"to":4496.89,"location":2,"content":"word into pseudo words so if it seems"},{"from":4496.89,"to":4500.13,"location":2,"content":"like that there are five clusters for"},{"from":4500.13,"to":4502.08,"location":2,"content":"the word the example I'm it to use here"},{"from":4502.08,"to":4504.96,"location":2,"content":"is Jaguar five clusters for the word JQ"},{"from":4504.96,"to":4507.21,"location":2,"content":"Oh we'll just call them Jaguar 1 j qo 2"},{"from":4507.21,"to":4510.39,"location":2,"content":"ju a 3 4 5 so we just literally change"},{"from":4510.39,"to":4512.94,"location":2,"content":"the word in our corpus according to us"},{"from":4512.94,"to":4515.01,"location":2,"content":"cluster number and then we run our word"},{"from":4515.01,"to":4517.02,"location":2,"content":"vectoring algorithm and so we get a"},{"from":4517.02,"to":4519.9,"location":2,"content":"representation for each of those senses"},{"from":4519.9,"to":4520.83,"location":2,"content":"of the word"},{"from":4520.83,"to":4522.96,"location":2,"content":"and basically that works right up the"},{"from":4522.96,"to":4525.6,"location":2,"content":"top is Jaguar 1 next luxury and"},{"from":4525.6,"to":4530.19,"location":2,"content":"convertible here is I guess there's a"},{"from":4530.19,"to":4532.56,"location":2,"content":"very old version and Mac OS called"},{"from":4532.56,"to":4534.12,"location":2,"content":"Jaguar and you remember and remember"},{"from":4534.12,"to":4535.23,"location":2,"content":"that one"},{"from":4535.23,"to":4537.6,"location":2,"content":"right so it's Jaguar is right next to"},{"from":4537.6,"to":4539.55,"location":2,"content":"software and Microsoft up there so"},{"from":4539.55,"to":4542.31,"location":2,"content":"that's hopeful he is the Jaguar that's"},{"from":4542.31,"to":4546.03,"location":2,"content":"right next to the hunter and I'm a bit"},{"from":4546.03,"to":4548.01,"location":2,"content":"confused on this one this Jaguars near"},{"from":4548.01,"to":4550.65,"location":2,"content":"solo musical keyboard and string is"},{"from":4550.65,"to":4552.93,"location":2,"content":"there a band a brand of keyboard called"},{"from":4552.93,"to":4555.03,"location":2,"content":"Jack I'm not quite sure about that one"},{"from":4555.03,"to":4557.78,"location":2,"content":"but anyway it sort of basically works"},{"from":4557.78,"to":4560.82,"location":2,"content":"but that was sort of crude and it's also"},{"from":4560.82,"to":4562.8,"location":2,"content":"perhaps problematic it's a lot of time"},{"from":4562.8,"to":4565.41,"location":2,"content":"the divisions between senses aren't very"},{"from":4565.41,"to":4567.75,"location":2,"content":"clear right a lot of senses are actually"},{"from":4567.75,"to":4569.79,"location":2,"content":"related to each other and overlapping"},{"from":4569.79,"to":4572.01,"location":2,"content":"because when how senses normally arrive"},{"from":4572.01,"to":4574.2,"location":2,"content":"is that people stretch the meanings of"},{"from":4574.2,"to":4575.85,"location":2,"content":"words it's not that they just sort of"},{"from":4575.85,"to":4578.19,"location":2,"content":"randomly wake up the next morning and"},{"from":4578.19,"to":4580.89,"location":2,"content":"say I know carpet I 
could also refer to"},{"from":4580.89,"to":4584.76,"location":2,"content":"that as stone and give a new sense to"},{"from":4584.76,"to":4586.56,"location":2,"content":"the word stone right you sort of take"},{"from":4586.56,"to":4588.96,"location":2,"content":"something that you know about like a web"},{"from":4588.96,"to":4591.18,"location":2,"content":"and you extend it metaphorically to"},{"from":4591.18,"to":4595.17,"location":2,"content":"other uses of webbing so here's a"},{"from":4595.17,"to":4597.3,"location":2,"content":"perhaps more interesting thing so this"},{"from":4597.3,"to":4600.66,"location":2,"content":"is the other Sanjeev Arora paper that I"},{"from":4600.66,"to":4603,"location":2,"content":"was going to mention so that what"},{"from":4603,"to":4607.47,"location":2,"content":"happens if you don't if you don't have"},{"from":4607.47,"to":4610.35,"location":2,"content":"more than one sense for each word well"},{"from":4610.35,"to":4613.23,"location":2,"content":"effectively what you get is that the"},{"from":4613.23,"to":4616.02,"location":2,"content":"word vector that you learn is what's"},{"from":4616.02,"to":4618.63,"location":2,"content":"referred to by physicists and fancy"},{"from":4618.63,"to":4622.41,"location":2,"content":"people as a superposition of the word"},{"from":4622.41,"to":4624.63,"location":2,"content":"vectors of its different senses"},{"from":4624.63,"to":4627.57,"location":2,"content":"and superposition here"},{"from":4627.57,"to":4632.69,"location":2,"content":"just means a weighted average"},{"from":4632.69,"to":4636.12,"location":2,"content":"so that effectively my meaning of pike"},{"from":4636.12,"to":4638.43,"location":2,"content":"is sort of a weighted average of the"},{"from":4638.43,"to":4640.47,"location":2,"content":"vectors for the different senses of pike"},{"from":4640.47,"to":4643.32,"location":2,"content":"and the components are just weighted by"},{"from":4643.32,"to":4646.41,"location":2,"content":"their frequency so that part maybe is"},{"from":4646.41,"to":4648.9,"location":2,"content":"perhaps not too surprising but the part"},{"from":4648.9,"to":4651.96,"location":2,"content":"that's really surprising is well if"},{"from":4651.96,"to":4653.97,"location":2,"content":"we're just averaging these word vectors"},{"from":4653.97,"to":4657.3,"location":2,"content":"you'd think you couldn't get anything out"},{"from":4657.3,"to":4659.43,"location":2,"content":"of the average right like if I tell you"},{"from":4659.43,"to":4661.89,"location":2,"content":"I'm thinking of two numbers and their"},{"from":4661.89,"to":4663.18,"location":2,"content":"you know"},{"from":4663.18,"to":4666.48,"location":2,"content":"weighted sum is 54 what are my two"},{"from":4666.48,"to":4668.52,"location":2,"content":"numbers right you're sort of really"},{"from":4668.52,"to":4670.5,"location":2,"content":"short of information to be able to"},{"from":4670.5,"to":4673.8,"location":2,"content":"answer my question but well you know for"},{"from":4673.8,"to":4678.36,"location":2,"content":"these word vectors we have these high"},{"from":4678.36,"to":4682.41,"location":2,"content":"dimensional spaces and even though there"},{"from":4682.41,"to":4685.92,"location":2,"content":"are a lot of words the space is so vast"},{"from":4685.92,"to":4688.62,"location":2,"content":"for those dimensions that actual"},{"from":4688.62,"to":4692.52,"location":2,"content":"words or senses are very sparse in 
that"},{"from":4692.52,"to":4695.25,"location":2,"content":"space and so it turns out there's this"},{"from":4695.25,"to":4698.46,"location":2,"content":"whole literature on sparse coding"},{"from":4698.46,"to":4701.37,"location":2,"content":"compressed sensing some of which is"},{"from":4701.37,"to":4702.6,"location":2,"content":"actually done by people in the stats"},{"from":4702.6,"to":4706.26,"location":2,"content":"department here which shows that in"},{"from":4706.26,"to":4708.21,"location":2,"content":"these cases where you have these sort of"},{"from":4708.21,"to":4711.27,"location":2,"content":"sparse codes in these high dimensional"},{"from":4711.27,"to":4713.34,"location":2,"content":"spaces you can actually commonly"},{"from":4713.34,"to":4715.65,"location":2,"content":"reconstruct out the components of a"},{"from":4715.65,"to":4717.87,"location":2,"content":"superposition even though all you've"},{"from":4717.87,"to":4719.46,"location":2,"content":"done has sort of done this weighted"},{"from":4719.46,"to":4722.49,"location":2,"content":"average and so this paper looks at how"},{"from":4722.49,"to":4725.55,"location":2,"content":"you can do this and so they have these"},{"from":4725.55,"to":4728.49,"location":2,"content":"underlying meaning components and they"},{"from":4728.49,"to":4731.61,"location":2,"content":"sort of separated out so ty has one"},{"from":4731.61,"to":4733.92,"location":2,"content":"meaning component as in the space of"},{"from":4733.92,"to":4736.44,"location":2,"content":"trousers blouse waist code that makes"},{"from":4736.44,"to":4738.48,"location":2,"content":"sense another one in this meaning"},{"from":4738.48,"to":4740.85,"location":2,"content":"component of season teams winning league"},{"from":4740.85,"to":4744.06,"location":2,"content":"makes sense score line goal has"},{"from":4744.06,"to":4746.37,"location":2,"content":"equalizer clinching schoolís this one"},{"from":4746.37,"to":4748.95,"location":2,"content":"seems to overlap with this one a bit but"},{"from":4748.95,"to":4750.06,"location":2,"content":"here ty"},{"from":4750.06,"to":4752.25,"location":2,"content":"this sort of cable ties and wire ties"},{"from":4752.25,"to":4754.26,"location":2,"content":"and things like that so they're actually"},{"from":4754.26,"to":4756.18,"location":2,"content":"able to pull out the different sense"},{"from":4756.18,"to":4757.54,"location":2,"content":"meanings"},{"from":4757.54,"to":4760.09,"location":2,"content":"from outside out of the meaning of the"},{"from":4760.09,"to":4764.02,"location":2,"content":"word so that is a kind of a cool thing I"},{"from":4764.02,"to":4769.39,"location":2,"content":"just want to say one more thing okay all"},{"from":4769.39,"to":4772.72,"location":2,"content":"the evaluations so far was intrinsic you"},{"from":4772.72,"to":4774.82,"location":2,"content":"also might want to do extrinsic"},{"from":4774.82,"to":4778.03,"location":2,"content":"evaluation why why word vectors excited"},{"from":4778.03,"to":4780.79,"location":2,"content":"people and NLP so much is it turned out"},{"from":4780.79,"to":4782.95,"location":2,"content":"that having this meaning having this"},{"from":4782.95,"to":4785.23,"location":2,"content":"representation of meaning just turned"},{"from":4785.23,"to":4787.48,"location":2,"content":"out to be very useful and sort of"},{"from":4787.48,"to":4790.48,"location":2,"content":"improve all of your tasks after that and"},{"from":4790.48,"to":4793.87,"location":2,"content":"so this is doing named 
entity"},{"from":4793.87,"to":4795.94,"location":2,"content":"recognition which is labeling persons"},{"from":4795.94,"to":4798.79,"location":2,"content":"and locations and organizations but you"},{"from":4798.79,"to":4800.68,"location":2,"content":"know it's typical of many tasks of what"},{"from":4800.68,"to":4802.93,"location":2,"content":"people found was if you started with a"},{"from":4802.93,"to":4805.06,"location":2,"content":"model without sort of word"},{"from":4805.06,"to":4807.52,"location":2,"content":"representations and you throw in your"},{"from":4807.52,"to":4810.13,"location":2,"content":"word vectors regardless of whether their"},{"from":4810.13,"to":4812.53,"location":2,"content":"word to vehicle glove ones just kind of"},{"from":4812.53,"to":4814.69,"location":2,"content":"your numbers go up a couple of percent"},{"from":4814.69,"to":4817.03,"location":2,"content":"or more and so the word vectors were"},{"from":4817.03,"to":4819.43,"location":2,"content":"just sort of this useful source that you"},{"from":4819.43,"to":4821.89,"location":2,"content":"could throw into any NLP system that you"},{"from":4821.89,"to":4824.53,"location":2,"content":"built and your numbers went up so that"},{"from":4824.53,"to":4825.73,"location":2,"content":"there are just a very effective"},{"from":4825.73,"to":4829,"location":2,"content":"technology which actually did work and"},{"from":4829,"to":4831.16,"location":2,"content":"basically any extrinsic tasks you type"},{"from":4831.16,"to":4835.29,"location":2,"content":"tried it on okay thanks a lot"}]} \ No newline at end of file diff --git a/bcc-en/20.bcc b/bcc-en/20.bcc new file mode 100644 index 0000000000000000000000000000000000000000..e8b72045f3b0236b7967f6865dec6aefb95d54b5 --- /dev/null +++ b/bcc-en/20.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.76,"to":9.57,"location":2,"content":"Let's get started. So welcome to the very final lecture of the class."},{"from":9.57,"to":11.47,"location":2,"content":"I hope you're all surviving the last week and,"},{"from":11.47,"to":13.83,"location":2,"content":"uh, wrapping up your projects."},{"from":13.83,"to":18.54,"location":2,"content":"So today we're going to be hearing about the future of NLP and deep learning."},{"from":18.54,"to":22.44,"location":2,"content":"Uh, so Chris is still traveling and today we're going to be having Kevin Clark,"},{"from":22.44,"to":24.82,"location":2,"content":"who's one of the PhD students in the lab, uh,"},{"from":24.82,"to":26.58,"location":2,"content":"in the NLP lab,"},{"from":26.58,"to":29.61,"location":2,"content":"and he was also one of the head TAs for the class last year."},{"from":29.61,"to":31.79,"location":2,"content":"So he's very familiar with the class as a whole."},{"from":31.79,"to":33.77,"location":2,"content":"Um, so, take it away Kevin."},{"from":33.77,"to":37.83,"location":2,"content":"Okay. Thanks, Abby. 
Um, yeah,"},{"from":37.83,"to":40.44,"location":2,"content":"it's great to be back after being a TA last year."},{"from":40.44,"to":45.35,"location":2,"content":"Um, I'm really excited today to be talking about the future of deep learning and NLP."},{"from":45.35,"to":49.09,"location":2,"content":"Um, obviously, trying to forecast the future, um,"},{"from":49.09,"to":51.8,"location":2,"content":"for deep learning or anything in that space is really"},{"from":51.8,"to":54.8,"location":2,"content":"difficult because the field is changing super quickly."},{"from":54.8,"to":57.08,"location":2,"content":"Um, so as one reference point, um,"},{"from":57.08,"to":60.05,"location":2,"content":"let's look at what did deep learning for NLP,"},{"from":60.05,"to":62.29,"location":2,"content":"um, look like about five years ago."},{"from":62.29,"to":68.3,"location":2,"content":"And really, a lot of ideas that are now considered to be pretty core techniques,"},{"from":68.3,"to":70.44,"location":2,"content":"um, when we think of deep learning and NLP,"},{"from":70.44,"to":72.17,"location":2,"content":"um, didn't even exist back then."},{"from":72.17,"to":74.87,"location":2,"content":"Um, so things you learned in this class like Seq2Seq,"},{"from":74.87,"to":77.18,"location":2,"content":"attention mechanism, um, large-scale,"},{"from":77.18,"to":80.11,"location":2,"content":"reading comprehension, uh, even frameworks"},{"from":80.11,"to":83.3,"location":2,"content":"such as TensorFlow or Pytorch, um, didn't exist."},{"from":83.3,"to":87.14,"location":2,"content":"And, uh, the point I want to make with this is that, um,"},{"from":87.14,"to":91.2,"location":2,"content":"because of this it's really difficult to, to look into the future and say,"},{"from":91.2,"to":93.67,"location":2,"content":"okay, what are things going to be like?"},{"from":93.67,"to":98.06,"location":2,"content":"Um, what I think we can do though is look at, um,"},{"from":98.06,"to":101.87,"location":2,"content":"areas that right now are really sort of taking off, um,"},{"from":101.87,"to":103.64,"location":2,"content":"so areas in which, um,"},{"from":103.64,"to":106.37,"location":2,"content":"there's a lot, been a lot of recent success and kind of, uh,"},{"from":106.37,"to":108.1,"location":2,"content":"project from that, that,"},{"from":108.1,"to":110.9,"location":2,"content":"those same areas will likely be important in the future."},{"from":110.9,"to":115.82,"location":2,"content":"Um, and in this talk I'm going to be mostly focusing on one key idea of"},{"from":115.82,"to":118.97,"location":2,"content":"wh- key idea which is the idea of leveraging"},{"from":118.97,"to":122.92,"location":2,"content":"unlabeled examples when training our NLP systems."},{"from":122.92,"to":127.49,"location":2,"content":"So I'll be talking a bit about doing that for machine translation, um,"},{"from":127.49,"to":130.82,"location":2,"content":"both in improving the quality of translation and even"},{"from":130.82,"to":134.31,"location":2,"content":"in doing a translation in an unsupervised way."},{"from":134.31,"to":136.17,"location":2,"content":"So that means you don't have, um,"},{"from":136.17,"to":139.23,"location":2,"content":"paired sentences, uh, with, with their translations."},{"from":139.23,"to":143.37,"location":2,"content":"Um, you try to learn a translation model only from a monolingual corpus."},{"from":143.37,"to":147.12,"location":2,"content":"Um, the second thing I'll be talking a little bit about is, 
uh,"},{"from":147.12,"to":149.33,"location":2,"content":"OpenAI's GPT-2, um,"},{"from":149.33,"to":152.44,"location":2,"content":"and in general this phenomenon of really scaling up,"},{"from":152.44,"to":154.04,"location":2,"content":"um, deep learning models."},{"from":154.04,"to":158.33,"location":2,"content":"Um, I know you saw a little bit of this in the lecture on contextual representations,"},{"from":158.33,"to":160.34,"location":2,"content":"but this, but this will be a little bit more in depth."},{"from":160.34,"to":162.34,"location":2,"content":"Um, and I think, um,"},{"from":162.34,"to":166.66,"location":2,"content":"these new developments in NLP have had some,"},{"from":166.66,"to":168.6,"location":2,"content":"um, pretty big, uh,"},{"from":168.6,"to":170.59,"location":2,"content":"impacts in terms of,"},{"from":170.59,"to":173.75,"location":2,"content":"uh, more broadly kind of beyond even the technology we're using,"},{"from":173.75,"to":175.07,"location":2,"content":"and in particular, I mean,"},{"from":175.07,"to":180.56,"location":2,"content":"starting to raise more and more concerns about the social impact of NLP, um,"},{"from":180.56,"to":183.52,"location":2,"content":"both, um, in what our models can do and also in kind"},{"from":183.52,"to":186.59,"location":2,"content":"of plans of what, where people are looking to apply these models, um,"},{"from":186.59,"to":189.75,"location":2,"content":"and I think that really has some risks associated with it, um,"},{"from":189.75,"to":193.16,"location":2,"content":"in terms of security also in terms of areas like bias."},{"from":193.16,"to":196.47,"location":2,"content":"Um, I'm also gonna talk a bit about future areas of research,"},{"from":196.47,"to":199.14,"location":2,"content":"um, these are mostly research areas now that are, um,"},{"from":199.14,"to":202.06,"location":2,"content":"over the past year have really kind of developed into"},{"from":202.06,"to":207.19,"location":2,"content":"promising areas and I expect they will continue to be important in the future."},{"from":207.19,"to":209.7,"location":2,"content":"Okay, um, to start with,"},{"from":209.7,"to":213.31,"location":2,"content":"I wanna ask this question, why has deep learning been so successful recently?"},{"from":213.31,"to":215.51,"location":2,"content":"Um, I like this comic, um,"},{"from":215.51,"to":218.04,"location":2,"content":"here there's a statistical learning person,"},{"from":218.04,"to":221.03,"location":2,"content":"um, and they've got some really complicated,"},{"from":221.03,"to":224.01,"location":2,"content":"um, well-motivated, uh, method for doing, um,"},{"from":224.01,"to":225.51,"location":2,"content":"the task they care about,"},{"from":225.51,"to":227.46,"location":2,"content":"and then the neural net person just says,"},{"from":227.46,"to":229.11,"location":2,"content":"er, stack more layers."},{"from":229.11,"to":232.02,"location":2,"content":"Um, so, so the point I want to make here is that, um,"},{"from":232.02,"to":236.07,"location":2,"content":"deep learning has not been successful recently because it's more"},{"from":236.07,"to":241.53,"location":2,"content":"theoretically motivated or it's more sophisticated than previous techniques, um."},{"from":241.53,"to":244.25,"location":2,"content":"In fact I would say that actually a lot of, um,"},{"from":244.25,"to":246.63,"location":2,"content":"older statistical methods have more of"},{"from":246.63,"to":250.18,"location":2,"content":"a theoretical underpinning than some of the tricks 
we do in deep learning."},{"from":250.18,"to":254.05,"location":2,"content":"Um, really the thing that makes deep learning so"},{"from":254.05,"to":257.66,"location":2,"content":"successful in recent years has been its ability to scale, right."},{"from":257.66,"to":262.31,"location":2,"content":"So neural nets, as we increase the size of the data,"},{"from":262.31,"to":264.26,"location":2,"content":"as we increase the size of the models, um,"},{"from":264.26,"to":266.17,"location":2,"content":"they get a really big boost in accuracy,"},{"from":266.17,"to":268.56,"location":2,"content":"in ways other approaches do not."},{"from":268.56,"to":272.21,"location":2,"content":"And, um, if you look to the '80s and '90s, um,"},{"from":272.21,"to":276.19,"location":2,"content":"there was actually plenty of research in neural nets going on, um."},{"from":276.19,"to":279.13,"location":2,"content":"But it hadn't, doesn't have a hype around it that it does"},{"from":279.13,"to":282.2,"location":2,"content":"now and that seems likely to be because,"},{"from":282.2,"to":285.02,"location":2,"content":"um, in the past there wasn't, um,"},{"from":285.02,"to":287.18,"location":2,"content":"the same resources in terms of computers,"},{"from":287.18,"to":289.24,"location":2,"content":"in terms of data and, um,"},{"from":289.24,"to":293.12,"location":2,"content":"only now after we've reached sort of an inflection point where we can"},{"from":293.12,"to":295.22,"location":2,"content":"really take advantage of scale in"},{"from":295.22,"to":297.96,"location":2,"content":"our deep learning models and we started to see it become,"},{"from":297.96,"to":301.52,"location":2,"content":"um, a really successful paradigm for machine learning."},{"from":301.52,"to":304.08,"location":2,"content":"Um, if we look at big, uh,"},{"from":304.08,"to":306.07,"location":2,"content":"deep learning success stories, um,"},{"from":306.07,"to":310.2,"location":2,"content":"I think, uh, you can see kind of this idea play out, right?"},{"from":310.2,"to":316.49,"location":2,"content":"So here are three of what are arguably the most famous successes of deep learning, right."},{"from":316.49,"to":318.62,"location":2,"content":"So there's image recognition, where before,"},{"from":318.62,"to":320.87,"location":2,"content":"people used very highly engineered, um,"},{"from":320.87,"to":325.87,"location":2,"content":"features to classify images and now neural nets are much superior, um, to those methods."},{"from":325.87,"to":329.79,"location":2,"content":"Um, machine translation has really closed the gap between, um,"},{"from":329.79,"to":333.02,"location":2,"content":"phrase-based systems and human quality translation,"},{"from":333.02,"to":335.73,"location":2,"content":"so this is widely used in things like Google Translate"},{"from":335.73,"to":339.12,"location":2,"content":"and the quality has actually gotten a lot better over the past five years."},{"from":339.12,"to":343.55,"location":2,"content":"Um, another example that had a lot of hype around it is game-playing, so, um,"},{"from":343.55,"to":346.46,"location":2,"content":"there's been work on Atari games, there's been AlphaGo,"},{"from":346.46,"to":350.39,"location":2,"content":"uh, more recently there's been AlphaStar and OpenAI Five."},{"from":350.39,"to":353.6,"location":2,"content":"Um, if you look at all three of these cases underlying"},{"from":353.6,"to":357.2,"location":2,"content":"these successes is really large amounts of data, 
right."},{"from":357.2,"to":358.55,"location":2,"content":"So for ImageNet, um,"},{"from":358.55,"to":360.02,"location":2,"content":"for image recognition, um,"},{"from":360.02,"to":363.04,"location":2,"content":"there is the ImageNet dataset which has 14 million images,"},{"from":363.04,"to":366.32,"location":2,"content":"uh, machine translation datasets often have millions of examples."},{"from":366.32,"to":369.27,"location":2,"content":"Um, for game playing you can actually"},{"from":369.27,"to":372.47,"location":2,"content":"generate as much training data as you want essentially,"},{"from":372.47,"to":374.69,"location":2,"content":"um, just by running your agent,"},{"from":374.69,"to":376.04,"location":2,"content":"um, within the game,"},{"from":376.04,"to":379.12,"location":2,"content":"um, over and over again."},{"from":379.12,"to":381.36,"location":2,"content":"Um, so if we,"},{"from":381.36,"to":383.59,"location":2,"content":"if we look to NLP, um,"},{"from":383.59,"to":387.74,"location":2,"content":"the story is quite a bit different for a lot of tasks, um, right."},{"from":387.74,"to":392.03,"location":2,"content":"So if you look at even pretty core kind of popular tasks,"},{"from":392.03,"to":395.06,"location":2,"content":"to say, reading comprehension in English, um,"},{"from":395.06,"to":399.71,"location":2,"content":"datasets like SQuAD are in the order of like 100,000 examples"},{"from":399.71,"to":404.81,"location":2,"content":"which is considerably less than the millions or tens of millions of examples,"},{"from":404.81,"to":407.11,"location":2,"content":"um, that these previous,"},{"from":407.11,"to":410.29,"location":2,"content":"um, successes have, have benefited from."},{"from":410.29,"to":414.21,"location":2,"content":"Um, and that's of course only for English, right."},{"from":414.21,"to":415.77,"location":2,"content":"Um, there are, um,"},{"from":415.77,"to":419.57,"location":2,"content":"thousands of other languages and this is I think"},{"from":419.57,"to":423.77,"location":2,"content":"a problem with NLP data as it exists today."},{"from":423.77,"to":426.45,"location":2,"content":"Um, the vast majority of data is in English, um,"},{"from":426.45,"to":430.07,"location":2,"content":"when in reality fewer than 10% of the world's population,"},{"from":430.07,"to":432.19,"location":2,"content":"um, speak English as their first language."},{"from":432.19,"to":437.56,"location":2,"content":"Um, so these problems with small datasets are only compounded if you look at,"},{"from":437.56,"to":441.46,"location":2,"content":"um, the full spectrum of languages, um, that exist."},{"from":441.46,"to":443.95,"location":2,"content":"Um, so, as what do we do,"},{"from":443.95,"to":445.8,"location":2,"content":"uh, when we're limited by this data,"},{"from":445.8,"to":450.56,"location":2,"content":"but we want to take advantage of deep learning scale and train the biggest models we can."},{"from":450.56,"to":452.51,"location":2,"content":"Um, the popular solution, um,"},{"from":452.51,"to":456.23,"location":2,"content":"that's especially had recent success is using unlabeled data, um,"},{"from":456.23,"to":457.81,"location":2,"content":"because unlike labeled data,"},{"from":457.81,"to":460.84,"location":2,"content":"unlabeled data is very easy to acquire for language."},{"from":460.84,"to":462.12,"location":2,"content":"Um, you can just go to the Internet,"},{"from":462.12,"to":464.69,"location":2,"content":"you can go to books, you can get lots of text, 
um,"},{"from":464.69,"to":469.37,"location":2,"content":"whereas labeled data usually requires at the least crowdsourcing examples."},{"from":469.37,"to":474.73,"location":2,"content":"Um, in some cases you even require someone who's an expert in something like linguistics,"},{"from":474.73,"to":479.51,"location":2,"content":"um, to, to annotate that data."},{"from":479.51,"to":483.89,"location":2,"content":"Okay, so, um, this first part of the talk is going to be applying"},{"from":483.89,"to":488.19,"location":2,"content":"this idea of leveraging unlabeled data to improve our NLP models,"},{"from":488.19,"to":491.99,"location":2,"content":"um, to the task of machine translation."},{"from":491.99,"to":495.17,"location":2,"content":"Um, so let's talk about machine translation data."},{"from":495.17,"to":500.52,"location":2,"content":"Um, it is true that there do exist quite large datasets for machine translation."},{"from":500.52,"to":503.17,"location":2,"content":"Um, those datasets don't exist because"},{"from":503.17,"to":506.87,"location":2,"content":"NLP researchers have annotated texts for the purpose of training their models, right."},{"from":506.87,"to":509.75,"location":2,"content":"They exist because, er, in various settings,"},{"from":509.75,"to":513.2,"location":2,"content":"translation is done just because it's useful, so for example,"},{"from":513.2,"to":515.07,"location":2,"content":"proceedings of the European Parliament,"},{"from":515.07,"to":517.02,"location":2,"content":"um, proceedings of the United Nations,"},{"from":517.02,"to":521.32,"location":2,"content":"um, some, uh, news sites, they translate their articles into many languages."},{"from":521.32,"to":526.61,"location":2,"content":"Um, so really, the machine translation data we use to train our models are often"},{"from":526.61,"to":532.75,"location":2,"content":"more of byproducts of existing cases where translation is wanted rather than,"},{"from":532.75,"to":537.5,"location":2,"content":"um, kind of a full sampling of the sort of text we see in the world."},{"from":537.5,"to":538.91,"location":2,"content":"Um, so that means number one,"},{"from":538.91,"to":540.68,"location":2,"content":"it's quite limited in domain, right."},{"from":540.68,"to":543.58,"location":2,"content":"So it's not easy to find translated tweets,"},{"from":543.58,"to":545.41,"location":2,"content":"um, unless you happen to work for Twitter."},{"from":545.41,"to":548.14,"location":2,"content":"Um, in addition to that, um,"},{"from":548.14,"to":552.23,"location":2,"content":"there's limitations in terms of the languages that are covered, right."},{"from":552.23,"to":554.75,"location":2,"content":"So some languages, say European languages,"},{"from":554.75,"to":556.5,"location":2,"content":"there's a lot of translation data, um,"},{"from":556.5,"to":559.18,"location":2,"content":"for other languages there's much less."},{"from":559.18,"to":562.04,"location":2,"content":"Um, so in these settings where we want to work on"},{"from":562.04,"to":565.22,"location":2,"content":"a different domain or where we want to work with a low resource language,"},{"from":565.22,"to":568,"location":2,"content":"um, we're limited by labeled data, um,"},{"from":568,"to":570.99,"location":2,"content":"but what we can do is pretty easily find unlabeled data."},{"from":570.99,"to":573.62,"location":2,"content":"Um, so it's actually a pretty solved problem, um,"},{"from":573.62,"to":577.01,"location":2,"content":"maybe not 100%, but we can with good accuracy look 
at"},{"from":577.01,"to":581.2,"location":2,"content":"some text and decide what language it's in and train a classifier to do that."},{"from":581.2,"to":583.61,"location":2,"content":"Um, so this means it's really easy to find"},{"from":583.61,"to":586.1,"location":2,"content":"data in any language you care about because you can just go on"},{"from":586.1,"to":588.44,"location":2,"content":"the web and essentially search for data in"},{"from":588.44,"to":595.24,"location":2,"content":"that language and acquire a large corpus of monolingual data."},{"from":595.24,"to":600.77,"location":2,"content":"Okay, um, I'm now going into the first approach,"},{"from":600.77,"to":603.1,"location":2,"content":"um, I'm going to talk about on using"},{"from":603.1,"to":606.37,"location":2,"content":"unlabeled data to improve machine translation models."},{"from":606.37,"to":609.41,"location":2,"content":"Um, this technique is called pre-training and it's"},{"from":609.41,"to":612.79,"location":2,"content":"really reminiscent of ideas like, um, ELMo."},{"from":612.79,"to":616.58,"location":2,"content":"Um, the idea is to pre-train by doing language modeling."},{"from":616.58,"to":618.35,"location":2,"content":"So if we have, um,"},{"from":618.35,"to":621.35,"location":2,"content":"two languages we're interested in translating,"},{"from":621.35,"to":622.53,"location":2,"content":"um, from one end to the other,"},{"from":622.53,"to":627.48,"location":2,"content":"we'll collect large datasets for both of those languages and then we can train,"},{"from":627.48,"to":629.04,"location":2,"content":"uh, two language models,"},{"from":629.04,"to":633.37,"location":2,"content":"one each on that data and then, um,"},{"from":633.37,"to":634.49,"location":2,"content":"we can use those, uh,"},{"from":634.49,"to":638.45,"location":2,"content":"pre-trained language models as initialization for a machine translation system."},{"from":638.45,"to":641.72,"location":2,"content":"Um, so the encoder will get initialized with"},{"from":641.72,"to":645.49,"location":2,"content":"the weights of the language model trained on the source side language, um,"},{"from":645.49,"to":649.83,"location":2,"content":"the decoder will get initialized with weights trained on the target size language, uh,"},{"from":649.83,"to":651.23,"location":2,"content":"and this will, um,"},{"from":651.23,"to":655.49,"location":2,"content":"improve the performance of your model because during this pre-training, um,"},{"from":655.49,"to":659.75,"location":2,"content":"we hope that our language models will be learning useful information such as, you know,"},{"from":659.75,"to":662.46,"location":2,"content":"the meaning of words or, um, uh,"},{"from":662.46,"to":665.25,"location":2,"content":"the kind of structure of the language, um,"},{"from":665.25,"to":669.02,"location":2,"content":"they are processing, um, and this can, uh,"},{"from":669.02,"to":672.41,"location":2,"content":"down the line help the machine translation model,"},{"from":672.41,"to":675.02,"location":2,"content":"um, when we fine tune it."},{"from":675.02,"to":677.46,"location":2,"content":"Um, let me pause here and ask if there are any questions,"},{"from":677.46,"to":678.62,"location":2,"content":"and just in general, feel,"},{"from":678.62,"to":685.92,"location":2,"content":"feel free to ask questions throughout this talk. 
Okay."},{"from":685.92,"to":693.38,"location":2,"content":"So, so here is a plot showing some results of this pre-training technique."},{"from":693.38,"to":696.04,"location":2,"content":"Um, so this is English to German translation."},{"from":696.04,"to":699.8,"location":2,"content":"Uh, the x-axis is how much training data,"},{"from":699.8,"to":701.92,"location":2,"content":"as in unsupervised training data, um,"},{"from":701.92,"to":703.08,"location":2,"content":"you provide these models,"},{"from":703.08,"to":705.36,"location":2,"content":"but of course they also have large amounts"},{"from":705.36,"to":708.94,"location":2,"content":"of monolingual data for this pre-training step."},{"from":708.94,"to":711.97,"location":2,"content":"And you can see that this works pretty well, right?"},{"from":711.97,"to":714.45,"location":2,"content":"So you've got about two blue points, um,"},{"from":714.45,"to":717.67,"location":2,"content":"increase in performance, so that's this red line above the blue line,"},{"from":717.67,"to":720.17,"location":2,"content":"um, when doing this pre-training technique."},{"from":720.17,"to":721.69,"location":2,"content":"And not too surprisingly,"},{"from":721.69,"to":730.35,"location":2,"content":"this gain is especially large when the amount of labeled data is small."},{"from":730.35,"to":734.08,"location":2,"content":"Um, there is a problem with,"},{"from":734.08,"to":737.26,"location":2,"content":"uh, pre-training which I want to address, which is that, uh,"},{"from":737.26,"to":738.85,"location":2,"content":"in pre-training, you have"},{"from":738.85,"to":740.89,"location":2,"content":"these two separate language models and there's never"},{"from":740.89,"to":743.03,"location":2,"content":"really any interaction between the two,"},{"from":743.03,"to":745.78,"location":2,"content":"um, when you're running them on the unlabeled corpus."},{"from":745.78,"to":748.43,"location":2,"content":"Um, so here's a simple technique, um,"},{"from":748.43,"to":752.49,"location":2,"content":"that tries to solve this problem and it's called self-training."},{"from":752.49,"to":757.09,"location":2,"content":"Um, the idea is given a sentence from our monolingual corpus,"},{"from":757.09,"to":760.21,"location":2,"content":"so in this case, \"I traveled to Belgium,\" that's an English sentence."},{"from":760.21,"to":765.4,"location":2,"content":"Um, we won't have a human provided translation for this sentence, uh,"},{"from":765.4,"to":768.92,"location":2,"content":"but what we can do is we can run our machine translation model,"},{"from":768.92,"to":772.75,"location":2,"content":"and we'll get a translation in the target language."},{"from":772.75,"to":776.32,"location":2,"content":"Um, since this is from a machine learning model it won't be perfect, uh,"},{"from":776.32,"to":780.16,"location":2,"content":"but we can hope that maybe our model can still learn from this kind"},{"from":780.16,"to":783.58,"location":2,"content":"of noisy labeled example, right?"},{"from":783.58,"to":785.27,"location":2,"content":"So we, we treat, um,"},{"from":785.27,"to":788.23,"location":2,"content":"our original monolingual sentence and it's machine-provided"},{"from":788.23,"to":792.49,"location":2,"content":"translation as though it were a human-provided translation and,"},{"from":792.49,"to":799.8,"location":2,"content":"uh, train our machine learning model as normal on this example."},{"from":799.8,"to":804.19,"location":2,"content":"Um, I think this seems pretty strange actually as- 
as"},{"from":804.19,"to":807.97,"location":2,"content":"a method when you first see it because it seems really circular, right?"},{"from":807.97,"to":811.32,"location":2,"content":"So if you look at this, um, the, uh,"},{"from":811.32,"to":813.85,"location":2,"content":"translation that the model is being trained to"},{"from":813.85,"to":818.1,"location":2,"content":"produce is actually exactly what it already produces to begin with,"},{"from":818.1,"to":823.42,"location":2,"content":"right, because, um, this translation came from our model in the first place."},{"from":823.42,"to":825.7,"location":2,"content":"Um, so actually in practice,"},{"from":825.7,"to":829.48,"location":2,"content":"this is not a technique that's very widely used due to this problem,"},{"from":829.48,"to":833.37,"location":2,"content":"um, but it motivates another technique called back-translation."},{"from":833.37,"to":836.74,"location":2,"content":"And this technique is really a very popular, um,"},{"from":836.74,"to":839.95,"location":2,"content":"solution to that problem, and it's the method, um,"},{"from":839.95,"to":844.24,"location":2,"content":"that has had a lot of success in using unlabeled data for translation."},{"from":844.24,"to":846.94,"location":2,"content":"So here's the approach rather than only"},{"from":846.94,"to":850.86,"location":2,"content":"having our translation system that goes from source language to target language,"},{"from":850.86,"to":853.21,"location":2,"content":"um, we're also going to train a model that"},{"from":853.21,"to":856.38,"location":2,"content":"goes from our target language to our source language."},{"from":856.38,"to":858.67,"location":2,"content":"And so in this case, if,"},{"from":858.67,"to":861.34,"location":2,"content":"if at the end of the day we want a French to English model, um,"},{"from":861.34,"to":864.91,"location":2,"content":"we're gonna start by actually training an English to French model."},{"from":864.91,"to":867.88,"location":2,"content":"And then we can do something that's a lot like self-labeling."},{"from":867.88,"to":870.21,"location":2,"content":"So we take a English sentence."},{"from":870.21,"to":873.37,"location":2,"content":"We run our English to French model and translate it."},{"from":873.37,"to":875.95,"location":2,"content":"The difference to what we did before is that"},{"from":875.95,"to":878.5,"location":2,"content":"we're actually going to switch the source and target side."},{"from":878.5,"to":882.64,"location":2,"content":"So now in this case the French sentence is the source sequence."},{"from":882.64,"to":885.99,"location":2,"content":"Uh, the target sequence is, um,"},{"from":885.99,"to":890.74,"location":2,"content":"our original English sentence that came from monolingual corpora."},{"from":890.74,"to":892.16,"location":2,"content":"And now we're training the language, uh,"},{"from":892.16,"to":894.04,"location":2,"content":"the machine translation system that goes"},{"from":894.04,"to":897.26,"location":2,"content":"the other direction so that goes French to English."},{"from":897.26,"to":900.45,"location":2,"content":"Um, so, so why do we think this will work better?"},{"from":900.45,"to":902.32,"location":2,"content":"Um, number one, um,"},{"from":902.32,"to":905.23,"location":2,"content":"there's no longer this kind of circularity to the training"},{"from":905.23,"to":910.21,"location":2,"content":"because what the model is being trained on is the output of a completely different 
model."},{"from":910.21,"to":914.85,"location":2,"content":"Um, another thing that I think is pretty crucial here is that,"},{"from":914.85,"to":918.97,"location":2,"content":"um, the translations, the model is trained to produce."},{"from":918.97,"to":921.52,"location":2,"content":"So the things that the decoder is actually learning to"},{"from":921.52,"to":924.43,"location":2,"content":"generate are never bad translations, right?"},{"from":924.43,"to":926.57,"location":2,"content":"So if you look at this example,"},{"from":926.57,"to":929.54,"location":2,"content":"the target sequence for our French to English model,"},{"from":929.54,"to":931.16,"location":2,"content":"I traveled to Belgium, um,"},{"from":931.16,"to":934.64,"location":2,"content":"that originally came from a monolingual corpus."},{"from":934.64,"to":937.42,"location":2,"content":"Um, so I think intuitively this makes sense is"},{"from":937.42,"to":940.43,"location":2,"content":"that if we want to train a good translation model,"},{"from":940.43,"to":944.62,"location":2,"content":"um, it's probably okay to expose it to noisy inputs."},{"from":944.62,"to":947.51,"location":2,"content":"So we expose it to the output of a system that's English to French,"},{"from":947.51,"to":948.73,"location":2,"content":"it might not be perfect."},{"from":948.73,"to":952.33,"location":2,"content":"Um, but what we don't want to do is um, expose it to"},{"from":952.33,"to":954.85,"location":2,"content":"poor target sequences because then it"},{"from":954.85,"to":958.56,"location":2,"content":"won't learn how to generate in that language effectively."},{"from":958.56,"to":964.3,"location":2,"content":"Any questions on back-translation before I get to results? Um, sure."},{"from":964.3,"to":968.98,"location":2,"content":"[BACKGROUND]"},{"from":968.98,"to":971.5,"location":2,"content":"So this is assuming we have a large corpus of"},{"from":971.5,"to":977.33,"location":2,"content":"unlabeled data and we want to be using it to help our translation model."},{"from":977.33,"to":979.88,"location":2,"content":"Does that, does that make sense?"},{"from":979.88,"to":983.34,"location":2,"content":"Um, maybe you could clarify the question."},{"from":983.34,"to":989.16,"location":2,"content":"[BACKGROUND]"},{"from":989.16,"to":992.83,"location":2,"content":"Yeah, that's right. So we have a big corpus of English which includes the sentence,"},{"from":992.83,"to":996.19,"location":2,"content":"\"I traveled to Belgium,\" and we don't know the translations but we'd still like to"},{"from":996.19,"to":999.63,"location":2,"content":"use this data. 
Yeah, another question."},{"from":999.63,"to":1005.28,"location":2,"content":"[BACKGROUND]"},{"from":1005.28,"to":1007.11,"location":2,"content":"Yeah, so that's a good question is how do you"},{"from":1007.11,"to":1012.3,"location":2,"content":"avoid both the models let's say sort of blowing up and producing garbage?"},{"from":1012.3,"to":1014.4,"location":2,"content":"And then they're just feeding garbage to each other."},{"from":1014.4,"to":1017.82,"location":2,"content":"The answer is that there is some amount of labeled data here as well."},{"from":1017.82,"to":1020.82,"location":2,"content":"So on unlabeled data you do this, but on labeled data,"},{"from":1020.82,"to":1022.11,"location":2,"content":"you do standard training,"},{"from":1022.11,"to":1024.8,"location":2,"content":"and that way you avoid, you,"},{"from":1024.8,"to":1027.9,"location":2,"content":"you make sure you kind of keep the models on track because they still have to fit to"},{"from":1027.9,"to":1032.17,"location":2,"content":"the labeled data. Yeah, another question."},{"from":1032.17,"to":1035.47,"location":2,"content":"How do you schedule the training of the two models?"},{"from":1035.47,"to":1037.5,"location":2,"content":"Yeah, that is a good question."},{"from":1037.5,"to":1041.58,"location":2,"content":"And I think that's basically almost like a hyper-parameter you can tweak."},{"from":1041.58,"to":1045.72,"location":2,"content":"So I think a pretty common thing to do is first,"},{"from":1045.72,"to":1048.27,"location":2,"content":"train two models only on labeled data."},{"from":1048.27,"to":1052.96,"location":2,"content":"Then label, um, so then do back-translation"},{"from":1052.96,"to":1057.48,"location":2,"content":"over a large corpus and kind of repeat that process over and over again."},{"from":1057.48,"to":1060.16,"location":2,"content":"So each iteration, you train on the label data,"},{"from":1060.16,"to":1063.51,"location":2,"content":"label some unlabeled data and now you have more data to work with."},{"from":1063.51,"to":1066.27,"location":2,"content":"But I think there'd be many kinds of scheduling that would be effective"},{"from":1066.27,"to":1070.38,"location":2,"content":"here. Okay. 
Another question."},{"from":1070.38,"to":1086.1,"location":2,"content":"I'm curious as to the evaluation, considering if you have a very good French to English model, you could try to look up the original source and see if it matches."},{"from":1086.1,"to":1087.43,"location":2,"content":"Yeah, I'm not, I'm not quite sure."},{"from":1087.43,"to":1090.13,"location":2,"content":"Are you suggesting going like English to French to English and seeing if?"},{"from":1090.13,"to":1091.63,"location":2,"content":"I see, yeah, yeah,"},{"from":1091.63,"to":1092.78,"location":2,"content":"that's a really interesting idea."},{"from":1092.78,"to":1095.78,"location":2,"content":"And we're actually going to talk a little bit about this sort of,"},{"from":1095.78,"to":1097.29,"location":2,"content":"it's called cycle consistency,"},{"from":1097.29,"to":1100.97,"location":2,"content":"this idea later in this talk."},{"from":1100.97,"to":1103.77,"location":2,"content":"Okay, I'm going to move on to the results."},{"from":1103.77,"to":1108.12,"location":2,"content":"So, so here's the method for using unlabeled data to improve translation."},{"from":1108.12,"to":1109.89,"location":2,"content":"How well does it do?"},{"from":1109.89,"to":1113.22,"location":2,"content":"Um, the answer is that the improvements are at least to me, they"},{"from":1113.22,"to":1116.49,"location":2,"content":"were surprisingly extremely good, right?"},{"from":1116.49,"to":1119.44,"location":2,"content":"So, um, this is for English to German translation."},{"from":1119.44,"to":1124.52,"location":2,"content":"This is from some work by Facebook, so they used 5 million labeled sentence pairs."},{"from":1124.52,"to":1132.29,"location":2,"content":"But they also used 230 million monolingual sentences, so sentences without translations."},{"from":1132.29,"to":1136.42,"location":2,"content":"And you can see that compared to previous state of the art,"},{"from":1136.42,"to":1139.76,"location":2,"content":"they get six BLEU points improvement which, um,"},{"from":1139.76,"to":1143.01,"location":2,"content":"if you compare it to most previous research in machine tran- machine translation"},{"from":1143.01,"to":1144.18,"location":2,"content":"is a really big gain, right?"},{"from":1144.18,"to":1148.02,"location":2,"content":"So even something like the invention of the transformer which most people would"},{"from":1148.02,"to":1153.16,"location":2,"content":"consider to be a really significant research development in NLP,"},{"from":1153.16,"to":1156.83,"location":2,"content":"that improved over prior work by about 2.5 BLEU points."},{"from":1156.83,"to":1162.33,"location":2,"content":"And here without doing any sort of fancy model design just by using way more data,"},{"from":1162.33,"to":1169.13,"location":2,"content":"um, we get actually much larger improvements."},{"from":1169.13,"to":1174.39,"location":2,"content":"Okay. 
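For reference, the BLEU numbers above are corpus-level scores; with the sacrebleu package (assuming it is installed) they can be computed like this, on made-up toy sentences:

```python
import sacrebleu

# Toy system outputs and one stream of reference translations.
hypotheses = ["I traveled to Belgium last year.", "The cat sat on the mat."]
references = [["I travelled to Belgium last year.", "The cat sat on the mat."]]

bleu = sacrebleu.corpus_bleu(hypotheses, references)
print(bleu.score)  # a 0-100 score; a +6 point jump is a very large gain
```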
So an interesting question to think about,"},{"from":1174.39,"to":1178.13,"location":2,"content":"um, is suppose we only have our monolingual corpora."},{"from":1178.13,"to":1181.15,"location":2,"content":"So we don't have any sentences that had been human translated."},{"from":1181.15,"to":1183.39,"location":2,"content":"We just have sentences in two languages."},{"from":1183.39,"to":1187.08,"location":2,"content":"Um, so the scenario you can sort of imagine is suppose,"},{"from":1187.08,"to":1188.98,"location":2,"content":"um, an alien comes down and,"},{"from":1188.98,"to":1190.74,"location":2,"content":"um, starts talking to you and it's a"},{"from":1190.74,"to":1193.96,"location":2,"content":"weird alien language, um, and it talks a lot,"},{"from":1193.96,"to":1198.12,"location":2,"content":"would you eventually be able to translate what it's saying to English,"},{"from":1198.12,"to":1203.3,"location":2,"content":"um, just by having a really large amount of data?"},{"from":1203.3,"to":1206.2,"location":2,"content":"Um, so I'm going to start with, um,"},{"from":1206.2,"to":1211.93,"location":2,"content":"a simpler task than full-on translating when you only have unlabeled sentences."},{"from":1211.93,"to":1215.22,"location":2,"content":"Um, instead of doing sentence to sentence translation,"},{"from":1215.22,"to":1218.64,"location":2,"content":"let's start by only worrying about word to word translation."},{"from":1218.64,"to":1221.49,"location":2,"content":"So the goal here is given a word in one language,"},{"from":1221.49,"to":1225.33,"location":2,"content":"find its translation but without using any labeled data."},{"from":1225.33,"to":1227.1,"location":2,"content":"Um, and the method,"},{"from":1227.1,"to":1229.44,"location":2,"content":"the method we're going to use to try to solve"},{"from":1229.44,"to":1233.46,"location":2,"content":"this task is called, uh, cross-lingual embeddings."},{"from":1233.46,"to":1235.83,"location":2,"content":"Um, so the goal is to learn, uh,"},{"from":1235.83,"to":1239.27,"location":2,"content":"word vectors for words in both languages,"},{"from":1239.27,"to":1241.93,"location":2,"content":"and we'd like those word vectors to have"},{"from":1241.93,"to":1245.55,"location":2,"content":"all the nice properties you've already learned about word vectors having, um,"},{"from":1245.55,"to":1249.15,"location":2,"content":"but we also want word vectors for a particular language,"},{"from":1249.15,"to":1252.86,"location":2,"content":"um, to be close to the word vector of its translation."},{"from":1252.86,"to":1257.09,"location":2,"content":"Um, so I'm not sure if it's visible in this figure but this fis- figure shows"},{"from":1257.09,"to":1262.47,"location":2,"content":"a large number of English and I think German words and you can see that,"},{"from":1262.47,"to":1267.8,"location":2,"content":"um, uh, the each English word has its corresponding German word,"},{"from":1267.8,"to":1270.33,"location":2,"content":"um, nearby to it in its embedding space."},{"from":1270.33,"to":1275.01,"location":2,"content":"So if we learn embeddings like this then it's pretty easy to do word to word translation."},{"from":1275.01,"to":1276.7,"location":2,"content":"Um, we just pick an English word,"},{"from":1276.7,"to":1278.55,"location":2,"content":"we find the nearest, uh,"},{"from":1278.55,"to":1282.08,"location":2,"content":"German word in this joint embedding space"},{"from":1282.08,"to":1288.47,"location":2,"content":"and that will give us a translation for the 
English word."},{"from":1288.47,"to":1292.18,"location":2,"content":"Um, our key method for or the key"},{"from":1292.18,"to":1295.5,"location":2,"content":"assumption that we're going to be using to solve this is that,"},{"from":1295.5,"to":1300.87,"location":2,"content":"um, th- even though if you run word2vec twice you'll get really different embeddings."},{"from":1300.87,"to":1306.93,"location":2,"content":"Um, the structure of that embedding space has a lot of regularity to it,"},{"from":1306.93,"to":1309.67,"location":2,"content":"and we can take advantage of that regularity, um,"},{"from":1309.67,"to":1311.7,"location":2,"content":"to help find when,"},{"from":1311.7,"to":1314.37,"location":2,"content":"um, an alignment between those embedding spaces."},{"from":1314.37,"to":1316.83,"location":2,"content":"So to be kind of more concrete here."},{"from":1316.83,"to":1319.56,"location":2,"content":"Here is a picture of two sets of word embeddings."},{"from":1319.56,"to":1320.82,"location":2,"content":"So in red, we have, um,"},{"from":1320.82,"to":1322.65,"location":2,"content":"English words, in, uh,"},{"from":1322.65,"to":1324.57,"location":2,"content":"blue we have Italian words,"},{"from":1324.57,"to":1329.28,"location":2,"content":"and although, um, the vector spaces right now look very different to each other,"},{"from":1329.28,"to":1332.4,"location":2,"content":"um, you can see that they have a really similar structure, right?"},{"from":1332.4,"to":1336.73,"location":2,"content":"So you'd imagine distances are kind of similar that the distance from,"},{"from":1336.73,"to":1339.35,"location":2,"content":"uh, cat and feline in the, um,"},{"from":1339.35,"to":1342.57,"location":2,"content":"English embedding space should be pretty similar to the distance"},{"from":1342.57,"to":1347.88,"location":2,"content":"between gatto and felino in the, um, Italian space."},{"from":1347.88,"to":1355.4,"location":2,"content":"Um, this kind of motivates an algorithm for learning these cross-lingual embeddings."},{"from":1355.4,"to":1358.44,"location":2,"content":"Um, so here's the idea."},{"from":1358.44,"to":1360.96,"location":2,"content":"What we're going to try to do is learn what's essentially"},{"from":1360.96,"to":1364.08,"location":2,"content":"a rotation such that we can transform,"},{"from":1364.08,"to":1366.66,"location":2,"content":"um, our set of English embeddings so"},{"from":1366.66,"to":1370.52,"location":2,"content":"that they match up with our Italian embe- embeddings."},{"from":1370.52,"to":1372.78,"location":2,"content":"So mathematically, what this means is we're gonna learn"},{"from":1372.78,"to":1375.66,"location":2,"content":"a matrix W such that if we take let's say,"},{"from":1375.66,"to":1380.36,"location":2,"content":"uh, the word vector for cat in English and we multiply it by W, um,"},{"from":1380.36,"to":1386.2,"location":2,"content":"we end up with the vector for gatto in Spanish or Italian,"},{"from":1386.2,"to":1389.55,"location":2,"content":"um, and a detail here is that, um,"},{"from":1389.55,"to":1392.58,"location":2,"content":"we're going to constrain W to be orthogonal, um,"},{"from":1392.58,"to":1395.07,"location":2,"content":"and what that means geometrically is just that W is"},{"from":1395.07,"to":1397.98,"location":2,"content":"only going to be doing a rotation to the,"},{"from":1397.98,"to":1399.94,"location":2,"content":"uh, vectors, um, in X."},{"from":1399.94,"to":1404.87,"location":2,"content":"It's not going to be doing some other weirder 
transformation."},{"from":1404.87,"to":1409.31,"location":2,"content":"So this is our goal is to learn this W. Um,"},{"from":1409.31,"to":1411,"location":2,"content":"next I'm gonna talk about,"},{"from":1411,"to":1416.98,"location":2,"content":"talking about how actually do we learn this W. Um,"},{"from":1416.98,"to":1421.66,"location":2,"content":"and there's actually a bunch of techniques for learning this W matrix,"},{"from":1421.66,"to":1424.74,"location":2,"content":"um, but, um, here is one of"},{"from":1424.74,"to":1428.31,"location":2,"content":"them that I think is quite clever is called adversarial training."},{"from":1428.31,"to":1430.63,"location":2,"content":"Um, so it works as follows,"},{"from":1430.63,"to":1433.77,"location":2,"content":"is in addition to trying to learn this W matrix,"},{"from":1433.77,"to":1437.67,"location":2,"content":"we're also going to be trying to learn a model that, uh,"},{"from":1437.67,"to":1438.91,"location":2,"content":"is called a discriminator,"},{"from":1438.91,"to":1442.8,"location":2,"content":"and what it'll do is take a vector and it will try to predict,"},{"from":1442.8,"to":1445.08,"location":2,"content":"is that vector originally, um,"},{"from":1445.08,"to":1448.83,"location":2,"content":"an English word embedding or is it originally an Italian word embedding?"},{"from":1448.83,"to":1451.42,"location":2,"content":"Um, in other words, if you think about, um,"},{"from":1451.42,"to":1454.92,"location":2,"content":"the diagram, what we're asking our discriminator to do is, uh,"},{"from":1454.92,"to":1457.68,"location":2,"content":"it's given one of these points and it's trying to predict is it"},{"from":1457.68,"to":1461.06,"location":2,"content":"basically a red point so an English word originally, or is it a blue point?"},{"from":1461.06,"to":1464.01,"location":2,"content":"Um, so if we have no W matrix and this is"},{"from":1464.01,"to":1467.19,"location":2,"content":"a really easy task for the discriminator because,"},{"from":1467.19,"to":1472.42,"location":2,"content":"um, the, uh, word embeddings for English and Italian are clearly separated."},{"from":1472.42,"to":1476.13,"location":2,"content":"Um, however, if we learn a W matrix"},{"from":1476.13,"to":1479.95,"location":2,"content":"that succeeds in aligning all these embeddings on top of each other,"},{"from":1479.95,"to":1483.27,"location":2,"content":"then our discriminator will never do a good job, right."},{"from":1483.27,"to":1486.21,"location":2,"content":"We can imagine it'll never really do better than 50%,"},{"from":1486.21,"to":1488.84,"location":2,"content":"um, because given a vector for say cat,"},{"from":1488.84,"to":1491.19,"location":2,"content":"it won't know is that the vector for cat that's been"},{"from":1491.19,"to":1494.13,"location":2,"content":"transformed by W or is it actually the vector for gatto?"},{"from":1494.13,"to":1498.88,"location":2,"content":"Um, because in this case those two vectors are aligned so they are on top of each other."},{"from":1498.88,"to":1503.71,"location":2,"content":"Um, so, um, during training, you first, um,"},{"from":1503.71,"to":1506.79,"location":2,"content":"you alternate between training the discriminator a little bit which"},{"from":1506.79,"to":1509.64,"location":2,"content":"means making sure it's as good as possible at"},{"from":1509.64,"to":1513.12,"location":2,"content":"distinguishing the English from Italian words and then you"},{"from":1513.12,"to":1516.93,"location":2,"content":"train the W and the goal for 
training W is to,"},{"from":1516.93,"to":1520.05,"location":2,"content":"uh, essentially confuse the discriminator as much as possible."},{"from":1520.05,"to":1523.21,"location":2,"content":"Um, so you want to have a situation where,"},{"from":1523.21,"to":1526.17,"location":2,"content":"um, you can't, um, with this machine learning model,"},{"from":1526.17,"to":1529.29,"location":2,"content":"figure out if a word embedding actually, um,"},{"from":1529.29,"to":1533.63,"location":2,"content":"was, um, originally from English or if it's an Italian word vector."},{"from":1533.63,"to":1536.09,"location":2,"content":"Um, and so at the end of the day you have,"},{"from":1536.09,"to":1539.42,"location":2,"content":"you have vectors that are kind of aligned with each other."},{"from":1539.42,"to":1547.22,"location":2,"content":"Um, any questions about this approach?"},{"from":1547.22,"to":1550.65,"location":2,"content":"Okay. Um, he- there's a link to a paper with more details."},{"from":1550.65,"to":1553.28,"location":2,"content":"There's actually kind of a range of other tricks you can do,"},{"from":1553.28,"to":1558.44,"location":2,"content":"um, but this is kind of a key idea."},{"from":1558.44,"to":1564.81,"location":2,"content":"Um, okay. So that was doing word to word unsupervised translation."},{"from":1564.81,"to":1569.15,"location":2,"content":"Um, how do we do full sentence to sentence translation?"},{"from":1569.15,"to":1571.72,"location":2,"content":"Um, so we're going to use, um,"},{"from":1571.72,"to":1573.75,"location":2,"content":"a standard sort of seq2seq model,"},{"from":1573.75,"to":1576.66,"location":2,"content":"um, without even an attention mechanism."},{"from":1576.66,"to":1579.9,"location":2,"content":"Um, there's one change to the standard seq2seq"},{"from":1579.9,"to":1583.05,"location":2,"content":"model going on here which is that, um,"},{"from":1583.05,"to":1585.78,"location":2,"content":"we're going to use the same encoder and decoder,"},{"from":1585.78,"to":1590.16,"location":2,"content":"uh, regardless of the input and output languages."},{"from":1590.16,"to":1591.93,"location":2,"content":"So you can see, um,"},{"from":1591.93,"to":1593.34,"location":2,"content":"in this example, um,"},{"from":1593.34,"to":1595.82,"location":2,"content":"we could give the encoder an English sentence,"},{"from":1595.82,"to":1600.36,"location":2,"content":"we could also give it a French sentence and it'll have these cross-lingual embeddings."},{"from":1600.36,"to":1603.26,"location":2,"content":"So it'll have vector representations for English words"},{"from":1603.26,"to":1607.14,"location":2,"content":"and French words which means it can handle sort of any input."},{"from":1607.14,"to":1609.38,"location":2,"content":"Um, for the decoder,"},{"from":1609.38,"to":1612.93,"location":2,"content":"we need to give it some information about what language is it supposed to generate in."},{"from":1612.93,"to":1614.95,"location":2,"content":"Is it going to generate in French or English?"},{"from":1614.95,"to":1618.66,"location":2,"content":"Um, so the way that is done is by, uh,"},{"from":1618.66,"to":1621.91,"location":2,"content":"feeding in a special token which here is Fr"},{"from":1621.91,"to":1625.59,"location":2,"content":"in brack- brackets to represent French that tells the model,"},{"from":1625.59,"to":1627.97,"location":2,"content":"okay, you should generate in French now."},{"from":1627.97,"to":1631.38,"location":2,"content":"Um, here in this figure it's only 
French,"},{"from":1631.38,"to":1633.97,"location":2,"content":"but you could imagine also feeding this model, uh,"},{"from":1633.97,"to":1637.63,"location":2,"content":"English in brackets, and then that'll tell it to, uh, generate English."},{"from":1637.63,"to":1641.67,"location":2,"content":"And one thing that you can see is that you could use this sort of model to g enerate,"},{"from":1641.67,"to":1643.15,"location":2,"content":"do go from English to French."},{"from":1643.15,"to":1645.45,"location":2,"content":"You could also use this model as an auto-encoder, right."},{"from":1645.45,"to":1647.3,"location":2,"content":"So, uh, at the bottom, um,"},{"from":1647.3,"to":1651.51,"location":2,"content":"it's taking in a French sentence as input and it's just generating French as"},{"from":1651.51,"to":1658.85,"location":2,"content":"output which here means just reproducing the original input sequence."},{"from":1658.85,"to":1663.1,"location":2,"content":"Um, so just a small change to standard seq2seq."},{"from":1663.1,"to":1666.77,"location":2,"content":"Here's how we're going to train the seq2seq model."},{"from":1666.77,"to":1670.17,"location":2,"content":"Um, there's going to be two training objectives, um,"},{"from":1670.17,"to":1671.94,"location":2,"content":"and I'll explain sort of why they're, uh,"},{"from":1671.94,"to":1675.06,"location":2,"content":"present in this model in just a few slides."},{"from":1675.06,"to":1677.03,"location":2,"content":"For now let's just say what they are."},{"from":1677.03,"to":1679.11,"location":2,"content":"So the first one is, um,"},{"from":1679.11,"to":1681.16,"location":2,"content":"called a de-noising autoencoder."},{"from":1681.16,"to":1686.43,"location":2,"content":"Um, what we're going to train our model to do in this case is take a, uh, sentence."},{"from":1686.43,"to":1688.14,"location":2,"content":"So, um, and here it's going to be"},{"from":1688.14,"to":1690.8,"location":2,"content":"an English sentence but it could also be a French sentence."},{"from":1690.8,"to":1694.17,"location":2,"content":"Um, we're going to scramble up the words a little bit,"},{"from":1694.17,"to":1696.88,"location":2,"content":"and then we're going to ask the model to, uh,"},{"from":1696.88,"to":1700.56,"location":2,"content":"de-noise that sentence which in other words means"},{"from":1700.56,"to":1705.35,"location":2,"content":"regenerating what the sentence actually was before it was scrambled."},{"from":1705.35,"to":1711.74,"location":2,"content":"And, uh, maybe one idea of why this would be a useful training objective is that,"},{"from":1711.74,"to":1715.51,"location":2,"content":"uh, since we have an encoder-decoder without atten- attention,"},{"from":1715.51,"to":1721.78,"location":2,"content":"the encoder is converting the entirety of the source sentence into a single vector,"},{"from":1721.78,"to":1727.11,"location":2,"content":"what an auto-encoder does is ensure that that vector contains all the information about"},{"from":1727.11,"to":1732.39,"location":2,"content":"the sentence such that we are able to recover what the original sentence was,"},{"from":1732.39,"to":1737.96,"location":2,"content":"um, from the vector produced by the encoder."},{"from":1737.96,"to":1740.81,"location":2,"content":"Um, so that was objective 1."},{"from":1740.81,"to":1745.01,"location":2,"content":"Training objective 2 is now we're actually going to be trying to do a translation,"},{"from":1745.01,"to":1747.48,"location":2,"content":"um, but, um, as 
before,"},{"from":1747.48,"to":1749.78,"location":2,"content":"we're going to be using this back-translation idea."},{"from":1749.78,"to":1752.97,"location":2,"content":"So remember, we only have unlabeled sentences,"},{"from":1752.97,"to":1756.02,"location":2,"content":"we don't have any human-provided translations,"},{"from":1756.02,"to":1759.75,"location":2,"content":"um, but what we can still do is, given, a,"},{"from":1759.75,"to":1762,"location":2,"content":"um, let's say an English sentence or let's say a French sentence,"},{"from":1762,"to":1764.51,"location":2,"content":"given a French sentence, we can translate it to English, um,"},{"from":1764.51,"to":1768.12,"location":2,"content":"using our model in its current state, uh,"},{"from":1768.12,"to":1772.61,"location":2,"content":"and then we can ask that model to translate from English or translate that- yeah,"},{"from":1772.61,"to":1774.69,"location":2,"content":"translate that English back into French."},{"from":1774.69,"to":1777.11,"location":2,"content":"Um, so what you can imagine is in this setting, um,"},{"from":1777.11,"to":1779.64,"location":2,"content":"the input sequence is going to be somewhat messed"},{"from":1779.64,"to":1782.82,"location":2,"content":"up because it's the output of our imperfect machine learning model."},{"from":1782.82,"to":1787.05,"location":2,"content":"So here the input sequence is just \"I am student,\" um, a word has been dropped,"},{"from":1787.05,"to":1791.82,"location":2,"content":"but, um, we're now gonna train it to, even with this kind of bad input,"},{"from":1791.82,"to":1795.33,"location":2,"content":"to reproduce the original, um,"},{"from":1795.33,"to":1798.27,"location":2,"content":"French sentence, um, from our,"},{"from":1798.27,"to":1801.27,"location":2,"content":"uh, corpus of- of monolingual, um, French text."},{"from":1801.27,"to":1808.91,"location":2,"content":"[NOISE] Um, let me- let me pause here actually and ask for questions."},{"from":1808.91,"to":1813.84,"location":2,"content":"Sure."},{"from":1813.84,"to":1816,"location":2,"content":"[NOISE] [inaudible] What if, um, the reason you have"},{"from":1816,"to":1820.31,"location":2,"content":"this orthogonality constraint for your words to be word embedding,"},{"from":1820.31,"to":1822.9,"location":2,"content":"is it to avoid overfitting?"},{"from":1822.9,"to":1829.8,"location":2,"content":"Have you tried to take that off, and you know, see what [inaudible]"},{"from":1829.8,"to":1831.06,"location":2,"content":"Yeah. That's a good question."},{"from":1831.06,"to":1835.31,"location":2,"content":"Um, so this is going back to earlier when there was a word-word translation."},{"from":1835.31,"to":1839.33,"location":2,"content":"Why would we constrain that W matrix to be orthogonal?"},{"from":1839.33,"to":1843.03,"location":2,"content":"Um, essentially, that's right. 
It's to avoid overfitting and in particular,"},{"from":1843.03,"to":1846.06,"location":2,"content":"it's making this assumption that our embedding spaces are so"},{"from":1846.06,"to":1850.01,"location":2,"content":"similar that there's actually just a rotation that distinguishes,"},{"from":1850.01,"to":1853.5,"location":2,"content":"um, our word vectors in English versus our word vectors in Italian."},{"from":1853.5,"to":1857.46,"location":2,"content":"Um, I think there has been, um,"},{"from":1857.46,"to":1861.36,"location":2,"content":"there have been results that don't include that orthogonality constraint,"},{"from":1861.36,"to":1864.48,"location":2,"content":"and I think it slightly hurts performance to not have that in there."},{"from":1864.48,"to":1869.13,"location":2,"content":"[NOISE] Okay."},{"from":1869.13,"to":1871.15,"location":2,"content":"Um, so- so continuing with,"},{"from":1871.15,"to":1873.77,"location":2,"content":"um, unsupervised machine translation,"},{"from":1873.77,"to":1877.29,"location":2,"content":"um, I- I gave a training method."},{"from":1877.29,"to":1880.39,"location":2,"content":"I didn't quite explain why it would work, so- so,"},{"from":1880.39,"to":1884.79,"location":2,"content":"um, here is some more intuition for- for this idea."},{"from":1884.79,"to":1887.73,"location":2,"content":"Um, so remember, um,"},{"from":1887.73,"to":1889.66,"location":2,"content":"we're going to initialize"},{"from":1889.66,"to":1893.26,"location":2,"content":"our machine translation model with these cross-lingual embeddings,"},{"from":1893.26,"to":1897.02,"location":2,"content":"which mean the English and French word should look close to identically."},{"from":1897.02,"to":1902.57,"location":2,"content":"Um, we're also using the shared, um, encoder."},{"from":1902.57,"to":1904.86,"location":2,"content":"Um, so that means if you think about it,"},{"from":1904.86,"to":1906.64,"location":2,"content":"um, at the top, we have just,"},{"from":1906.64,"to":1911.76,"location":2,"content":"a auto-encoding objective and we can certainly believe that our model can learn this."},{"from":1911.76,"to":1914.25,"location":2,"content":"Um, it's a pretty simple task."},{"from":1914.25,"to":1919.39,"location":2,"content":"Um, now imagine we're giving our model a French sentence as input instead."},{"from":1919.39,"to":1921.56,"location":2,"content":"Um, since the, uh,"},{"from":1921.56,"to":1923.85,"location":2,"content":"embeddings are going to look pretty similar,"},{"from":1923.85,"to":1926.19,"location":2,"content":"and since the encoder is the same, um,"},{"from":1926.19,"to":1929.76,"location":2,"content":"it's pretty likely that the model's representation of"},{"from":1929.76,"to":1931.95,"location":2,"content":"this French sentence should actually be very"},{"from":1931.95,"to":1935.52,"location":2,"content":"similar to the representation of the English sentence."},{"from":1935.52,"to":1939.87,"location":2,"content":"Um, so when this representation is passed into the decoder, um,"},{"from":1939.87,"to":1945.2,"location":2,"content":"we can hope that we'll get the same output as before."},{"from":1945.2,"to":1948.49,"location":2,"content":"Um, um, so here's like sort of as a starting point."},{"from":1948.49,"to":1950.87,"location":2,"content":"We- we can hope that our model, um,"},{"from":1950.87,"to":1953.43,"location":2,"content":"already is able to have some translation capability."},{"from":1953.43,"to":1957.84,"location":2,"content":"[NOISE] Um, another way of thinking about this 
is"},{"from":1957.84,"to":1962.36,"location":2,"content":"that what we really want our model to do is to be able to encode a sentence,"},{"from":1962.36,"to":1964.34,"location":2,"content":"such that the representation,"},{"from":1964.34,"to":1967.41,"location":2,"content":"um, is sort of a universal kind of Interlingua."},{"from":1967.41,"to":1969.88,"location":2,"content":"So a universal, um, uh,"},{"from":1969.88,"to":1973.68,"location":2,"content":"universal representation of that sentence that doesn't,"},{"from":1973.68,"to":1976.24,"location":2,"content":"uh, that's not specific to the language."},{"from":1976.24,"to":1979.79,"location":2,"content":"And so- so here's kind of a picture that's trying to get at this."},{"from":1979.79,"to":1983.16,"location":2,"content":"So our autoencoder, um, and our, um,"},{"from":1983.16,"to":1985.29,"location":2,"content":"here in our back-translation example,"},{"from":1985.29,"to":1987.21,"location":2,"content":"um, here, the target sequence is the same."},{"from":1987.21,"to":1990.09,"location":2,"content":"[NOISE] Um, so what that essentially means is"},{"from":1990.09,"to":1994.2,"location":2,"content":"that the vectors for the English sentence and the French sentence,"},{"from":1994.2,"to":1997.41,"location":2,"content":"um, are going to be trained to be the same, um, right?"},{"from":1997.41,"to":1999.64,"location":2,"content":"Because if they are different, our, uh,"},{"from":1999.64,"to":2001.52,"location":2,"content":"decoder would be generating different,"},{"from":2001.52,"to":2005.04,"location":2,"content":"uh, outputs on these two examples."},{"from":2005.04,"to":2009.64,"location":2,"content":"Um, so here- this is just another sort of intuition is that what our model is"},{"from":2009.64,"to":2011.29,"location":2,"content":"trying to learn here is kind of a way of"},{"from":2011.29,"to":2013.87,"location":2,"content":"encoding the information of a sentence in a vector,"},{"from":2013.87,"to":2017.1,"location":2,"content":"um, but in a way that is language-agnostic."},{"from":2017.1,"to":2019.46,"location":2,"content":"Um, any more questions about,"},{"from":2019.46,"to":2024.22,"location":2,"content":"uh, unsupervised machine translation?"},{"from":2024.22,"to":2030.35,"location":2,"content":"Okay. Um, so going on to results of this approach, um,"},{"from":2030.35,"to":2033.02,"location":2,"content":"here, the horizontal lines are,"},{"from":2033.02,"to":2036.86,"location":2,"content":"um, the results of an unsupervised machine translation model."},{"from":2036.86,"to":2040.78,"location":2,"content":"Um, the lines that go up are for a supervised machine translation model,"},{"from":2040.78,"to":2043.9,"location":2,"content":"um, as we give it more and more data."},{"from":2043.9,"to":2046.3,"location":2,"content":"Right? 
So unsurprisingly, um,"},{"from":2046.3,"to":2049.41,"location":2,"content":"given a large amount of supervised data, um,"},{"from":2049.41,"to":2051.79,"location":2,"content":"the supervised machine translation models"},{"from":2051.79,"to":2055.72,"location":2,"content":"work much better than the unsupervised machine translation model."},{"from":2055.72,"to":2059.28,"location":2,"content":"Um, but, um, the unsupervised machine translation model,"},{"from":2059.28,"to":2061.31,"location":2,"content":"actually still does quite well."},{"from":2061.31,"to":2066.99,"location":2,"content":"Um, so if you see it around 10,000 to 100,000 training examples,"},{"from":2066.99,"to":2070.57,"location":2,"content":"um, it actually does just as well or better than supervised translation,"},{"from":2070.57,"to":2073.58,"location":2,"content":"and I think that's a really promising result,"},{"from":2073.58,"to":2076.64,"location":2,"content":"uh, because if you think of, um,"},{"from":2076.64,"to":2079.55,"location":2,"content":"low-resource settings where there isn't much labeled examples, um,"},{"from":2079.55,"to":2082.28,"location":2,"content":"it suddenly becomes really nice that you can perform this well,"},{"from":2082.28,"to":2088.69,"location":2,"content":"um, without even needing to use a training set."},{"from":2088.69,"to":2091.74,"location":2,"content":"Um, another thing kind of fun you can do with,"},{"from":2091.74,"to":2095.2,"location":2,"content":"an unsupervised machine translation model is attribute transfer."},{"from":2095.2,"to":2098.86,"location":2,"content":"Um, so basically, you can, um, take, uh,"},{"from":2098.86,"to":2100.52,"location":2,"content":"collections of texts that,"},{"from":2100.52,"to":2103.19,"location":2,"content":"uh, split by any attribute you want."},{"from":2103.19,"to":2104.9,"location":2,"content":"So for example, you could go on Twitter,"},{"from":2104.9,"to":2108.65,"location":2,"content":"look at hashtags to decide which tweets are annoyed and which tweets are relaxed,"},{"from":2108.65,"to":2111.08,"location":2,"content":"and then you can treat those two corpora as"},{"from":2111.08,"to":2113.61,"location":2,"content":"text as though they were two different languages,"},{"from":2113.61,"to":2116.51,"location":2,"content":"and you can train an unsupervised machine translation model,"},{"from":2116.51,"to":2119.16,"location":2,"content":"uh, to convert from one to the other."},{"from":2119.16,"to":2122.49,"location":2,"content":"Uh, and you can see these examples, um,"},{"from":2122.49,"to":2126.65,"location":2,"content":"the model actually does a pretty good job of sort of minimally changing the sentence,"},{"from":2126.65,"to":2129.68,"location":2,"content":"kind of preserving a lot of that sentence's original semantics,"},{"from":2129.68,"to":2136.54,"location":2,"content":"um, such that the target attribute is changed."},{"from":2136.54,"to":2141.29,"location":2,"content":"Um, I also wanna throw a little bit of cold water on this idea."},{"from":2141.29,"to":2144.41,"location":2,"content":"So I do think it's really exciting and- and almost kind of"},{"from":2144.41,"to":2147.65,"location":2,"content":"mind-blowing that you can do this translation without labeled data."},{"from":2147.65,"to":2149.6,"location":2,"content":"Um, certainly, right."},{"from":2149.6,"to":2154.52,"location":2,"content":"It's really hard to imagine someone giving me a bunch of books in Italian and say, \"Okay."},{"from":2154.52,"to":2156.41,"location":2,"content":"We're in 
Italian,\" um, without, you know,"},{"from":2156.41,"to":2159.76,"location":2,"content":"teaching you how to specifically do the translation."},{"from":2159.76,"to":2163.95,"location":2,"content":"Um, but, um, even though these methods show promise,"},{"from":2163.95,"to":2168.09,"location":2,"content":"um, mostly they have shown promise on languages that are quite closely related."},{"from":2168.09,"to":2169.78,"location":2,"content":"So those previous results,"},{"from":2169.78,"to":2171.07,"location":2,"content":"those were all, um,"},{"from":2171.07,"to":2173.75,"location":2,"content":"some combination of English to French or English to German,"},{"from":2173.75,"to":2176.27,"location":2,"content":"um, or so on, and those languages are quite similar."},{"from":2176.27,"to":2178.28,"location":2,"content":"[NOISE] Um, so if you look at, uh,"},{"from":2178.28,"to":2180.32,"location":2,"content":"a different language pair, let's say English to Turkish,"},{"from":2180.32,"to":2184.68,"location":2,"content":"where, um, the linguistics in those two languages are quite different, uh,"},{"from":2184.68,"to":2187.61,"location":2,"content":"these methods do still work to some extent, um,"},{"from":2187.61,"to":2190.91,"location":2,"content":"so they get around five BLEU points let's say, uh,"},{"from":2190.91,"to":2193.19,"location":2,"content":"but they don't work nearly as well,"},{"from":2193.19,"to":2195.89,"location":2,"content":"um, as they do in the f- uh, i- in the other settings, right?"},{"from":2195.89,"to":2200.24,"location":2,"content":"So there's still a huge gap to purely supervised learning. Um, right?"},{"from":2200.24,"to":2201.35,"location":2,"content":"So we're probably not, you know,"},{"from":2201.35,"to":2205.04,"location":2,"content":"quite at this stage where an alien could come down and it's sort of, no problem,"},{"from":2205.04,"to":2208.22,"location":2,"content":"let's use our unsupervised machine translation system, um,"},{"from":2208.22,"to":2212.4,"location":2,"content":"but I still think that's pretty exciting progress. Um, yeah. Question?"},{"from":2212.4,"to":2215.27,"location":2,"content":"Um, so what you're saying is that the genealogy of"},{"from":2215.27,"to":2218.63,"location":2,"content":"a language might need it to superimpose worse, right?"},{"from":2218.63,"to":2221.51,"location":2,"content":"Because my original thought was that if you took, for example,"},{"from":2221.51,"to":2224.81,"location":2,"content":"like Latin, which doesn't have a word for, you know,"},{"from":2224.81,"to":2231.44,"location":2,"content":"the modern classification of car, I thought that would do more poorly. 
But if- but, uh, basically,"},{"from":2231.44,"to":2235.28,"location":2,"content":"what I'm asking is, do you think the English maps better to Latin"},{"from":2235.28,"to":2240.38,"location":2,"content":"because they're both related, and worse to Turkish or is it the other way around?"},{"from":2240.38,"to":2245.64,"location":2,"content":"Um, I would expect English to map quite a lot better to Latin."},{"from":2245.64,"to":2248.93,"location":2,"content":"And I think part of the issue here is that, um,"},{"from":2248.93,"to":2253.46,"location":2,"content":"the difficulty in translation I think is not really at the word level."},{"from":2253.46,"to":2255.41,"location":2,"content":"So I mean that certainly is an issue that words exist"},{"from":2255.41,"to":2257.49,"location":2,"content":"in one language that don't exist in another,"},{"from":2257.49,"to":2258.74,"location":2,"content":"um, but I think actually,"},{"from":2258.74,"to":2263.2,"location":2,"content":"more substantial differences between language is at the level of like syntax,"},{"from":2263.2,"to":2265.82,"location":2,"content":"um, um, or you know, semantics, right?"},{"from":2265.82,"to":2267.41,"location":2,"content":"How ideas are expressed."},{"from":2267.41,"to":2273.51,"location":2,"content":"Um, so- so I think I- I would expect Ital- Latin to have, you know,"},{"from":2273.51,"to":2276.02,"location":2,"content":"relatively similar syntax to English,"},{"from":2276.02,"to":2277.58,"location":2,"content":"um, compared to say Turkish,"},{"from":2277.58,"to":2279.86,"location":2,"content":"I imagine that is probably the bigger obstacle"},{"from":2279.86,"to":2287.19,"location":2,"content":"for unsupervised machine translation models."},{"from":2287.19,"to":2290.26,"location":2,"content":"Um, I'm going to really quickly go into"},{"from":2290.26,"to":2294.91,"location":2,"content":"this last recent research paper which is basically taking BERT which,"},{"from":2294.91,"to":2297.26,"location":2,"content":"which you've learned about, um, correct?"},{"from":2297.26,"to":2300.19,"location":2,"content":"Yes. Okay. 
And making it cross-lingual."},{"from":2300.19,"to":2303.73,"location":2,"content":"Um, so, um, here's what regular BERT is, right?"},{"from":2303.73,"to":2306.3,"location":2,"content":"We have a sequence of sentences in English."},{"from":2306.3,"to":2308.22,"location":2,"content":"We're going to mask out some of the words."},{"from":2308.22,"to":2311.5,"location":2,"content":"And we're going to ask BERT which is our transformer model, um,"},{"from":2311.5,"to":2316.68,"location":2,"content":"to essentially fill in the blanks and predict what were the words that were dropped out."},{"from":2316.68,"to":2322.99,"location":2,"content":"Um, what actually has already been done by Google is training a multilingual BERT ."},{"from":2322.99,"to":2326.84,"location":2,"content":"So what they did essentially is concatenate, um,"},{"from":2326.84,"to":2331.56,"location":2,"content":"a whole bunch of corpora in different languages and then train one model um,"},{"from":2331.56,"to":2334.78,"location":2,"content":"doing using this masked LM objective um,"},{"from":2334.78,"to":2336.32,"location":2,"content":"on all of that text at once."},{"from":2336.32,"to":2338.3,"location":2,"content":"And that's a publicly released model."},{"from":2338.3,"to":2343.49,"location":2,"content":"Um, the, the new kind of extension to this that has recently been uh,"},{"from":2343.49,"to":2346.3,"location":2,"content":"proposed by Facebook is to actually combine"},{"from":2346.3,"to":2350.97,"location":2,"content":"this masked LM training objective um, with uh, translation."},{"from":2350.97,"to":2357.13,"location":2,"content":"So what they do is sometimes give this model a in this case,"},{"from":2357.13,"to":2361.06,"location":2,"content":"a sequence in English and a sequence in uh, French."},{"from":2361.06,"to":2364.3,"location":2,"content":"Um, drop out some of the words and just as before,"},{"from":2364.3,"to":2366.13,"location":2,"content":"ask the model to fill it in."},{"from":2366.13,"to":2368.64,"location":2,"content":"And the motivation here is that, um,"},{"from":2368.64,"to":2371.08,"location":2,"content":"this will much better cause the model"},{"from":2371.08,"to":2373.53,"location":2,"content":"to understand the relation between these two languages."},{"from":2373.53,"to":2377.95,"location":2,"content":"Because if you're trying to find a fill in a English word that's been dropped,"},{"from":2377.95,"to":2380.5,"location":2,"content":"uh, the best way to do it if you have a translation is look"},{"from":2380.5,"to":2383.01,"location":2,"content":"at the French side and try to find that word."},{"from":2383.01,"to":2385.07,"location":2,"content":"Hopefully, that one hasn't been dropped as well."},{"from":2385.07,"to":2388.53,"location":2,"content":"And then you can um, much more easily fill in the blank."},{"from":2388.53,"to":2392.57,"location":2,"content":"And uh, this actually leads to very uh,"},{"from":2392.57,"to":2395.86,"location":2,"content":"substantial improvements in unsupervised machine translation."},{"from":2395.86,"to":2399.67,"location":2,"content":"So just like BERT is used for other tasks in NLP,"},{"from":2399.67,"to":2402.01,"location":2,"content":"they basically take this cross-lingual BERT."},{"from":2402.01,"to":2403.93,"location":2,"content":"They use it as initialization for"},{"from":2403.93,"to":2407.41,"location":2,"content":"a unsupervised machine translation system and they get, you know,"},{"from":2407.41,"to":2410.43,"location":2,"content":"really large gains on the 
order of 10 BLEU points um,"},{"from":2410.43,"to":2412.69,"location":2,"content":"such that the gap between"},{"from":2412.69,"to":2416.49,"location":2,"content":"unsupervised machine translation and the current supervised state of the art,"},{"from":2416.49,"to":2418.42,"location":2,"content":"um, is much smaller."},{"from":2418.42,"to":2423.19,"location":2,"content":"Uh, so this is a pretty recent idea but I think it also shows promise"},{"from":2423.19,"to":2428.32,"location":2,"content":"in really improving the quality of translation through using unlabeled data."},{"from":2428.32,"to":2430.95,"location":2,"content":"Um, although I guess yeah, I guess in this case with BERT"},{"from":2430.95,"to":2433.93,"location":2,"content":"they are using labeled translation data as well."},{"from":2433.93,"to":2437.82,"location":2,"content":"Any, any questions about this?"},{"from":2437.82,"to":2446.35,"location":2,"content":"Okay. Um, so that is all I'm going to say about using unlabeled data for translation."},{"from":2446.35,"to":2448.75,"location":2,"content":"The next part of this talk is about um,"},{"from":2448.75,"to":2454.72,"location":2,"content":"what happens if we really scale up these unsupervised language models."},{"from":2454.72,"to":2459.86,"location":2,"content":"Um, so in particular I'm gonna talk about GPT-2 which is a new model by OpenAI."},{"from":2459.86,"to":2462.26,"location":2,"content":"That's essentially a really giant language model"},{"from":2462.26,"to":2465.68,"location":2,"content":"and I think it has some interesting implications."},{"from":2465.68,"to":2475.06,"location":2,"content":"So first of all, here's just the sizes of a bunch of different NLP models and,"},{"from":2475.06,"to":2478.16,"location":2,"content":"um, you know, maybe a couple years ago the,"},{"from":2478.16,"to":2479.23,"location":2,"content":"the standard sort of"},{"from":2479.23,"to":2484.14,"location":2,"content":"LSTM medium-size model was on the order of about 10 million parameters."},{"from":2484.14,"to":2490.66,"location":2,"content":"Where 10- where a parameter is just a single weight let's say in the neural net um,"},{"from":2490.66,"to":2493.09,"location":2,"content":"ELMo and uh, GPT."},{"from":2493.09,"to":2495.52,"location":2,"content":"So the original OpenAI paper before they did"},{"from":2495.52,"to":2498.82,"location":2,"content":"this GPT-2, were about 10 times bigger than that."},{"from":2498.82,"to":2504.12,"location":2,"content":"Um, GPT-2 is about another order of magnitude bigger."},{"from":2504.12,"to":2508.82,"location":2,"content":"Um, one kind of interesting comparison point here is that uh,"},{"from":2508.82,"to":2511.74,"location":2,"content":"GPT-2 which is 1.5 billion parameters,"},{"from":2511.74,"to":2515.64,"location":2,"content":"actually has more parameters than a honey bee brain has synapses."},{"from":2515.64,"to":2518.44,"location":2,"content":"Um, so that sounds kind of impressive, right?"},{"from":2518.44,"to":2521.35,"location":2,"content":"You know honeybees are not the smartest of"},{"from":2521.35,"to":2525.32,"location":2,"content":"animals but they can still fly around and find nectar or whatever."},{"from":2525.32,"to":2528.76,"location":2,"content":"Um, but yeah. 
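To make "parameter" concrete: it is just a trainable weight, and counting them in PyTorch is one line. The toy two-layer model below is illustrative; GPT-2's roughly 1.5 billion is the synapse-scale figure just mentioned:

```python
import torch.nn as nn

# A small two-layer network, purely for illustration.
model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024))
print(sum(p.numel() for p in model.parameters()))  # ~8.4M weights here
```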
Of course, this isn't really an apples to apples comparison, right?"},{"from":2528.76,"to":2531.97,"location":2,"content":"So a synapse and a weight in a neural net are really quite different."},{"from":2531.97,"to":2534.49,"location":2,"content":"But I just think it's one kind of interesting milestone"},{"from":2534.49,"to":2536.82,"location":2,"content":"let's say in terms of model size um,"},{"from":2536.82,"to":2538.15,"location":2,"content":"that has been surpassed."},{"from":2538.15,"to":2546.84,"location":2,"content":"[NOISE] Um, one thing to point out here is that um,"},{"from":2546.84,"to":2552.13,"location":2,"content":"this increasing scaling of deep learning is really a general trend uh,"},{"from":2552.13,"to":2554.84,"location":2,"content":"in all of machine learning so beyond NLP."},{"from":2554.84,"to":2561.76,"location":2,"content":"So this plot is showing time on the x-axis and the y-axis is log scaled um,"},{"from":2561.76,"to":2565.26,"location":2,"content":"the amount of petaFLOPS used to train this model."},{"from":2565.26,"to":2570.01,"location":2,"content":"Um, so what this means is that the trend at least currently is that there is"},{"from":2570.01,"to":2573.13,"location":2,"content":"exponential growth in how much compute power"},{"from":2573.13,"to":2575.74,"location":2,"content":"we're throwing at our machine learning models."},{"from":2575.74,"to":2577.92,"location":2,"content":"I guess it is kind of unclear, you know,"},{"from":2577.92,"to":2580.7,"location":2,"content":"will exponential growth continue but certainly um,"},{"from":2580.7,"to":2583.56,"location":2,"content":"there's rapid growth in the size of our models."},{"from":2583.56,"to":2586.2,"location":2,"content":"And it's leading to some really amazing results, right?"},{"from":2586.2,"to":2589.45,"location":2,"content":"So here are results not from language but for vision."},{"from":2589.45,"to":2593.16,"location":2,"content":"Um, this is a generative adversarial network"},{"from":2593.16,"to":2596.92,"location":2,"content":"that's been trained on a lot of data and it's been trained on really large scales."},{"from":2596.92,"to":2602.71,"location":2,"content":"So it's a big model kind of in-between the size of ELMo and BERT let's say."},{"from":2602.71,"to":2607.51,"location":2,"content":"And uh, these photos here are actually productions of the model."},{"from":2607.51,"to":2608.74,"location":2,"content":"So those aren't real photos."},{"from":2608.74,"to":2611.51,"location":2,"content":"Those are things the model has just kind of hallucinated out of thin air."},{"from":2611.51,"to":2614.77,"location":2,"content":"And at least to me they look essentially photo-realistic."},{"from":2614.77,"to":2618.01,"location":2,"content":"There's also a website that um, is fun to look at it."},{"from":2618.01,"to":2619.91,"location":2,"content":"If you're not- if you're interested which is,"},{"from":2619.91,"to":2622.2,"location":2,"content":"thispersondoesnotexist.com."},{"from":2622.2,"to":2623.95,"location":2,"content":"So if you go there, you'll see"},{"from":2623.95,"to":2627.43,"location":2,"content":"a very convincing photo of a person but it's not a real photo."},{"from":2627.43,"to":2631.44,"location":2,"content":"It's again like a hallucinated image produced by a GAN."},{"from":2631.44,"to":2635.72,"location":2,"content":"We're also seeing really huge models being used for image recognition."},{"from":2635.72,"to":2638.11,"location":2,"content":"So this is recent work by Google where they 
trained"},{"from":2638.11,"to":2642.01,"location":2,"content":"an image net model with half a billion parameters."},{"from":2642.01,"to":2646.45,"location":2,"content":"So that's bigger than BERT but not as big as GPT-2."},{"from":2646.45,"to":2649.42,"location":2,"content":"Um, this plot here is showing a"},{"from":2649.42,"to":2654.76,"location":2,"content":"log scaled number of parameters on the x-axis and then accuracy at ImageNet"},{"from":2654.76,"to":2660.52,"location":2,"content":"on the y-axis- axis and sort of unsurprisingly bigger models perform better."},{"from":2660.52,"to":2664,"location":2,"content":"And there seems to actually be a pretty consistent trend here which is uh,"},{"from":2664,"to":2671.01,"location":2,"content":"accuracy is increasing with the log of the, the model size."},{"from":2671.01,"to":2675.1,"location":2,"content":"Um, I wanna go into a little bit more detail, how is it"},{"from":2675.1,"to":2679.06,"location":2,"content":"possible that we can scale up models and train models at such a large extent."},{"from":2679.06,"to":2681.19,"location":2,"content":"One answer is just better hardware."},{"from":2681.19,"to":2682.68,"location":2,"content":"And in particular, um,"},{"from":2682.68,"to":2684.16,"location":2,"content":"there's a growing uh,"},{"from":2684.16,"to":2688.16,"location":2,"content":"number of companies that are developing hardware specifically for deep learning."},{"from":2688.16,"to":2690.52,"location":2,"content":"So these are even more kind of constrained and the"},{"from":2690.52,"to":2693.19,"location":2,"content":"kind of operations they can do than a GPU,"},{"from":2693.19,"to":2695.95,"location":2,"content":"um but they do those operations even faster."},{"from":2695.95,"to":2699.61,"location":2,"content":"So Google's Tensor Processing Units is one example."},{"from":2699.61,"to":2703.18,"location":2,"content":"There are actually a bunch of other companies working on this idea."},{"from":2703.18,"to":2706.93,"location":2,"content":"Um, the other way to scale up models is by taking advantage of"},{"from":2706.93,"to":2711.84,"location":2,"content":"parallelism and there's two kinds of parallelism that I want to talk about very briefly."},{"from":2711.84,"to":2713.98,"location":2,"content":"So one is data parallelism."},{"from":2713.98,"to":2716.78,"location":2,"content":"In this case, each of your,"},{"from":2716.78,"to":2719.38,"location":2,"content":"let's say GPUs, will have a copy of the model."},{"from":2719.38,"to":2721.48,"location":2,"content":"And what you essentially do is split"},{"from":2721.48,"to":2725.35,"location":2,"content":"the mini-batch that you're training on across these different models."},{"from":2725.35,"to":2727.16,"location":2,"content":"So if you have, let's say,"},{"from":2727.16,"to":2730.95,"location":2,"content":"16 GPUs and each of them see a batch size of 32."},{"from":2730.95,"to":2735.67,"location":2,"content":"You can aggregate the gradients of these 16 uh, uh,"},{"from":2735.67,"to":2742.54,"location":2,"content":"if you do a back-prop on these 16 GPUs and you end up with effectively a batch size of 512."},{"from":2742.54,"to":2744.7,"location":2,"content":"So this allows you to train models much faster."},{"from":2744.7,"to":2750.34,"location":2,"content":"Um, the other kind of parallelism that's growing in importance is model par- parallelism."},{"from":2750.34,"to":2754.51,"location":2,"content":"Um, so eventually models get so big that 
they"},{"from":2754.51,"to":2759.07,"location":2,"content":"can't even fit on a single GPU and they can't even do a batch size of one."},{"from":2759.07,"to":2760.66,"location":2,"content":"Um, in this case,"},{"from":2760.66,"to":2762.99,"location":2,"content":"you actually need to split up the model across"},{"from":2762.99,"to":2766.07,"location":2,"content":"multiple computers- multiple compute units."},{"from":2766.07,"to":2770.49,"location":2,"content":"Um, and that's what's done for models kind of the size of,"},{"from":2770.49,"to":2772.72,"location":2,"content":"of let's say GPT-2."},{"from":2772.72,"to":2775.54,"location":2,"content":"There are new frameworks such as Mesh-TensorFlow, um,"},{"from":2775.54,"to":2783.99,"location":2,"content":"which are basically designed to make this sort of model parallelism easier."},{"from":2783.99,"to":2787.39,"location":2,"content":"Um, okay. So onto GPT-2, um,"},{"from":2787.39,"to":2791.56,"location":2,"content":"I know you already saw this a little bit in the contextualized uh,"},{"from":2791.56,"to":2796.54,"location":2,"content":"um, embeddings um, lecture but I'm going to go into some more depth here."},{"from":2796.54,"to":2801.26,"location":2,"content":"[NOISE] So so essentially it's a really large transformer language model."},{"from":2801.26,"to":2805.16,"location":2,"content":"Um, so there's nothing really kind of novel here in terms"},{"from":2805.16,"to":2809.3,"location":2,"content":"of new training algorithms or in terms of um,"},{"from":2809.3,"to":2811.64,"location":2,"content":"the loss function or anything like that."},{"from":2811.64,"to":2813.34,"location":2,"content":"Um, the thing that makes it different from"},{"from":2813.34,"to":2816.07,"location":2,"content":"prior work is that it's just really really big."},{"from":2816.07,"to":2819.97,"location":2,"content":"Uh, it's trained on a correspondingly huge amount of text."},{"from":2819.97,"to":2824.8,"location":2,"content":"So it's trained on 40 gigabytes and that's roughly 10 times larger than previous uh,"},{"from":2824.8,"to":2827.22,"location":2,"content":"language models have been trained on."},{"from":2827.22,"to":2831.07,"location":2,"content":"Um, when you have that size of dataset,"},{"from":2831.07,"to":2834.32,"location":2,"content":"um, the only way to get that much text is essentially to go to the web."},{"from":2834.32,"to":2838.84,"location":2,"content":"Um, so one thing OpenAI put a quite a bit of effort into when they're developing"},{"from":2838.84,"to":2843.57,"location":2,"content":"this network was to ensure that that text was pretty high-quality."},{"from":2843.57,"to":2846.18,"location":2,"content":"Um, and they did that in a kind of interesting way."},{"from":2846.18,"to":2848.97,"location":2,"content":"They, they looked at Reddit which is this website where people uh,"},{"from":2848.97,"to":2850.14,"location":2,"content":"can vote on links."},{"from":2850.14,"to":2851.64,"location":2,"content":"And then they said uh, if"},{"from":2851.64,"to":2855.09,"location":2,"content":"a link has a lot of votes then it's probably sort of a decent link."},{"from":2855.09,"to":2856.83,"location":2,"content":"There's probably um, you know,"},{"from":2856.83,"to":2860.61,"location":2,"content":"reasonable text there for a model to learn."},{"from":2860.61,"to":2863.08,"location":2,"content":"Um, okay, so if we have"},{"from":2863.08,"to":2865.6,"location":2,"content":"this super huge language model 
like"},{"from":2865.6,"to":2869.51,"location":2,"content":"GPT-2 on this question of what can you actually do with it,"},{"from":2869.51,"to":2873.41,"location":2,"content":"um, well obviously if you have a language model you can do language modelling with it."},{"from":2873.41,"to":2876.79,"location":2,"content":"Uh, but one thing kind of interestingly interesting is that you"},{"from":2876.79,"to":2880.53,"location":2,"content":"can run this language model on er,"},{"from":2880.53,"to":2883.43,"location":2,"content":"existing benchmarks, um, for,"},{"from":2883.43,"to":2885.25,"location":2,"content":"for language modelling, um,"},{"from":2885.25,"to":2888.52,"location":2,"content":"and it gets state of the art perplexity on these benchmarks even"},{"from":2888.52,"to":2891.7,"location":2,"content":"though it never sees the training data for these benchmarks, right?"},{"from":2891.7,"to":2896.77,"location":2,"content":"So normally, if you want to say evaluate your language model on the Penn Treebank."},{"from":2896.77,"to":2901.51,"location":2,"content":"You first train on the Penn Treebank and then you evaluate on this held-out set."},{"from":2901.51,"to":2903.79,"location":2,"content":"Uh, in this case, uh,"},{"from":2903.79,"to":2908.51,"location":2,"content":"a GPT-2 just by virtue of having seen so much text and being such a large model,"},{"from":2908.51,"to":2911.09,"location":2,"content":"outperforms all these other uh,"},{"from":2911.09,"to":2914.58,"location":2,"content":"prior works even though it's not seeing that data."},{"from":2914.58,"to":2920.8,"location":2,"content":"Um, on a bunch of different uh, language modelling benchmarks."},{"from":2920.8,"to":2926.32,"location":2,"content":"Um, but there's a bunch of other interesting experiments that OpenAI"},{"from":2926.32,"to":2931.7,"location":2,"content":"ran with this language modeling and these were based on zero-shot learning."},{"from":2931.7,"to":2937.25,"location":2,"content":"So zero-shot learning just means trying to do a task without ever training on it."},{"from":2937.25,"to":2940.45,"location":2,"content":"And, uh, the way you can do this with a language model"},{"from":2940.45,"to":2943.46,"location":2,"content":"is by designing a prompt you feed into"},{"from":2943.46,"to":2946.88,"location":2,"content":"the language model and then have it just generate from there and"},{"from":2946.88,"to":2951.07,"location":2,"content":"hopefully it generates something relevant to the task you're trying to solve."},{"from":2951.07,"to":2953.22,"location":2,"content":"So for example, for reading comprehension,"},{"from":2953.22,"to":2956.09,"location":2,"content":"what you can do is take the context paragraph,"},{"from":2956.09,"to":2960.08,"location":2,"content":"uh, concatenate the question to it and then add uh,"},{"from":2960.08,"to":2961.43,"location":2,"content":"a colon which is a way,"},{"from":2961.43,"to":2962.7,"location":2,"content":"I guess, of telling the model,"},{"from":2962.7,"to":2965.21,"location":2,"content":"''Okay you should be producing an answer to this question,''"},{"from":2965.21,"to":2967.79,"location":2,"content":"and then just have it generate text, um,"},{"from":2967.79,"to":2970.94,"location":2,"content":"and perhaps it'll generate something that is actually answering,"},{"from":2970.94,"to":2972.36,"location":2,"content":"um, the question and is,"},{"from":2972.36,"to":2974.06,"location":2,"content":"is paying attention to the 
context."},{"from":2974.06,"to":2977.39,"location":2,"content":"[NOISE] Um, and similarly, for summarization,"},{"from":2977.39,"to":2981.74,"location":2,"content":"you can get the article then TL;DR and perhaps the model will produce the summary."},{"from":2981.74,"to":2983.8,"location":2,"content":"Um, you can even do translation,"},{"from":2983.8,"to":2985.66,"location":2,"content":"where you give the model,"},{"from":2985.66,"to":2989.72,"location":2,"content":"um, some ex- a list of known English to French translations so you, sort of,"},{"from":2989.72,"to":2993.77,"location":2,"content":"prime it to tell it that it should be doing translation and then you give"},{"from":2993.77,"to":2998.12,"location":2,"content":"it the source sequence equals blank and have it just run and,"},{"from":2998.12,"to":2999.92,"location":2,"content":"um, perhaps it'll generate,"},{"from":2999.92,"to":3003.3,"location":2,"content":"um, the sequence in the target language."},{"from":3003.3,"to":3006.89,"location":2,"content":"Um, okay. So so here's what the results look like."},{"from":3006.89,"to":3009.1,"location":2,"content":"Um, for all of these,"},{"from":3009.1,"to":3011.55,"location":2,"content":"uh, the X-axis is,"},{"from":3011.55,"to":3016.21,"location":2,"content":"is log scaled model size and the Y-axis is accuracy, um,"},{"from":3016.21,"to":3018.72,"location":2,"content":"and the dotted lines basically correspond to,"},{"from":3018.72,"to":3022.09,"location":2,"content":"um, existing works on these tasks."},{"from":3022.09,"to":3026.29,"location":2,"content":"Um, so for most of these tasks, um,"},{"from":3026.29,"to":3031.76,"location":2,"content":"GPT-2 is quite a bit below existing systems,"},{"from":3031.76,"to":3033.63,"location":2,"content":"um, but there's of course this big difference, right?"},{"from":3033.63,"to":3037.2,"location":2,"content":"Existing systems are trained specifically to do,"},{"from":3037.2,"to":3039.78,"location":2,"content":"um, whatever task they're being evaluated on,"},{"from":3039.78,"to":3042.52,"location":2,"content":"where GPT-2 is um,"},{"from":3042.52,"to":3046.54,"location":2,"content":"only trained to do language modeling and as it learns language modeling,"},{"from":3046.54,"to":3048.86,"location":2,"content":"it's sort of picking up on these other tasks."},{"from":3048.86,"to":3050.78,"location":2,"content":"Um, so right. 
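As a concrete sketch of the zero-shot prompting recipe described above (context plus question plus an answer cue, "TL;DR:" for summaries, primed translation pairs), here is a minimal Python example. The use of the Hugging Face `transformers` library and its small public `gpt2` checkpoint is an assumption made for illustration; the lecture describes the technique, not this toolkit, and the prompts are made up.

```python
# A minimal sketch of zero-shot prompting, assuming the Hugging Face
# `transformers` library and its small public "gpt2" checkpoint.
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

# Reading comprehension: context, then the question, then a cue ("A:")
# telling the model an answer should come next.
prompt = (
    "Charles Darwin wrote On the Origin of Species in 1859.\n"
    "Q: Who wrote On the Origin of Species?\n"
    "A:"
)
print(generator(prompt, max_new_tokens=8)[0]["generated_text"])

# Summarization: append "TL;DR:" to the article and let the model continue.
print(generator("Some article text here. TL;DR:", max_new_tokens=30)[0]["generated_text"])

# Translation: prime with known English = French pairs, leave the last
# target blank, and hope the model fills it in.
print(generator("cheese = fromage\nhello = bonjour\nthank you =", max_new_tokens=5)[0]["generated_text"])
```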
So for example, um,"},{"from":3050.78,"to":3054.39,"location":2,"content":"it does, uh, English to French machine translation, um,"},{"from":3054.39,"to":3056.88,"location":2,"content":"not as well as, uh,"},{"from":3056.88,"to":3060.4,"location":2,"content":"standard unsupervised machine translation which is those, uh,"},{"from":3060.4,"to":3062.92,"location":2,"content":"dotted lines, um, but it still,"},{"from":3062.92,"to":3064.3,"location":2,"content":"it still does quite well."},{"from":3064.3,"to":3066.37,"location":2,"content":"And, um, one thing, kind of,"},{"from":3066.37,"to":3067.81,"location":2,"content":"interesting is the trend line, right,"},{"from":3067.81,"to":3069.52,"location":2,"content":"for almost all of these tasks."},{"from":3069.52,"to":3071.53,"location":2,"content":"Um, performance is getting uh,"},{"from":3071.53,"to":3073.6,"location":2,"content":"much better as the model increases in size."},{"from":3073.6,"to":3078.53,"location":2,"content":"[NOISE] Um, I think a particularly interesting,"},{"from":3078.53,"to":3081.58,"location":2,"content":"uh, one of these tasks is machine translation, right?"},{"from":3081.58,"to":3083.29,"location":2,"content":"So the question is, how can it be doing"},{"from":3083.29,"to":3086.44,"location":2,"content":"machine translation when all we're giving it is a bunch of"},{"from":3086.44,"to":3088.54,"location":2,"content":"web pages and those web pages are almost all in"},{"from":3088.54,"to":3091.81,"location":2,"content":"English and yet somehow it sort of magically picks up uh,"},{"from":3091.81,"to":3093.34,"location":2,"content":"a little bit of machine translation, right."},{"from":3093.34,"to":3095.39,"location":2,"content":"So it's not a great model but it can still,"},{"from":3095.39,"to":3098.26,"location":2,"content":"um, you know, do a decent job in some cases."},{"from":3098.26,"to":3100.51,"location":2,"content":"Um, and the answer is that,"},{"from":3100.51,"to":3103.81,"location":2,"content":"if you look at this giant corpus of English,"},{"from":3103.81,"to":3107.05,"location":2,"content":"occasionally, uh, within, within that corpus,"},{"from":3107.05,"to":3108.88,"location":2,"content":"you see examples of translations, right?"},{"from":3108.88,"to":3110.29,"location":2,"content":"So you see, um,"},{"from":3110.29,"to":3112.81,"location":2,"content":"a French idiom and its translation or"},{"from":3112.81,"to":3116.03,"location":2,"content":"a quote from someone who's French and then the translation in English."},{"from":3116.03,"to":3117.4,"location":2,"content":"And, um, kind of,"},{"from":3117.4,"to":3120.7,"location":2,"content":"amazingly I think this big model, um,"},{"from":3120.7,"to":3125.38,"location":2,"content":"sees enough of these examples that it actually starts to learn how to generate French,"},{"from":3125.38,"to":3127.03,"location":2,"content":"um, even though that wasn't really,"},{"from":3127.03,"to":3131.97,"location":2,"content":"sort of, an intended part of its training."},{"from":3131.97,"to":3134.56,"location":2,"content":"Um, another interesting, um,"},{"from":3134.56,"to":3138.7,"location":2,"content":"thing to dig a bit more into is its ability to do question answering."},{"from":3138.7,"to":3144.04,"location":2,"content":"So uh, a simple baseline for question answering gets about 1% accuracy,"},{"from":3144.04,"to":3147.3,"location":2,"content":"GPT-2 barely does better at 4% accuracy."},{"from":3147.3,"to":3148.84,"location":2,"content":"So this isn't, like, you 
know,"},{"from":3148.84,"to":3152.44,"location":2,"content":"super amazingly solved question answering, um, but, um,"},{"from":3152.44,"to":3154.42,"location":2,"content":"it's still pretty interesting in that,"},{"from":3154.42,"to":3157.43,"location":2,"content":"if you look at answers the model's most confident about,"},{"from":3157.43,"to":3159.01,"location":2,"content":"you can see that it sort of"},{"from":3159.01,"to":3161.32,"location":2,"content":"has learned some facts about the world, right."},{"from":3161.32,"to":3165.55,"location":2,"content":"So it's learned that Charles Darwin wrote Origin of Species."},{"from":3165.55,"to":3170.74,"location":2,"content":"Um, normally in the history of NLP, if you want to get, kind of,"},{"from":3170.74,"to":3172.76,"location":2,"content":"world knowledge into an NLP system,"},{"from":3172.76,"to":3175.43,"location":2,"content":"you'd need something like a big database of facts."},{"from":3175.43,"to":3177.34,"location":2,"content":"And even though this is still,"},{"from":3177.34,"to":3179.5,"location":2,"content":"kind of, very early stages and that, um,"},{"from":3179.5,"to":3184,"location":2,"content":"there's still a huge gap between 4% accuracy and the, uh, you know,"},{"from":3184,"to":3185.88,"location":2,"content":"70% or so that, uh,"},{"from":3185.88,"to":3189.55,"location":2,"content":"state of the art open domain question answering systems can do,"},{"from":3189.55,"to":3192.01,"location":2,"content":"um, it, it, um,"},{"from":3192.01,"to":3194.2,"location":2,"content":"it still can, uh,"},{"from":3194.2,"to":3197.74,"location":2,"content":"pick up some world knowledge just by reading a lot of text, um, without,"},{"from":3197.74,"to":3201.89,"location":2,"content":"kind of, explicitly having that knowledge put into the model."},{"from":3201.89,"to":3208.05,"location":2,"content":"Um, any questions by the way on GPT-2 so far?"},{"from":3208.05,"to":3213.86,"location":2,"content":"Okay. 
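The size trends in these plots can also be extrapolated numerically rather than by eye, which is essentially the exercise the next part of the lecture does with lines in PowerPoint. A minimal sketch follows; every number in it is a made-up placeholder, not a figure from the talk.

```python
# Illustrative only: fit accuracy against log10(model size) and solve for
# where the fitted line reaches a target level. All numbers are made up.
import numpy as np

params = np.array([117e6, 345e6, 762e6, 1542e6])  # hypothetical model sizes
accuracy = np.array([0.45, 0.55, 0.63, 0.70])     # hypothetical accuracies

slope, intercept = np.polyfit(np.log10(params), accuracy, 1)

target = 0.90  # hypothetical "human level"
size_needed = 10 ** ((target - intercept) / slope)
print(f"trend line reaches {target:.0%} at ~{size_needed:.1e} parameters")
```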
So one question that's interesting to think about is,"},{"from":3213.86,"to":3216.51,"location":2,"content":"what happens if our models get even bigger?"},{"from":3216.51,"to":3218.3,"location":2,"content":"Um, so here I've done the, um,"},{"from":3218.3,"to":3222.57,"location":2,"content":"very scientific thing of drawing some lines in PowerPoint and seeing where they meet up."},{"from":3222.57,"to":3224.66,"location":2,"content":"Um, and you can see that, um,"},{"from":3224.66,"to":3228.43,"location":2,"content":"if the trend holds at about 1 trillion parameters,"},{"from":3228.43,"to":3232.39,"location":2,"content":"um, we get to human level reading comprehension performance."},{"from":3232.39,"to":3235.48,"location":2,"content":"Um, so if that's true it would be really astonishing."},{"from":3235.48,"to":3240.51,"location":2,"content":"I actually do expect that a 1 trillion parameter model would be attainable in,"},{"from":3240.51,"to":3242.16,"location":2,"content":"I don't know, ten years or so,"},{"from":3242.16,"to":3244.24,"location":2,"content":"um, but of course,"},{"from":3244.24,"to":3245.66,"location":2,"content":"right, the trend isn't clear."},{"from":3245.66,"to":3247.63,"location":2,"content":"So if you look at summarization for example,"},{"from":3247.63,"to":3249.04,"location":2,"content":"it seems like performance is already,"},{"from":3249.04,"to":3251.01,"location":2,"content":"uh, uh, topped out."},{"from":3251.01,"to":3255.76,"location":2,"content":"Um, so I think this will be a really interesting thing kinda going forward,"},{"from":3255.76,"to":3257.98,"location":2,"content":"looking at the future of NLP, um,"},{"from":3257.98,"to":3260.71,"location":2,"content":"is how the scaling will change,"},{"from":3260.71,"to":3264.12,"location":2,"content":"um, the way NLP is approached."},{"from":3264.12,"to":3269.76,"location":2,"content":"Um, the other interesting thing about GPT-2 was its reaction from uh,"},{"from":3269.76,"to":3272.13,"location":2,"content":"the media and also from other researchers."},{"from":3272.13,"to":3275.45,"location":2,"content":"Um, and the real cause of"},{"from":3275.45,"to":3279.3,"location":2,"content":"a lot of the controversy about it was this statement from OpenAI."},{"from":3279.3,"to":3283,"location":2,"content":"They said that, ''We're not going to release our full language model,"},{"from":3283,"to":3284.59,"location":2,"content":"um, because it's too dangerous,"},{"from":3284.59,"to":3286.01,"location":2,"content":"you know, our language model is too good.''"},{"from":3286.01,"to":3291.11,"location":2,"content":"Um, so the media really enjoyed this and,"},{"from":3291.11,"to":3292.33,"location":2,"content":"you know, said that,"},{"from":3292.33,"to":3295.14,"location":2,"content":"uh, machine learning is going to break the Internet."},{"from":3295.14,"to":3300.58,"location":2,"content":"Um, there's also some pretty interesting reactions from our researchers, right."},{"from":3300.58,"to":3302.02,"location":2,"content":"So um, there's some,"},{"from":3302.02,"to":3304.2,"location":2,"content":"kind of, tongue-in-cheek responses here, right."},{"from":3304.2,"to":3305.76,"location":2,"content":"You know, I trained the model on MNIST."},{"from":3305.76,"to":3307.91,"location":2,"content":"Is it too dangerous for me to release it?"},{"from":3307.91,"to":3311.53,"location":2,"content":"Um, and similarly, we've done really great work"},{"from":3311.53,"to":3315.72,"location":2,"content":"but we can't release it it's too dangerous so 
you're just gonna have to trust us on this."},{"from":3315.72,"to":3318.97,"location":2,"content":"Looking at more, kind of, reasoned, um,"},{"from":3318.97,"to":3320.66,"location":2,"content":"debate about this issue,"},{"from":3320.66,"to":3322.89,"location":2,"content":"you still see articles,"},{"from":3322.89,"to":3324.61,"location":2,"content":"um, arguing both sides."},{"from":3324.61,"to":3326.47,"location":2,"content":"So these are two ar- articles,"},{"from":3326.47,"to":3329.55,"location":2,"content":"um, from The Gradient which is a, sort of,"},{"from":3329.55,"to":3331.69,"location":2,"content":"machine learning newsletter, um,"},{"from":3331.69,"to":3335.88,"location":2,"content":"and they're arguing precisely opposite sides of this issue,"},{"from":3335.88,"to":3340.77,"location":2,"content":"um, should it be released or not."},{"from":3340.77,"to":3347.13,"location":2,"content":"So I guess I can briefly go over a few arguments for or against."},{"from":3347.13,"to":3350.18,"location":2,"content":"There is, kind of, a lot of debate about this and I don't want to"},{"from":3350.18,"to":3354.15,"location":2,"content":"go too deep into a controversial issue,"},{"from":3354.15,"to":3356.71,"location":2,"content":"um, but here's a long list of,"},{"from":3356.71,"to":3358.57,"location":2,"content":"kind of, things people have said about this, right."},{"from":3358.57,"to":3361.45,"location":2,"content":"So um, here's why you should release."},{"from":3361.45,"to":3363.28,"location":2,"content":"One complaint is that,"},{"from":3363.28,"to":3365.07,"location":2,"content":"is this model really that special?"},{"from":3365.07,"to":3366.59,"location":2,"content":"There's nothing new going on here."},{"from":3366.59,"to":3369.64,"location":2,"content":"It's just 10 times bigger than previous models, um,"},{"from":3369.64,"to":3371.86,"location":2,"content":"and there's also some arguments that,"},{"from":3371.86,"to":3374.5,"location":2,"content":"um, even if this one isn't released, you know,"},{"from":3374.5,"to":3377.18,"location":2,"content":"in five years everybody can train a model this good, um,"},{"from":3377.18,"to":3382.27,"location":2,"content":"and actually if you look at image recognition or look at images and speech data, um,"},{"from":3382.27,"to":3385.78,"location":2,"content":"it already is possible to synthesize highly convincing,"},{"from":3385.78,"to":3388.41,"location":2,"content":"um, fake images and fake speech."},{"from":3388.41,"to":3394.75,"location":2,"content":"So kinda, what makes this thing different from those other, um, systems."},{"from":3394.75,"to":3396.31,"location":2,"content":"And speaking of other systems, right,"},{"from":3396.31,"to":3398.34,"location":2,"content":"Photoshop has existed for a long time,"},{"from":3398.34,"to":3401.95,"location":2,"content":"so we can already convincingly fake images, um,"},{"from":3401.95,"to":3404.14,"location":2,"content":"people have just learned to adjust and learned"},{"from":3404.14,"to":3406.64,"location":2,"content":"that you shouldn't always trust what's in an image,"},{"from":3406.64,"to":3407.99,"location":2,"content":"um, because it may have been,"},{"from":3407.99,"to":3410.07,"location":2,"content":"um, altered in some way."},{"from":3410.07,"to":3412.45,"location":2,"content":"Um, on the other hand, you could say,"},{"from":3412.45,"to":3415.78,"location":2,"content":"''Okay, uh, Photoshop exists but, um, you can't, sort of,"},{"from":3415.78,"to":3420.13,"location":2,"content":"scale up 
Photoshop and start mass producing fake content the way you can with this sort"},{"from":3420.13,"to":3424.66,"location":2,"content":"of model,'' and they pointed at the danger of uh, fake news, um,"},{"from":3424.66,"to":3428.95,"location":2,"content":"fake reviews, um, in general just astroturfing, which means basically,"},{"from":3428.95,"to":3435.37,"location":2,"content":"uh, creating fake user content that's supporting a view you want other people to hold."},{"from":3435.37,"to":3438.87,"location":2,"content":"Um, this is actually something that's already done,"},{"from":3438.87,"to":3441.66,"location":2,"content":"um, pretty widely by companies and governments."},{"from":3441.66,"to":3443.47,"location":2,"content":"There's a lot of evidence for this, um,"},{"from":3443.47,"to":3445.5,"location":2,"content":"but they are of course hiring people to"},{"from":3445.5,"to":3447.8,"location":2,"content":"write all these comments on news articles let's say"},{"from":3447.8,"to":3450.39,"location":2,"content":"and we don't want to make their job any easier"},{"from":3450.39,"to":3453.62,"location":2,"content":"by producing a machine that could potentially do this."},{"from":3453.62,"to":3457.33,"location":2,"content":"So um, I'm not really gonna take a side here,"},{"from":3457.33,"to":3459.57,"location":2,"content":"um, there's still a lot of debate about this."},{"from":3459.57,"to":3461.11,"location":2,"content":"I think, you know,"},{"from":3461.11,"to":3463.3,"location":2,"content":"the main, the main takeaway here is that,"},{"from":3463.3,"to":3466.96,"location":2,"content":"as a community of people in machine learning and NLP,"},{"from":3466.96,"to":3468.91,"location":2,"content":"we don't really have a handle on this, right?"},{"from":3468.91,"to":3471.36,"location":2,"content":"We are sort of caught by surprise by, um,"},{"from":3471.36,"to":3476.09,"location":2,"content":"OpenAI's, um, decision here and, um, uh,"},{"from":3476.09,"to":3477.76,"location":2,"content":"that means that, you know,"},{"from":3477.76,"to":3481.12,"location":2,"content":"there really is some figuring out that needs to be done on what"},{"from":3481.12,"to":3485.51,"location":2,"content":"exactly is responsible to release publicly."},{"from":3485.51,"to":3489.43,"location":2,"content":"What kind of research problems should we be working on and so on."},{"from":3489.43,"to":3491.53,"location":2,"content":"[NOISE] So yeah."},{"from":3491.53,"to":3493.8,"location":2,"content":"Any questions about uh, this,"},{"from":3493.8,"to":3496.45,"location":2,"content":"this reaction or this debate in general?"},{"from":3496.45,"to":3502.14,"location":2,"content":"[NOISE] Okay."},{"from":3502.14,"to":3507.61,"location":2,"content":"Um, I think something arising from this debate is, um,"},{"from":3507.61,"to":3509.31,"location":2,"content":"the question of, um,"},{"from":3509.31,"to":3512.58,"location":2,"content":"should really the ML people be the people making these, sort of,"},{"from":3512.58,"to":3518.09,"location":2,"content":"decisions or is there a need for more interdisciplinary science where we look at, um,"},{"from":3518.09,"to":3520.43,"location":2,"content":"experts in say, computer security,"},{"from":3520.43,"to":3522.7,"location":2,"content":"um, people from social sciences,"},{"from":3522.7,"to":3526.18,"location":2,"content":"um, you know, people who are experts in ethics,"},{"from":3526.18,"to":3528.36,"location":2,"content":"um, to look at these 
decisions."},{"from":3528.36,"to":3534.59,"location":2,"content":"Um, right. So GPT-2 was definitely one example of where suddenly it seems like,"},{"from":3534.59,"to":3538.42,"location":2,"content":"um, our NLP technology has a lot of pitfalls, right."},{"from":3538.42,"to":3542.01,"location":2,"content":"Where they could be used in a malicious way or they could cause damage."},{"from":3542.01,"to":3545.72,"location":2,"content":"And I think this trend is only going to increase, um,"},{"from":3545.72,"to":3547.16,"location":2,"content":"if you look at, kind of,"},{"from":3547.16,"to":3550.54,"location":2,"content":"areas of NLP that people are working on, uh,"},{"from":3550.54,"to":3556.51,"location":2,"content":"increasingly people are working on really high stakes applications of NLP,"},{"from":3556.51,"to":3559.57,"location":2,"content":"um, and those often have really big, um,"},{"from":3559.57,"to":3565.98,"location":2,"content":"ramifications, especially if you think from the angle of bias and fairness."},{"from":3565.98,"to":3572.69,"location":2,"content":"Um, so, so let's go over a couple examples of this, um-"},{"from":3572.69,"to":3575.95,"location":2,"content":"Um, one- so some, some areas where,"},{"from":3575.95,"to":3577.88,"location":2,"content":"where this is happening is people are looking at,"},{"from":3577.88,"to":3580.05,"location":2,"content":"uh, NLP to look at judicial decisions."},{"from":3580.05,"to":3581.89,"location":2,"content":"So for example, should this person,"},{"from":3581.89,"to":3583.3,"location":2,"content":"uh, get bail or not?"},{"from":3583.3,"to":3585.21,"location":2,"content":"Um, for hiring decisions, right?"},{"from":3585.21,"to":3586.68,"location":2,"content":"So you look at someone's resume,"},{"from":3586.68,"to":3588,"location":2,"content":"you run NLP on it,"},{"from":3588,"to":3590.78,"location":2,"content":"and then you'd make a decision automatically,"},{"from":3590.78,"to":3593.13,"location":2,"content":"um, sh- should we throw out this resume or not?"},{"from":3593.13,"to":3596.85,"location":2,"content":"So do some, sort of, screening, um, grading tests."},{"from":3596.85,"to":3598.65,"location":2,"content":"Um, if you take the GRE, um,"},{"from":3598.65,"to":3600.82,"location":2,"content":"your, your tests will be graded by a machine."},{"from":3600.82,"to":3603.09,"location":2,"content":"Um, a person will also look at it, um,"},{"from":3603.09,"to":3605.3,"location":2,"content":"but nevertheless, um, that's, you know,"},{"from":3605.3,"to":3609.09,"location":2,"content":"a sometimes very impactful part of your life, um, when it's,"},{"from":3609.09,"to":3611.09,"location":2,"content":"when it's the tests that, um, inf- you know,"},{"from":3611.09,"to":3614.49,"location":2,"content":"affects your, um, acceptance into a school, let's say."},{"from":3614.49,"to":3617.26,"location":2,"content":"Um, so I think there is- are some,"},{"from":3617.26,"to":3620.79,"location":2,"content":"some good sides of using Machine Learning in these kinds of contexts."},{"from":3620.79,"to":3624.12,"location":2,"content":"So one is that we can pretty quickly evaluate,"},{"from":3624.12,"to":3626.99,"location":2,"content":"a machine learning system and search out."},{"from":3626.99,"to":3628.68,"location":2,"content":"Does it have some, kind of, bias,"},{"from":3628.68,"to":3631.35,"location":2,"content":"just by running it on a bunch of data and seeing what it does,"},{"from":3631.35,"to":3634.35,"location":2,"content":"and also perhaps even more 
importantly,"},{"from":3634.35,"to":3635.64,"location":2,"content":"um, we can fix this, kind of,"},{"from":3635.64,"to":3637.08,"location":2,"content":"problem if it arises, right?"},{"from":3637.08,"to":3642.24,"location":2,"content":"So, um, it's probably easier to fix a machine learning system that screens resumes,"},{"from":3642.24,"to":3644.73,"location":2,"content":"than it is to fix having, you know,"},{"from":3644.73,"to":3648.3,"location":2,"content":"5,000 executives that are slightly sexist or something, right?"},{"from":3648.3,"to":3649.72,"location":2,"content":"So, so in this way,"},{"from":3649.72,"to":3651.18,"location":2,"content":"um, there is a, sort of,"},{"from":3651.18,"to":3657.84,"location":2,"content":"positive angle on using machine learning in these high-stakes, um, uh, decisions."},{"from":3657.84,"to":3660.01,"location":2,"content":"Um, on the other hand, um,"},{"from":3660.01,"to":3662.22,"location":2,"content":"it's been pretty well, uh, known,"},{"from":3662.22,"to":3664.77,"location":2,"content":"and I know you had a lecture on bias and fairness,"},{"from":3664.77,"to":3667.77,"location":2,"content":"that machine learning often reflects bias in a data-set,"},{"from":3667.77,"to":3671.03,"location":2,"content":"um, it can even amplify bias in the data-set."},{"from":3671.03,"to":3672.66,"location":2,"content":"Um, and there's concern of, kind of,"},{"from":3672.66,"to":3675.32,"location":2,"content":"a feedback loop where a biased algorithm"},{"from":3675.32,"to":3678.36,"location":2,"content":"actually will lead to the creation of more biased data,"},{"from":3678.36,"to":3683.15,"location":2,"content":"um, in which case these problems will only compound and get worse."},{"from":3683.15,"to":3688.95,"location":2,"content":"Um, so for all of the, uh, high-impact decisions,"},{"from":3688.95,"to":3690.99,"location":2,"content":"um, I, I had listed on that slide,"},{"from":3690.99,"to":3694.32,"location":2,"content":"there are examples where things have gone awry, right?"},{"from":3694.32,"to":3696.69,"location":2,"content":"So Amazon had some AI that was,"},{"from":3696.69,"to":3699.97,"location":2,"content":"um, working as a recruiting tool and it turned out to be sexist."},{"from":3699.97,"to":3702.26,"location":2,"content":"Um, um, there have been some, kind of,"},{"from":3702.26,"to":3704.55,"location":2,"content":"early pilots of using AI, um,"},{"from":3704.55,"to":3706.68,"location":2,"content":"in the justice system and those also have had,"},{"from":3706.68,"to":3709.71,"location":2,"content":"um, in some cases, really bad results."},{"from":3709.71,"to":3712.92,"location":2,"content":"Um, if you look at automatic,"},{"from":3712.92,"to":3714.86,"location":2,"content":"automatic essay grading, um,"},{"from":3714.86,"to":3716.43,"location":2,"content":"it's not really a great,"},{"from":3716.43,"to":3717.72,"location":2,"content":"you know, NLP system, right?"},{"from":3717.72,"to":3719.73,"location":2,"content":"So here's an example, um,"},{"from":3719.73,"to":3722.36,"location":2,"content":"excerpt of an essay that, um,"},{"from":3722.36,"to":3726.24,"location":2,"content":"an automatic grading system used by the GRE test gives, uh,"},{"from":3726.24,"to":3728.04,"location":2,"content":"a very high score, um,"},{"from":3728.04,"to":3730.23,"location":2,"content":"but really it's just, kind of, a salad of,"},{"from":3730.23,"to":3732.42,"location":2,"content":"uh, big fancy words and 
that's"},{"from":3732.42,"to":3737.24,"location":2,"content":"enough to convince the model that this is a, a great essay."},{"from":3737.24,"to":3739.41,"location":2,"content":"Um, the last, um,"},{"from":3739.41,"to":3741.55,"location":2,"content":"area I wanna talk about where, where, um,"},{"from":3741.55,"to":3743.55,"location":2,"content":"you can see there's really some risks and"},{"from":3743.55,"to":3746.66,"location":2,"content":"some pitfalls with using NLP technology, is chatbots."},{"from":3746.66,"to":3751.56,"location":2,"content":"Um, so I think chatbots do have a side where they can be very beneficial."},{"from":3751.56,"to":3753.93,"location":2,"content":"Um, Woebot is one example,"},{"from":3753.93,"to":3757.55,"location":2,"content":"is this company that has this chatbot you can talk to if you're not,"},{"from":3757.55,"to":3759.48,"location":2,"content":"um, feeling too great and it'll try to,"},{"from":3759.48,"to":3761.57,"location":2,"content":"um, I don't know, cheer you up."},{"from":3761.57,"to":3763.83,"location":2,"content":"Um, so, so that, you know,"},{"from":3763.83,"to":3766.77,"location":2,"content":"could be a- a really nice piece of technology that helps people,"},{"from":3766.77,"to":3769.38,"location":2,"content":"um, but on the other hand, there's some big risks."},{"from":3769.38,"to":3773.52,"location":2,"content":"So, so one example is Microsoft research had a chatbot trained on tweets,"},{"from":3773.52,"to":3776.85,"location":2,"content":"and it started quickly saying racist things and had to be pulled."},{"from":3776.85,"to":3779.63,"location":2,"content":"Um, so I think all of this highlights that, um,"},{"from":3779.63,"to":3782.51,"location":2,"content":"as NLP is becoming more effective,"},{"from":3782.51,"to":3785.84,"location":2,"content":"people are seeing opportunities to use it in, um,"},{"from":3785.84,"to":3789.3,"location":2,"content":"increasingly high-stakes decisions and although,"},{"from":3789.3,"to":3791.78,"location":2,"content":"you know, there are some nice- there's some appeal to that,"},{"from":3791.78,"to":3794.31,"location":2,"content":"um, there's also a lot of risk."},{"from":3794.31,"to":3797.31,"location":2,"content":"Um, any more questions on, uh,"},{"from":3797.31,"to":3801.65,"location":2,"content":"this sort of social impact of NLP?"},{"from":3801.65,"to":3809.25,"location":2,"content":"Okay. 
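To make the earlier auditing point concrete, that you can check a trained screening system for bias quickly just by running it over a lot of data, here is a toy sketch. The decisions, group labels, and sizes are all hypothetical stand-ins invented for illustration.

```python
# Toy audit: run a trained screening model over held-out examples and
# compare its decision rates across groups. All arrays are hypothetical.
import numpy as np

rng = np.random.default_rng(0)
decisions = rng.integers(0, 2, size=2000)  # 1 = model accepts the resume
group = rng.choice(["A", "B"], size=2000)  # e.g., a protected attribute

for g in ("A", "B"):
    rate = decisions[group == g].mean()
    print(f"group {g}: acceptance rate {rate:.1%}")
# A persistent gap between the rates is a quick red flag that the model
# may be reflecting, or amplifying, bias in its training data.
```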
Um, last part of this lecture is looking more at future research, right?"},{"from":3809.25,"to":3810.47,"location":2,"content":"And in particular, um,"},{"from":3810.47,"to":3813.51,"location":2,"content":"I think a lot of the current research trends are,"},{"from":3813.51,"to":3815.76,"location":2,"content":"kind of reactions to BERT, um, right?"},{"from":3815.76,"to":3820.08,"location":2,"content":"So, so the question is what did BERT solve and what do we work on next?"},{"from":3820.08,"to":3824.3,"location":2,"content":"Um, so here are results on the GLUE benchmark."},{"from":3824.3,"to":3827.07,"location":2,"content":"Um, that is, uh, a compendium of,"},{"from":3827.07,"to":3830.28,"location":2,"content":"uh, 10 natural language understanding tasks."},{"from":3830.28,"to":3834.42,"location":2,"content":"Um, and you get an average score across those 10 tasks."},{"from":3834.42,"to":3837.81,"location":2,"content":"Um, the left, uh, two- sorry,"},{"from":3837.81,"to":3840.72,"location":2,"content":"the two rightmost models are,"},{"from":3840.72,"to":3843.33,"location":2,"content":"um, uh,"},{"from":3843.33,"to":3846.48,"location":2,"content":"just supervised machine learning systems, right?"},{"from":3846.48,"to":3848.36,"location":2,"content":"So we have Bag-of-Vectors, um,"},{"from":3848.36,"to":3850.92,"location":2,"content":"we instead use our fancy neural net architecture"},{"from":3850.92,"to":3853.65,"location":2,"content":"of BiLSTM + Attention and we get about five points."},{"from":3853.65,"to":3855.6,"location":2,"content":"Um, but the gains from BERT,"},{"from":3855.6,"to":3857.52,"location":2,"content":"uh, really dwarf that difference, right?"},{"from":3857.52,"to":3859.89,"location":2,"content":"So, so BERT improves results by about, uh,"},{"from":3859.89,"to":3864.12,"location":2,"content":"17 points and we end up being actually quite close,"},{"from":3864.12,"to":3866.93,"location":2,"content":"um, to human performance on these tasks."},{"from":3866.93,"to":3869.82,"location":2,"content":"Um, so one, sort of,"},{"from":3869.82,"to":3872.22,"location":2,"content":"implication of this that people are wondering about is,"},{"from":3872.22,"to":3875.11,"location":2,"content":"is this, kind of, the death of architecture engineering?"},{"from":3875.11,"to":3879.22,"location":2,"content":"Um, so I'm sure all of you who have worked on the default final project, um,"},{"from":3879.22,"to":3882.57,"location":2,"content":"have seen a whole bunch of fancy pictures showing different,"},{"from":3882.57,"to":3884.49,"location":2,"content":"uh, architectures for solving SQuAD."},{"from":3884.49,"to":3886.71,"location":2,"content":"Um, there are a lot of papers."},{"from":3886.71,"to":3888.39,"location":2,"content":"They all propose some, kind of,"},{"from":3888.39,"to":3890.89,"location":2,"content":"uh, attention mechanism or something like that."},{"from":3890.89,"to":3893.88,"location":2,"content":"Um, and, um, right."},{"from":3893.88,"to":3895.17,"location":2,"content":"With BERT, it's, sort of,"},{"from":3895.17,"to":3896.97,"location":2,"content":"um, you don't need to do any of that, right?"},{"from":3896.97,"to":3899.19,"location":2,"content":"You just train a transformer and you give it enough data,"},{"from":3899.19,"to":3901.02,"location":2,"content":"and actually you're doing great on SQuAD,"},{"from":3901.02,"to":3903.89,"location":2,"content":"you know, maybe, um, these, 
uh,"},{"from":3903.89,"to":3907.8,"location":2,"content":"architectural enhancements are not necessarily, um,"},{"from":3907.8,"to":3910.59,"location":2,"content":"the key thing that'll drive progress in,"},{"from":3910.59,"to":3914.15,"location":2,"content":"uh, improving results on these tasks."},{"from":3914.15,"to":3916.74,"location":2,"content":"Um, right. So, uh,"},{"from":3916.74,"to":3918.63,"location":2,"content":"if you look at this with the perspective of a researcher,"},{"from":3918.63,"to":3920.61,"location":2,"content":"you can think a researcher will say, \"Okay,"},{"from":3920.61,"to":3923.52,"location":2,"content":"I can spend six months designing a fancy new architecture for"},{"from":3923.52,"to":3927.93,"location":2,"content":"SQuAD and if I do a good job maybe I'll improve results by 1, uh, F1 point.\""},{"from":3927.93,"to":3930.03,"location":2,"content":"Um, but in the case of BERT, um,"},{"from":3930.03,"to":3932.16,"location":2,"content":"increasing the size of their model by 3x,"},{"from":3932.16,"to":3933.24,"location":2,"content":"which is the difference between,"},{"from":3933.24,"to":3936.09,"location":2,"content":"their, like, base size model and a large model,"},{"from":3936.09,"to":3939.59,"location":2,"content":"um, that improves results by 5 F1 points."},{"from":3939.59,"to":3942.15,"location":2,"content":"Um, so it does seem to suggest we need to, sort of,"},{"from":3942.15,"to":3946.64,"location":2,"content":"re-prioritize, um, which avenues of research we'd pursue,"},{"from":3946.64,"to":3949.5,"location":2,"content":"because this architecture engineering isn't providing, kind of,"},{"from":3949.5,"to":3952.61,"location":2,"content":"gains for its time investment the way,"},{"from":3952.61,"to":3954.76,"location":2,"content":"uh, leveraging unlabeled data is."},{"from":3954.76,"to":3957.74,"location":2,"content":"Um, so now, if you look at the SQuAD leaderboard, um,"},{"from":3957.74,"to":3964.19,"location":2,"content":"I think at least the top 20 entrants are all BERT plus something."},{"from":3964.19,"to":3967.72,"location":2,"content":"Um, one other issue, uh,"},{"from":3967.72,"to":3969.54,"location":2,"content":"I think BERT has raised is that,"},{"from":3969.54,"to":3971.4,"location":2,"content":"um, we need harder tasks, right?"},{"from":3971.4,"to":3973.56,"location":2,"content":"BERT has almost solved SQuAD,"},{"from":3973.56,"to":3975.06,"location":2,"content":"if you define it by, uh,"},{"from":3975.06,"to":3976.86,"location":2,"content":"getting close to human performance."},{"from":3976.86,"to":3979.23,"location":2,"content":"Um, so there's been, um,"},{"from":3979.23,"to":3982.64,"location":2,"content":"a growth in new datasets that are, uh,"},{"from":3982.64,"to":3985.02,"location":2,"content":"more challenging and there are a couple of ways in which,"},{"from":3985.02,"to":3986.37,"location":2,"content":"um, they can be more challenging."},{"from":3986.37,"to":3988.14,"location":2,"content":"So one is, um,"},{"from":3988.14,"to":3990.24,"location":2,"content":"doing reading comprehension on longer documents,"},{"from":3990.24,"to":3992.63,"location":2,"content":"or doing it across more than one document."},{"from":3992.63,"to":3995.28,"location":2,"content":"Um, one area is looking at, uh,"},{"from":3995.28,"to":3998.85,"location":2,"content":"coming up with harder questions that require multi-hop reasoning."},{"from":3998.85,"to":4001.55,"location":2,"content":"Um, so that essentially means you have to 
string"},{"from":4001.55,"to":4005.18,"location":2,"content":"together multiple supporting facts from different places,"},{"from":4005.18,"to":4007.67,"location":2,"content":"um, to produce the correct answer."},{"from":4007.67,"to":4009.35,"location":2,"content":"Um, and another area,"},{"from":4009.35,"to":4011.87,"location":2,"content":"situating question-answering within a dialogue."},{"from":4011.87,"to":4014.33,"location":2,"content":"Um, there's also been a, kind of,"},{"from":4014.33,"to":4018.26,"location":2,"content":"small detail with the construction of reading comprehension datasets,"},{"from":4018.26,"to":4020.6,"location":2,"content":"that has actually really affected,"},{"from":4020.6,"to":4022.84,"location":2,"content":"um, the, the difficulty of the task."},{"from":4022.84,"to":4024.11,"location":2,"content":"And that is whether, um,"},{"from":4024.11,"to":4026.49,"location":2,"content":"when you create these datasets, um,"},{"from":4026.49,"to":4029.42,"location":2,"content":"is the person who writes questions about a passage,"},{"from":4029.42,"to":4031.53,"location":2,"content":"can they see that passage or not?"},{"from":4031.53,"to":4034.07,"location":2,"content":"Um, so of course, it's much easier to come up"},{"from":4034.07,"to":4036.11,"location":2,"content":"with a question when you see the passage,"},{"from":4036.11,"to":4038.87,"location":2,"content":"and if you come up with a question without seeing the passage,"},{"from":4038.87,"to":4041.81,"location":2,"content":"you may not even have an answerable question."},{"from":4041.81,"to":4043.73,"location":2,"content":"Um, but the problem with looking at"},{"from":4043.73,"to":4046.46,"location":2,"content":"the passage is that first of all it's not realistic, right?"},{"from":4046.46,"to":4048.84,"location":2,"content":"So, uh, if I'm asking a question, you know,"},{"from":4048.84,"to":4050.59,"location":2,"content":"I'm not going to have usually"},{"from":4050.59,"to":4053.87,"location":2,"content":"the paragraph that answers that question sitting in front of me."},{"from":4053.87,"to":4055.67,"location":2,"content":"Um, on top of that,"},{"from":4055.67,"to":4057.56,"location":2,"content":"it really encourages easy questions, right?"},{"from":4057.56,"to":4059.84,"location":2,"content":"So, um, if you're a Mechanical Turker,"},{"from":4059.84,"to":4062.87,"location":2,"content":"and you're paid to write as many questions as possible,"},{"from":4062.87,"to":4064.79,"location":2,"content":"and then you see an article that says,"},{"from":4064.79,"to":4066.35,"location":2,"content":"um, I don't know, you know,"},{"from":4066.35,"to":4070.04,"location":2,"content":"uh, Abraham Lincoln was the 16th president of the United States,"},{"from":4070.04,"to":4071.6,"location":2,"content":"um, what are you gonna write?"},{"from":4071.6,"to":4073.1,"location":2,"content":"As your question, you're gonna write,"},{"from":4073.1,"to":4075.36,"location":2,"content":"who was the 16th president of the United States."},{"from":4075.36,"to":4078.03,"location":2,"content":"You're not gonna write something more interesting that's harder to answer."},{"from":4078.03,"to":4081.89,"location":2,"content":"Um, so- so this is one way in which crowdsourced datasets have changed, um,"},{"from":4081.89,"to":4084.17,"location":2,"content":"people are now making sure questions are,"},{"from":4084.17,"to":4087.41,"location":2,"content":"sort of, independent of the context."},{"from":4087.41,"to":4089.38,"location":2,"content":"Um, so 
I'm gonna briefly, uh,"},{"from":4089.38,"to":4091.61,"location":2,"content":"go over a couple of new datasets in this line."},{"from":4091.61,"to":4095.15,"location":2,"content":"So one is called QuAC, which stands for Question Answering in Context."},{"from":4095.15,"to":4096.81,"location":2,"content":"Um, in this dataset,"},{"from":4096.81,"to":4098.69,"location":2,"content":"there is a teacher and a student,"},{"from":4098.69,"to":4101.39,"location":2,"content":"um, the teacher sees a Wikipedia article."},{"from":4101.39,"to":4104.19,"location":2,"content":"The student wants to learn about this Wikipedia article,"},{"from":4104.19,"to":4108.01,"location":2,"content":"and the goal is to train a machine learning model that acts as the teacher."},{"from":4108.01,"to":4110,"location":2,"content":"Um, so you can imagine maybe in the future, this,"},{"from":4110,"to":4112.19,"location":2,"content":"sort of, technology would be useful for,"},{"from":4112.19,"to":4114.32,"location":2,"content":"uh, um, education, for, kind of,"},{"from":4114.32,"to":4117.03,"location":2,"content":"adding some automation."},{"from":4117.03,"to":4122.49,"location":2,"content":"Um, uh, one thing that makes this task difficult is that,"},{"from":4122.49,"to":4126.55,"location":2,"content":"uh, questions depend on the entire history of the conversation."},{"from":4126.55,"to":4128.23,"location":2,"content":"Um, so for example, uh,"},{"from":4128.23,"to":4130.79,"location":2,"content":"if you look, um, on the left here, uh,"},{"from":4130.79,"to":4134.81,"location":2,"content":"the example, um, dialogue,"},{"from":4134.81,"to":4137.31,"location":2,"content":"um, the third question is was he the star?"},{"from":4137.31,"to":4142.07,"location":2,"content":"Um, clearly you can't answer that question unless you look back earlier in the dialogue,"},{"from":4142.07,"to":4144.1,"location":2,"content":"and realize that the subject of this,"},{"from":4144.1,"to":4146.18,"location":2,"content":"uh, conversation is Daffy Duck."},{"from":4146.18,"to":4149.06,"location":2,"content":"Um, and, sort of,"},{"from":4149.06,"to":4151.04,"location":2,"content":"because this dataset is more challenging,"},{"from":4151.04,"to":4154.34,"location":2,"content":"and you can see there's a, there's a much bigger gap to human performance, right?"},{"from":4154.34,"to":4157.61,"location":2,"content":"So if you train some BERT with some extensions, uh,"},{"from":4157.61,"to":4162.19,"location":2,"content":"the results are still like 15 F1 points worse than human performance."},{"from":4162.19,"to":4168.94,"location":2,"content":"Um, um, here's one other dataset, um, called HotPotQA."},{"from":4168.94,"to":4170.51,"location":2,"content":"Um, it is, uh,"},{"from":4170.51,"to":4172.76,"location":2,"content":"designed instead for multi-hop reasoning."},{"from":4172.76,"to":4175.61,"location":2,"content":"Um, so essentially, in order to answer a question,"},{"from":4175.61,"to":4177.88,"location":2,"content":"you have to look at multiple documents,"},{"from":4177.88,"to":4180.35,"location":2,"content":"you have to look at different facts from those documents,"},{"from":4180.35,"to":4181.93,"location":2,"content":"and perform some inference,"},{"from":4181.93,"to":4184.65,"location":2,"content":"um, to get what the correct answer is."},{"from":4184.65,"to":4188.65,"location":2,"content":"Um, so I think, you know, this is a much harder task."},{"from":4188.65,"to":4194.59,"location":2,"content":"And again, um, there's a much 
bigger gap to human performance."},{"from":4194.59,"to":4197.39,"location":2,"content":"Um, any questions on, uh,"},{"from":4197.39,"to":4201.9,"location":2,"content":"new datasets, um, harder tasks for NLP?"},{"from":4201.9,"to":4207.03,"location":2,"content":"Okay. Um, I'm gonna,"},{"from":4207.03,"to":4209.36,"location":2,"content":"kind of, rapid fire and go through, um,"},{"from":4209.36,"to":4212.21,"location":2,"content":"a couple of more areas in the last minutes of this talk."},{"from":4212.21,"to":4216.34,"location":2,"content":"Um, so multitask learning I think is really growing in importance."},{"from":4216.34,"to":4218.39,"location":2,"content":"Um, of course, um,"},{"from":4218.39,"to":4220.19,"location":2,"content":"you've had a whole lecture on this, right?"},{"from":4220.19,"to":4221.75,"location":2,"content":"So I'm not gonna spend too much time on it."},{"from":4221.75,"to":4224.33,"location":2,"content":"Um, but maybe one, uh,"},{"from":4224.33,"to":4228.92,"location":2,"content":"point of interest is that if you look at performance on this GLUE benchmark,"},{"from":4228.92,"to":4231.32,"location":2,"content":"so this benchmark for natural language understanding,"},{"from":4231.32,"to":4234.92,"location":2,"content":"um, all the top couple results, um,"},{"from":4234.92,"to":4237.98,"location":2,"content":"that are now actually surpassing BERT in"},{"from":4237.98,"to":4242.39,"location":2,"content":"performance are taking BERT and training it in a multi-task way."},{"from":4242.39,"to":4247.37,"location":2,"content":"Um, I think another interesting, uh,"},{"from":4247.37,"to":4252.02,"location":2,"content":"motivation for multi-task learning is that if you are training BERT, you have a really,"},{"from":4252.02,"to":4254.48,"location":2,"content":"really large model and one way to make"},{"from":4254.48,"to":4260.95,"location":2,"content":"more efficient use of that model is training it to do many things at once."},{"from":4260.95,"to":4264.92,"location":2,"content":"Another area that's definitely important, um,"},{"from":4264.92,"to":4269.09,"location":2,"content":"and I think will be important going into the future is dealing with low-resource settings."},{"from":4269.09,"to":4270.89,"location":2,"content":"Um, and here I'm using a really broad,"},{"from":4270.89,"to":4273.02,"location":2,"content":"uh, definition of resources, right."},{"from":4273.02,"to":4275.44,"location":2,"content":"So that could mean compute power, um, you know,"},{"from":4275.44,"to":4278.99,"location":2,"content":"BERT is great but it also takes huge amounts of compute to run it."},{"from":4278.99,"to":4280.31,"location":2,"content":"So it's not realistic to say,"},{"from":4280.31,"to":4282.55,"location":2,"content":"um, if you're building, let's say a mobile, uh,"},{"from":4282.55,"to":4287.51,"location":2,"content":"an app for a mobile device that you could run a model the size of BERT."},{"from":4287.51,"to":4291.85,"location":2,"content":"Um, as I already went into earlier in this talk, um, you know,"},{"from":4291.85,"to":4296.23,"location":2,"content":"low-resource languages is an area that I think is pretty, um,"},{"from":4296.23,"to":4299.12,"location":2,"content":"under-represented in NLP research right now,"},{"from":4299.12,"to":4301.46,"location":2,"content":"because most datasets are in English, um,"},{"from":4301.46,"to":4302.57,"location":2,"content":"but I do think, right,"},{"from":4302.57,"to":4304.13,"location":2,"content":"there's a really, you 
know,"},{"from":4304.13,"to":4309.24,"location":2,"content":"large number of people that in order to benefit from NLP technology, um,"},{"from":4309.24,"to":4312.2,"location":2,"content":"we'll need to have technologies that work well in a lot of"},{"from":4312.2,"to":4316.06,"location":2,"content":"different languages especially those without much training data."},{"from":4316.06,"to":4320.87,"location":2,"content":"And, um, speaking of low- low amounts of training data, I think in general this is,"},{"from":4320.87,"to":4324.06,"location":2,"content":"uh, a- an interesting area of research,"},{"from":4324.06,"to":4325.55,"location":2,"content":"um, within machine learning."},{"from":4325.55,"to":4327.31,"location":2,"content":"Actually, people are, um,"},{"from":4327.31,"to":4329.31,"location":2,"content":"working a lot on this as well."},{"from":4329.31,"to":4331.46,"location":2,"content":"Um, so a term is often, uh,"},{"from":4331.46,"to":4334.02,"location":2,"content":"a term often used is few shot learning."},{"from":4334.02,"to":4336.41,"location":2,"content":"Um, and that essentially means being able to"},{"from":4336.41,"to":4338.72,"location":2,"content":"train a machine learning model that only sees,"},{"from":4338.72,"to":4340.73,"location":2,"content":"let's say five or ten examples."},{"from":4340.73,"to":4343.37,"location":2,"content":"Um, one motivation there is, um,"},{"from":4343.37,"to":4349.44,"location":2,"content":"I think a clear distinction between how our existing machine learning systems learn,"},{"from":4349.44,"to":4351.88,"location":2,"content":"and how humans learn is that, um,"},{"from":4351.88,"to":4355.55,"location":2,"content":"humans can generalize very quickly from five or so examples."},{"from":4355.55,"to":4357.19,"location":2,"content":"Um, if you're training a neural net,"},{"from":4357.19,"to":4358.58,"location":2,"content":"you normally need, you know,"},{"from":4358.58,"to":4361.61,"location":2,"content":"thousands of examples or perhaps even tens of thousands,"},{"from":4361.61,"to":4365.06,"location":2,"content":"hundreds of thousands of examples to get something that works."},{"from":4365.06,"to":4369.65,"location":2,"content":"Um, so I also see this being a pretty important area in the future."},{"from":4369.65,"to":4373.73,"location":2,"content":"Um, the last area where I want to go in, um,"},{"from":4373.73,"to":4377.6,"location":2,"content":"a little bit more depth is interpreting and understanding models."},{"from":4377.6,"to":4380.57,"location":2,"content":"Um, so, so really there's two aspects of this."},{"from":4380.57,"to":4384.1,"location":2,"content":"One is if I have a machine learning model and it makes a prediction,"},{"from":4384.1,"to":4386.45,"location":2,"content":"I would like to be able to, uh,"},{"from":4386.45,"to":4388.79,"location":2,"content":"know why did it make that prediction?"},{"from":4388.79,"to":4391.39,"location":2,"content":"So gets some rationale, get some explanation,"},{"from":4391.39,"to":4395.18,"location":2,"content":"um, that would especially be important in an area like health care, right?"},{"from":4395.18,"to":4397.91,"location":2,"content":"So if you're a doctor and you're making a decision, um,"},{"from":4397.91,"to":4401.09,"location":2,"content":"it's probably not good enough for your machine learning model to say,"},{"from":4401.09,"to":4402.47,"location":2,"content":"\"Patient has disease X.\""},{"from":4402.47,"to":4403.81,"location":2,"content":"You really want it to 
say,"},{"from":4403.81,"to":4406.07,"location":2,"content":"\"Patient has disease X for these reasons.\""},{"from":4406.07,"to":4408.59,"location":2,"content":"Um, because then you as a doctor can double-check,"},{"from":4408.59,"to":4410.54,"location":2,"content":"and, and try to validate the, the,"},{"from":4410.54,"to":4413.16,"location":2,"content":"uh, machine's, um, thinking I guess,"},{"from":4413.16,"to":4415.61,"location":2,"content":"um, to come up with that diagnosis."},{"from":4415.61,"to":4418.64,"location":2,"content":"Um, the other area of interpreting and"},{"from":4418.64,"to":4421.37,"location":2,"content":"understanding models is more of a scientific question, right?"},{"from":4421.37,"to":4423.86,"location":2,"content":"We know things like BERT work really well,"},{"from":4423.86,"to":4425.96,"location":2,"content":"um, we want to know why do they work well?"},{"from":4425.96,"to":4428.19,"location":2,"content":"What aspects of language do they model?"},{"from":4428.19,"to":4429.99,"location":2,"content":"Um, what things don't they model?"},{"from":4429.99,"to":4432.02,"location":2,"content":"Um, and that might lead to, um,"},{"from":4432.02,"to":4435.69,"location":2,"content":"ideas of improving, um, those- those models."},{"from":4435.69,"to":4439.58,"location":2,"content":"Um, so, um, here are a, uh,"},{"from":4439.58,"to":4444.94,"location":2,"content":"couple of slides on the main approach for answering these sorts of scientific questions."},{"from":4444.94,"to":4446.98,"location":2,"content":"What does a machine-learning model learn?"},{"from":4446.98,"to":4450.53,"location":2,"content":"Um, what you do is you have a model so let's say it's BERT."},{"from":4450.53,"to":4453.44,"location":2,"content":"It takes as input a sequence of words, um,"},{"from":4453.44,"to":4456.47,"location":2,"content":"it produces as output a sequence of vectors, um,"},{"from":4456.47,"to":4458.57,"location":2,"content":"we want to ask does it know for example,"},{"from":4458.57,"to":4459.68,"location":2,"content":"the part of speech of words?"},{"from":4459.68,"to":4462.45,"location":2,"content":"So, so in its vector representations,"},{"from":4462.45,"to":4464.63,"location":2,"content":"does that capture something about syntax?"},{"from":4464.63,"to":4469.85,"location":2,"content":"Um, and a simple way of asking this question is to train another classifier on top of BERT,"},{"from":4469.85,"to":4471.97,"location":2,"content":"uh, that's trained to do,"},{"from":4471.97,"to":4474.4,"location":2,"content":"um, let's say part-of-speech tagging."},{"from":4474.4,"to":4476.82,"location":2,"content":"Um, but we only, um,"},{"from":4476.82,"to":4479.94,"location":2,"content":"backprop into that diagnostic classifier itself."},{"from":4479.94,"to":4483.68,"location":2,"content":"So in other words we're treating the output of BERT, um,"},{"from":4483.68,"to":4486.19,"location":2,"content":"that sequence of vectors as a fixed input,"},{"from":4486.19,"to":4488.6,"location":2,"content":"and we're sort of probing those vectors to see,"},{"from":4488.6,"to":4490.51,"location":2,"content":"um, do they contain, um,"},{"from":4490.51,"to":4492.44,"location":2,"content":"information about a part of speech that"},{"from":4492.44,"to":4496.44,"location":2,"content":"this second diagnostic classifier on top can decode,"},{"from":4496.44,"to":4499.12,"location":2,"content":"um, to get the correct labels?"},{"from":4499.12,"to":4503.69,"location":2,"content":"Um, so, um, there are quite 
a few concerns here."},{"from":4503.69,"to":4506.54,"location":2,"content":"Um, one concern is, uh,"},{"from":4506.54,"to":4509.91,"location":2,"content":"if you make your diagnostic classifier too complicated,"},{"from":4509.91,"to":4513.2,"location":2,"content":"it can just solve the task all by itself,"},{"from":4513.2,"to":4515.21,"location":2,"content":"and it can basically ignore, uh,"},{"from":4515.21,"to":4517.56,"location":2,"content":"whatever representations were produced by BERT."},{"from":4517.56,"to":4520.04,"location":2,"content":"Um, so- so the kind of standard thing right now is to use"},{"from":4520.04,"to":4523.2,"location":2,"content":"a single softmax layer on top of BERT,"},{"from":4523.2,"to":4525.19,"location":2,"content":"um, to make these decisions."},{"from":4525.19,"to":4529.1,"location":2,"content":"Um, and there's been a whole bunch of tasks proposed for"},{"from":4529.1,"to":4532.9,"location":2,"content":"evaluating essentially the linguistic knowledge of these models."},{"from":4532.9,"to":4534.78,"location":2,"content":"Um, so you could do part-of-speech tagging,"},{"from":4534.78,"to":4537.08,"location":2,"content":"you could do more semantic tasks like,"},{"from":4537.08,"to":4539.28,"location":2,"content":"uh, relation extraction, um,"},{"from":4539.28,"to":4541.27,"location":2,"content":"or- or something like co-reference."},{"from":4541.27,"to":4544.28,"location":2,"content":"Um, and this is a pretty active area of work."},{"from":4544.28,"to":4547.06,"location":2,"content":"Um, here is, uh, just one, uh,"},{"from":4547.06,"to":4551.19,"location":2,"content":"plot showing some of the results, um, of this approach."},{"from":4551.19,"to":4553.86,"location":2,"content":"So here what we're doing is we're adding"},{"from":4553.86,"to":4556.95,"location":2,"content":"diagnostic classifiers to different layers of BERT,"},{"from":4556.95,"to":4562.62,"location":2,"content":"and we are seeing which layers of BERT are more useful for particular tasks."},{"from":4562.62,"to":4567.02,"location":2,"content":"Um, and, um, something kind of interesting comes out of this which is that, um,"},{"from":4567.02,"to":4570.31,"location":2,"content":"the different layers of BERT seem to be corresponding, um,"},{"from":4570.31,"to":4572.89,"location":2,"content":"fairly well with notions of,"},{"from":4572.89,"to":4575.4,"location":2,"content":"uh, different layers of linguistics."},{"from":4575.4,"to":4579.11,"location":2,"content":"Um, so, uh, dependency parsing which is a syntactic task,"},{"from":4579.11,"to":4580.94,"location":2,"content":"um, it's, uh, considered sort of a, you know,"},{"from":4580.94,"to":4583.43,"location":2,"content":"medium level task in understanding a sentence."},{"from":4583.43,"to":4588.13,"location":2,"content":"Um, the medium layers of BERT, so layers kind of 6 through 8 or something,"},{"from":4588.13,"to":4590.48,"location":2,"content":"are the ones best at dependency parsing."},{"from":4590.48,"to":4594.1,"location":2,"content":"Um, if you have a very semantic task like sentiment analysis,"},{"from":4594.1,"to":4595.88,"location":2,"content":"um, where you're trying to learn some kind of, uh,"},{"from":4595.88,"to":4598.32,"location":2,"content":"semantic property of the whole sentence, um,"},{"from":4598.32,"to":4601.49,"location":2,"content":"then the very last layers of BERT are the ones that seem"},{"from":4601.49,"to":4606.31,"location":2,"content":"to encode the most information about this phenomenon."}
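Here is a minimal sketch of that diagnostic-classifier ("probing") setup: the encoder's output vectors are treated as fixed features and only a single softmax layer is trained on top. The vectors and tags below are random stand-ins for real BERT outputs and part-of-speech labels, since the point is just the shape of the recipe.

```python
# Probing sketch: no gradient ever reaches the encoder; we only fit one
# softmax layer on its (frozen) output vectors. Arrays are stand-ins.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
frozen_vectors = rng.normal(size=(1000, 768))  # one 768-d vector per token
pos_tags = rng.integers(0, 17, size=1000)      # one part-of-speech id per token

# Multinomial logistic regression is exactly a single softmax layer.
probe = LogisticRegression(max_iter=1000)
probe.fit(frozen_vectors, pos_tags)
print("probe accuracy:", probe.score(frozen_vectors, pos_tags))
```

Keeping the probe this simple is deliberate: if even a lone softmax layer can decode part of speech from the vectors, the information must already be present in the representation rather than computed by the probe.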
,{"from":4606.31,"to":4608.69,"location":2,"content":"Um, okay."},{"from":4608.69,"to":4610.84,"location":2,"content":"So this is almost it for the talk, um,"},{"from":4610.84,"to":4614.6,"location":2,"content":"I just have one slide here of, uh, um,"},{"from":4614.6,"to":4617.87,"location":2,"content":"NLP not in kind of the academic research context,"},{"from":4617.87,"to":4620.73,"location":2,"content":"which I have already been talking a lot about but NLP in industry,"},{"from":4620.73,"to":4623.07,"location":2,"content":"and really there's rapid progress there."},{"from":4623.07,"to":4626.31,"location":2,"content":"And I wanted to point you to two areas where I think there's"},{"from":4626.31,"to":4630.65,"location":2,"content":"especially a large interest in using NLP technology."},{"from":4630.65,"to":4632.24,"location":2,"content":"Um, one is dialogue,"},{"from":4632.24,"to":4634.01,"location":2,"content":"um, so for things like chatbots, right?"},{"from":4634.01,"to":4637.58,"location":2,"content":"There's the Alexa Prize where they're actually investing a lot of money in,"},{"from":4637.58,"to":4641.1,"location":2,"content":"um, having groups figure out how to improve chitchat dialogue."},{"from":4641.1,"to":4645.23,"location":2,"content":"Um, there's also I think a lot of potential for customer service, right?"},{"from":4645.23,"to":4648.17,"location":2,"content":"So improving basically automated systems that'll, um,"},{"from":4648.17,"to":4649.58,"location":2,"content":"you know, book you a flight,"},{"from":4649.58,"to":4652.39,"location":2,"content":"or help you cancel a subscription, or anything like that."},{"from":4652.39,"to":4655.46,"location":2,"content":"Um, and similarly, there's a lot of potential in health care."},{"from":4655.46,"to":4659.18,"location":2,"content":"Um, one is understanding the records of someone who,"},{"from":4659.18,"to":4662.06,"location":2,"content":"um, is sick and to help with diagnoses."},{"from":4662.06,"to":4663.94,"location":2,"content":"Um, I think another, um,"},{"from":4663.94,"to":4666.22,"location":2,"content":"equally important area is actually, uh,"},{"from":4666.22,"to":4669.02,"location":2,"content":"parsing, uh, biomedical papers."},{"from":4669.02,"to":4674.28,"location":2,"content":"Um, so, um, the number of biomedical papers that are being written is really insane,"},{"from":4674.28,"to":4676.1,"location":2,"content":"um, it's, it's way larger than the number"},{"from":4676.1,"to":4677.96,"location":2,"content":"of computer science papers that are being written."},{"from":4677.96,"to":4681.53,"location":2,"content":"[NOISE] Um, often if you're a doctor,"},{"from":4681.53,"to":4683.15,"location":2,"content":"or if you're a researcher, um,"},{"from":4683.15,"to":4686.36,"location":2,"content":"in medicine, you might want to look up something very specific, right?"},{"from":4686.36,"to":4687.62,"location":2,"content":"You might want to know what is"},{"from":4687.62,"to":4691.37,"location":2,"content":"the effect of this particular drug on this particular gene,"},{"from":4691.37,"to":4693.14,"location":2,"content":"or a cell with this particular gene."},{"from":4693.14,"to":4696.71,"location":2,"content":"Um, there's no good way right now of searching through, um,"},{"from":4696.71,"to":4700.18,"location":2,"content":"hundreds of thousands of papers to find if someone has, uh,"},{"from":4700.18,"to":4703.09,"location":2,"content":"done this experiment and has results for 
this,"},{"from":4703.09,"to":4705.1,"location":2,"content":"um, particular combination of things."},{"from":4705.1,"to":4708.59,"location":2,"content":"Um, so automated reading of all this biomedical literature,"},{"from":4708.59,"to":4711.1,"location":2,"content":"um, could have a lot of value."},{"from":4711.1,"to":4713.96,"location":2,"content":"Okay, um, to conclude, um,"},{"from":4713.96,"to":4718.28,"location":2,"content":"there's been rapid progress in the last five years due to deep learning, um, in NLP."},{"from":4718.28,"to":4722.78,"location":2,"content":"Um, in the last year, we've seen another really kind of, uh,"},{"from":4722.78,"to":4725.3,"location":2,"content":"a dramatic increase in the capability of our systems,"},{"from":4725.3,"to":4727.61,"location":2,"content":"thanks to, uh, using unlabeled data."},{"from":4727.61,"to":4729.1,"location":2,"content":"So that's methods like BERT."},{"from":4729.1,"to":4734.21,"location":2,"content":"Um, and, um, the other kind of thing that's I think important to think about is that,"},{"from":4734.21,"to":4738.17,"location":2,"content":"NLP systems are starting to be at a place where they can have big social impact."},{"from":4738.17,"to":4744.85,"location":2,"content":"Um, so that makes some issues like bias and security very important. Um, thank you."},{"from":4744.85,"to":4746.69,"location":2,"content":"Uh, good luck finishing all your projects."},{"from":4746.69,"to":4754.8,"location":2,"content":"[APPLAUSE]."}]} \ No newline at end of file diff --git a/bcc-en/3.bcc b/bcc-en/3.bcc new file mode 100644 index 0000000000000000000000000000000000000000..3bd5242269715ff8e06326d561e79891f7c4f181 --- /dev/null +++ b/bcc-en/3.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.57,"to":10.11,"location":2,"content":"okay hi everyone okay let's get started"},{"from":10.11,"to":12.88,"location":2,"content":"great to see you all here welcome back"},{"from":12.88,"to":20.05,"location":2,"content":"for week 2 our CS 224 in so so this is a"},{"from":20.05,"to":22.54,"location":2,"content":"little preview of what's coming up in"},{"from":22.54,"to":25.71,"location":2,"content":"the class for this week and next week"},{"from":25.71,"to":28.36,"location":2,"content":"you know this week is perhaps the worst"},{"from":28.36,"to":31.75,"location":2,"content":"week of this class so in week two of the"},{"from":31.75,"to":36.73,"location":2,"content":"class our hope is to actually kind of go"},{"from":36.73,"to":39.07,"location":2,"content":"through some of the nitty-gritty of"},{"from":39.07,"to":42.7,"location":2,"content":"neural networks and how they're trained"},{"from":42.7,"to":46.42,"location":2,"content":"and how we can learn good neural"},{"from":46.42,"to":49.3,"location":2,"content":"networks by backpropagation which means"},{"from":49.3,"to":51.55,"location":2,"content":"in particular we're going to be sort of"},{"from":51.55,"to":53.32,"location":2,"content":"talking about the training algorithms"},{"from":53.32,"to":56.59,"location":2,"content":"and doing calculus to work out gradients"},{"from":56.59,"to":60.37,"location":2,"content":"for improving them so we'll look a bit a"},{"from":60.37,"to":65.32,"location":2,"content":"little bit word window classification"},{"from":65.32,"to":67.51,"location":2,"content":"named entity recognition so there's a"},{"from":67.51,"to":69.79,"location":2,"content":"teeny bit of natural language 
processing"},{"from":69.79,"to":73.57,"location":2,"content":"in there but basically sort of week two"},{"from":73.57,"to":78.07,"location":2,"content":"is sort of math of deep learning and"},{"from":78.07,"to":80.62,"location":2,"content":"neural network models and sort of really"},{"from":80.62,"to":83.74,"location":2,"content":"knew all Network fundamentals but the"},{"from":83.74,"to":86.17,"location":2,"content":"hope is that that will give you kind of"},{"from":86.17,"to":88.69,"location":2,"content":"good understanding of how these things"},{"from":88.69,"to":90.79,"location":2,"content":"really work and will give you all the"},{"from":90.79,"to":93.37,"location":2,"content":"information you need to do the coming-up"},{"from":93.37,"to":97.15,"location":2,"content":"homework and so then in week 3 we kind"},{"from":97.15,"to":100.48,"location":2,"content":"of flip so then week three is going to"},{"from":100.48,"to":102.73,"location":2,"content":"be mainly about natural language"},{"from":102.73,"to":104.53,"location":2,"content":"processing so we're then going to talk"},{"from":104.53,"to":106.72,"location":2,"content":"about how to port syntactic structures"},{"from":106.72,"to":109.57,"location":2,"content":"over sentences for building dependency"},{"from":109.57,"to":111.7,"location":2,"content":"parsers of sentences which is then"},{"from":111.7,"to":114.4,"location":2,"content":"actually what's used in homework 3 so"},{"from":114.4,"to":116.38,"location":2,"content":"we're chugging along rapidly and then"},{"from":116.38,"to":118.36,"location":2,"content":"we'll talk about this idea of the"},{"from":118.36,"to":120.31,"location":2,"content":"probability of a sentence which leads"},{"from":120.31,"to":124.78,"location":2,"content":"into neural language models so on the"},{"from":124.78,"to":127.32,"location":2,"content":"homeworks homework 1 was due"},{"from":127.32,"to":130.36,"location":2,"content":"approximately two minutes ago so I hope"},{"from":130.36,"to":132.97,"location":2,"content":"everyone has submitted their homework"},{"from":132.97,"to":138.73,"location":2,"content":"one I mean as one just sort of add"},{"from":138.73,"to":141.37,"location":2,"content":"in general I thought you know homework 1"},{"from":141.37,"to":144.01,"location":2,"content":"we hope you found was a good warm-up and"},{"from":144.01,"to":146.56,"location":2,"content":"not too too hard and so it really be"},{"from":146.56,"to":149.26,"location":2,"content":"best to get homework 1 in quickly rather"},{"from":149.26,"to":151.3,"location":2,"content":"than to burn lots of your late days"},{"from":151.3,"to":154.33,"location":2,"content":"doing homework 1 and now right now out"},{"from":154.33,"to":157.75,"location":2,"content":"on the website there's homework too so"},{"from":157.75,"to":159.82,"location":2,"content":"we're chugging along"},{"from":159.82,"to":162.49,"location":2,"content":"so homework 2 kind of corresponds to"},{"from":162.49,"to":164.65,"location":2,"content":"this week's lectures so on the first"},{"from":164.65,"to":165.49,"location":2,"content":"part of it"},{"from":165.49,"to":168.19,"location":2,"content":"we are expecting you to grind through"},{"from":168.19,"to":170.17,"location":2,"content":"some math problems of working out"},{"from":170.17,"to":173.56,"location":2,"content":"gradient derivations and then the second"},{"from":173.56,"to":176.23,"location":2,"content":"part of it is then implementing your own"},{"from":176.23,"to":178.39,"location":2,"content":"version 
of word2vec making use of"},{"from":178.39,"to":181.39,"location":2,"content":"numpy and so this time sort of writing a"},{"from":181.39,"to":183.25,"location":2,"content":"Python program it's no longer an"},{"from":183.25,"to":186.34,"location":2,"content":"IPython notebook encourage you to get"},{"from":186.34,"to":191.83,"location":2,"content":"an early look at the materials on the web I"},{"from":191.83,"to":193.66,"location":2,"content":"mean in particular corresponding to"},{"from":193.66,"to":196.39,"location":2,"content":"today's lecture there's some quite good"},{"from":196.39,"to":199.03,"location":2,"content":"tutorial materials that are available on"},{"from":199.03,"to":201.13,"location":2,"content":"the website and so also encourage you to"},{"from":201.13,"to":204.94,"location":2,"content":"look at those more generally just to"},{"from":204.94,"to":207.85,"location":2,"content":"make a couple more comments on things I"},{"from":207.85,"to":211,"location":2,"content":"mean I guess this is true of a lot of"},{"from":211,"to":213.94,"location":2,"content":"classes at Stanford but you know when we"},{"from":213.94,"to":216.76,"location":2,"content":"get the course reviews for this class we"},{"from":216.76,"to":219.19,"location":2,"content":"always get the full spectrum from people"},{"from":219.19,"to":221.35,"location":2,"content":"who say the class is terrible and it's"},{"from":221.35,"to":223.96,"location":2,"content":"way too much work to people who say it's"},{"from":223.96,"to":225.37,"location":2,"content":"a really great class one of their"},{"from":225.37,"to":226.87,"location":2,"content":"favorite classes at Stanford"},{"from":226.87,"to":229.78,"location":2,"content":"obviously instructors care etc and I"},{"from":229.78,"to":232.09,"location":2,"content":"mean probably this reflects that we get"},{"from":232.09,"to":235.18,"location":2,"content":"this very wide range of people coming to"},{"from":235.18,"to":237.7,"location":2,"content":"take this class or on the one hand on"},{"from":237.7,"to":239.71,"location":2,"content":"the right-hand margin perhaps we have"},{"from":239.71,"to":242.11,"location":2,"content":"the physics PhDs and on the left-hand"},{"from":242.11,"to":244.51,"location":2,"content":"margin we have some frosh who think this"},{"from":244.51,"to":246.28,"location":2,"content":"will be fun to do anyway"},{"from":246.28,"to":249.46,"location":2,"content":"we welcome it we welcome everybody but"},{"from":249.46,"to":251.83,"location":2,"content":"in principle this is a graduate level"},{"from":251.83,"to":254.35,"location":2,"content":"class you know that doesn't mean we want"},{"from":254.35,"to":256.51,"location":2,"content":"to fail people out we'd like everyone to"},{"from":256.51,"to":259.54,"location":2,"content":"succeed but also like graduate level"},{"from":259.54,"to":262.45,"location":2,"content":"class we'd like you here to take some"},{"from":262.45,"to":265.21,"location":2,"content":"initiative in your success meaning if"},{"from":265.21,"to":266.65,"location":2,"content":"there are things that you need to know"},{"from":266.65,"to":268.63,"location":2,"content":"to do the assignments and you don't know"},{"from":268.63,"to":271.54,"location":2,"content":"them then you should be taking some"},{"from":271.54,"to":271.87,"location":2,"content":"initiative"},{"from":271.87,"to":274.36,"location":2,"content":"to find some tutorials come to office"},{"from":274.36,"to":277.21,"location":2,"content":"hours and talk to people and get 
any"},{"from":277.21,"to":280.24,"location":2,"content":"help you need and learn to sort of for"},{"from":280.24,"to":282.28,"location":2,"content":"any holes in your knowledge okay so"},{"from":282.28,"to":285.73,"location":2,"content":"here's the plan for today so that was"},{"from":285.73,"to":287.89,"location":2,"content":"the course information update so you"},{"from":287.89,"to":290.56,"location":2,"content":"know this is sort of in some sense you"},{"from":290.56,"to":292.6,"location":2,"content":"know machine learning neural nets intro"},{"from":292.6,"to":295,"location":2,"content":"just to try and make sure everyone is up"},{"from":295,"to":296.95,"location":2,"content":"to speed on all this stuff so talk a"},{"from":296.95,"to":299.16,"location":2,"content":"little bit about classification"},{"from":299.16,"to":302.62,"location":2,"content":"introduced neural networks little detour"},{"from":302.62,"to":305.11,"location":2,"content":"and named entity recognition in sort of"},{"from":305.11,"to":309.49,"location":2,"content":"show a model of doing window would"},{"from":309.49,"to":312.19,"location":2,"content":"window classification and then at the"},{"from":312.19,"to":315.07,"location":2,"content":"end part we sort of then dive deeper"},{"from":315.07,"to":318.22,"location":2,"content":"into what kind of tools we need to learn"},{"from":318.22,"to":321.58,"location":2,"content":"neural networks and so today we're going"},{"from":321.58,"to":324.94,"location":2,"content":"to go through somewhere between review"},{"from":324.94,"to":328.66,"location":2,"content":"and primer of matrix calculus and then"},{"from":328.66,"to":331.21,"location":2,"content":"that will lead into next times lecture"},{"from":331.21,"to":333.79,"location":2,"content":"where it's talking more about back"},{"from":333.79,"to":337.69,"location":2,"content":"propagation and computation graphs so"},{"from":337.69,"to":341.77,"location":2,"content":"yeah so this material especially the"},{"from":341.77,"to":343.56,"location":2,"content":"part at the end you know for some people"},{"from":343.56,"to":347.05,"location":2,"content":"it'll seem really babyish if it's the"},{"from":347.05,"to":349.51,"location":2,"content":"kind of stuff you do every week for"},{"from":349.51,"to":352.36,"location":2,"content":"other people it might seem impossibly"},{"from":352.36,"to":354.64,"location":2,"content":"difficult but hopefully for a large"},{"from":354.64,"to":356.44,"location":2,"content":"percentage of you in the middle this"},{"from":356.44,"to":359.71,"location":2,"content":"will be kind of a useful review of doing"},{"from":359.71,"to":361.84,"location":2,"content":"this kind of matrix calculus and the"},{"from":361.84,"to":363.4,"location":2,"content":"kind of things that we hope that you can"},{"from":363.4,"to":370.51,"location":2,"content":"do on homework 2 ok so yeah so sorry if"},{"from":370.51,"to":372.43,"location":2,"content":"I'm boring some people if you set"},{"from":372.43,"to":376.72,"location":2,"content":"through 229 last quarter you saw what a"},{"from":376.72,"to":379.39,"location":2,"content":"classifier was like and hopefully this"},{"from":379.39,"to":381.79,"location":2,"content":"will seem familiar but I'm just sort of"},{"from":381.79,"to":384.1,"location":2,"content":"hoping to try and have everyone in week"},{"from":384.1,"to":386.2,"location":2,"content":"2 sort of up to speed and I'm roughly"},{"from":386.2,"to":387.91,"location":2,"content":"the same page so here's 
our"},{"from":387.91,"to":390.25,"location":2,"content":"classification setup so we have assumed"},{"from":390.25,"to":393.28,"location":2,"content":"we have training data set where we have"},{"from":393.28,"to":397.93,"location":2,"content":"these vector X of our X points and then"},{"from":397.93,"to":400.71,"location":2,"content":"for each of one of them we have a class"},{"from":400.71,"to":404.23,"location":2,"content":"so the inputs might be words or"},{"from":404.23,"to":405.67,"location":2,"content":"sentences documents or"},{"from":405.67,"to":407.89,"location":2,"content":"something there a d-dimensional vector"},{"from":407.89,"to":411.67,"location":2,"content":"the weii are the labels or classes that"},{"from":411.67,"to":414.37,"location":2,"content":"we want to classify - and we've got a"},{"from":414.37,"to":416.32,"location":2,"content":"set of c classes that we're trying to"},{"from":416.32,"to":418.93,"location":2,"content":"predict and so those might be something"},{"from":418.93,"to":420.85,"location":2,"content":"like the topic of a document the"},{"from":420.85,"to":423.85,"location":2,"content":"sentiment positive or negative of a"},{"from":423.85,"to":426.04,"location":2,"content":"document or later we'll look a bit more"},{"from":426.04,"to":432.06,"location":2,"content":"named entities okay so if we have that"},{"from":432.06,"to":436.36,"location":2,"content":"for this sort of intuition is we've got"},{"from":436.36,"to":438.46,"location":2,"content":"this vector space of which we again have"},{"from":438.46,"to":441.52,"location":2,"content":"a 2d picture and we have points in that"},{"from":441.52,"to":443.95,"location":2,"content":"vector space which correspond to our X"},{"from":443.95,"to":447.76,"location":2,"content":"items and what we'd want to do is look"},{"from":447.76,"to":450.43,"location":2,"content":"at the ones and our training sample and"},{"from":450.43,"to":452.38,"location":2,"content":"see which ones are green and red for our"},{"from":452.38,"to":454.6,"location":2,"content":"two classes here and then we want to"},{"from":454.6,"to":457.6,"location":2,"content":"sort of learn a line that could divide"},{"from":457.6,"to":460.66,"location":2,"content":"between the green and the red ones as"},{"from":460.66,"to":463.09,"location":2,"content":"best as possible and that learn'd line"},{"from":463.09,"to":467.17,"location":2,"content":"is our classifier so on traditional"},{"from":467.17,"to":469.6,"location":2,"content":"machine learning or statistics we have"},{"from":469.6,"to":471.73,"location":2,"content":"the sort of X I vectors or our data"},{"from":471.73,"to":474.94,"location":2,"content":"items that are purely fixed but we're"},{"from":474.94,"to":480.75,"location":2,"content":"going to then multiply those X I by some"},{"from":480.75,"to":483.46,"location":2,"content":"estimated weight vector and that"},{"from":483.46,"to":485.92,"location":2,"content":"estimated weight vector will then go"},{"from":485.92,"to":488.5,"location":2,"content":"into a classification decision and the"},{"from":488.5,"to":490.75,"location":2,"content":"classifier that I'm showing here is a"},{"from":490.75,"to":493.54,"location":2,"content":"soft max classifier which is almost"},{"from":493.54,"to":495.55,"location":2,"content":"identical but not quite to a logistic"},{"from":495.55,"to":497.68,"location":2,"content":"regression classifier which you should"},{"from":497.68,"to":500.77,"location":2,"content":"have seen in CS 109 or a stats class 
or"},{"from":500.77,"to":502.78,"location":2,"content":"something like that which is giving a"},{"from":502.78,"to":506.95,"location":2,"content":"probability of different classes okay"},{"from":506.95,"to":510.16,"location":2,"content":"and in particular if you've got a soft"},{"from":510.16,"to":513.67,"location":2,"content":"max classifier or a logistic distich"},{"from":513.67,"to":515.8,"location":2,"content":"regression classifier these are what"},{"from":515.8,"to":518.2,"location":2,"content":"accordin linear classifiers so the"},{"from":518.2,"to":520.96,"location":2,"content":"decision boundary between two classes"},{"from":520.96,"to":524.8,"location":2,"content":"here is a line in some suitably high"},{"from":524.8,"to":526.78,"location":2,"content":"dimensional space so it's a plane or a"},{"from":526.78,"to":529.33,"location":2,"content":"hyperplane once you've got a bigger X"},{"from":529.33,"to":533.62,"location":2,"content":"vector okay so here's our soft max"},{"from":533.62,"to":536.53,"location":2,"content":"classifier and there are sort of two"},{"from":536.53,"to":539.47,"location":2,"content":"parts to that so in the"},{"from":539.47,"to":543.63,"location":2,"content":"in the weight matrix W we have a row"},{"from":543.63,"to":547.51,"location":2,"content":"corresponding to each class and then for"},{"from":547.51,"to":550.78,"location":2,"content":"that row we're sort of dot producting it"},{"from":550.78,"to":554.26,"location":2,"content":"with our data point vector X I and"},{"from":554.26,"to":557.14,"location":2,"content":"that's giving us a kind of a score for"},{"from":557.14,"to":559.42,"location":2,"content":"how likely it is that the example"},{"from":559.42,"to":561.94,"location":2,"content":"belongs to that class and then we're"},{"from":561.94,"to":564.16,"location":2,"content":"running that through a softmax function"},{"from":564.16,"to":567.28,"location":2,"content":"and just as we saw in week one this"},{"from":567.28,"to":569.98,"location":2,"content":"softmax takes a bunch of numbers and"},{"from":569.98,"to":571.3,"location":2,"content":"turns them into a probability"},{"from":571.3,"to":574,"location":2,"content":"distribution does that make sense to"},{"from":574,"to":575.74,"location":2,"content":"people people remember that from last"},{"from":575.74,"to":579.04,"location":2,"content":"week good so far okay"},{"from":579.04,"to":581.47,"location":2,"content":"I'm not going to go through this in"},{"from":581.47,"to":587.2,"location":2,"content":"detail but I mean essentially this is"},{"from":587.2,"to":589.89,"location":2,"content":"what a logistic regression does as well"},{"from":589.89,"to":593.95,"location":2,"content":"the difference is that here in this"},{"from":593.95,"to":599.62,"location":2,"content":"setup we have a weight vector for each"},{"from":599.62,"to":603.97,"location":2,"content":"class whereas what the statisticians do"},{"from":603.97,"to":607.09,"location":2,"content":"in logistic regression is they say wait"},{"from":607.09,"to":610.45,"location":2,"content":"that gives us one more number of weight"},{"from":610.45,"to":613,"location":2,"content":"vectors than we really need we can get"},{"from":613,"to":616.24,"location":2,"content":"away but for C classes we can get away"},{"from":616.24,"to":618.91,"location":2,"content":"with C minus one weight vectors so in"},{"from":618.91,"to":619.99,"location":2,"content":"particular if you're doing binary"},{"from":619.99,"to":622.36,"location":2,"content":"logistic 
regression you only need one"},{"from":622.36,"to":624.4,"location":2,"content":"weight vector whereas this softmax"},{"from":624.4,"to":626.44,"location":2,"content":"regression formulation you've actually"},{"from":626.44,"to":628.21,"location":2,"content":"got two weight vectors one for each"},{"from":628.21,"to":630.28,"location":2,"content":"class so there's some little"},{"from":630.28,"to":632.08,"location":2,"content":"difference there which we could get into"},{"from":632.08,"to":634.45,"location":2,"content":"but basically the same let's just say"},{"from":634.45,"to":636.79,"location":2,"content":"we're either doing softmax or logistic"},{"from":636.79,"to":640.33,"location":2,"content":"regression doesn't matter so when we're"},{"from":640.33,"to":644.38,"location":2,"content":"training what we want to do is we want"},{"from":644.38,"to":648.13,"location":2,"content":"to be able to predict the correct class"},{"from":648.13,"to":651.43,"location":2,"content":"and so the way we're going to do that is"},{"from":651.43,"to":653.23,"location":2,"content":"we're going to want to train our model"},{"from":653.23,"to":655.63,"location":2,"content":"so it gives as high a probability as"},{"from":655.63,"to":658.39,"location":2,"content":"possible to the correct class and"},{"from":658.39,"to":661.12,"location":2,"content":"therefore it'll give as low a probability"},{"from":661.12,"to":665.35,"location":2,"content":"as possible to the wrong classes"},{"from":665.35,"to":669.4,"location":2,"content":"and so our criterion for doing that is"},{"from":669.4,"to":672.31,"location":2,"content":"we're going to create this negative log"},{"from":672.31,"to":673.21,"location":2,"content":"probability"},{"from":673.21,"to":676.27,"location":2,"content":"of our assignments and then we're"},{"from":676.27,"to":678.16,"location":2,"content":"going to want to minimize the negative"},{"from":678.16,"to":680.77,"location":2,"content":"log probability which corresponds to"},{"from":680.77,"to":683.38,"location":2,"content":"maximizing the log probability which"},{"from":683.38,"to":685.06,"location":2,"content":"corresponds to maximizing the"},{"from":685.06,"to":693.37,"location":2,"content":"probability and but sort of pretty soon"},{"from":693.37,"to":695.2,"location":2,"content":"now we're going to start doing more"},{"from":695.2,"to":697.54,"location":2,"content":"stuff with deep learning frameworks in"},{"from":697.54,"to":700.42,"location":2,"content":"particular PyTorch and you can discover"},{"from":700.42,"to":702.37,"location":2,"content":"in that that there's actually a thing"},{"from":702.37,"to":704.98,"location":2,"content":"called NLL loss which stands for"},{"from":704.98,"to":707.68,"location":2,"content":"negative log likelihood loss but"},{"from":707.68,"to":709.6,"location":2,"content":"basically no one uses that because the"},{"from":709.6,"to":711.79,"location":2,"content":"more convenient thing to use is what's"},{"from":711.79,"to":714.28,"location":2,"content":"called the cross entropy loss and so"},{"from":714.28,"to":715.99,"location":2,"content":"you'll hear everywhere that we're"},{"from":715.99,"to":718.18,"location":2,"content":"training with cross entropy loss so I"},{"from":718.18,"to":720.37,"location":2,"content":"just wanted to briefly mention that and"},{"from":720.37,"to":723.88,"location":2,"content":"explain what's going on there so the"},{"from":723.88,"to":726.73,"location":2,"content":"concept of cross entropy comes from 
baby"},{"from":726.73,"to":728.86,"location":2,"content":"information theory which is about the"},{"from":728.86,"to":731.38,"location":2,"content":"amount of information theory I know so"},{"from":731.38,"to":733.66,"location":2,"content":"we're assuming that there's some true"},{"from":733.66,"to":737.62,"location":2,"content":"probability distribution P and our model"},{"from":737.62,"to":739.45,"location":2,"content":"we've built some probability"},{"from":739.45,"to":741.73,"location":2,"content":"distribution Q that's what we've built"},{"from":741.73,"to":744.25,"location":2,"content":"with our softmax regression and we want"},{"from":744.25,"to":747.15,"location":2,"content":"to have a measure of whether our"},{"from":747.15,"to":749.74,"location":2,"content":"estimated probability distribution is"},{"from":749.74,"to":751.81,"location":2,"content":"the good one and the way we do it and"},{"from":751.81,"to":754.54,"location":2,"content":"cross entropy is we go through the"},{"from":754.54,"to":756.7,"location":2,"content":"classes and we say what's the"},{"from":756.7,"to":758.62,"location":2,"content":"probability of the class according to"},{"from":758.62,"to":761.89,"location":2,"content":"the true model using that weighting we"},{"from":761.89,"to":764.95,"location":2,"content":"then work out the log of the probability"},{"from":764.95,"to":768.34,"location":2,"content":"according to our estimated model and we"},{"from":768.34,"to":770.89,"location":2,"content":"sum those up and negate it and that is"},{"from":770.89,"to":776.65,"location":2,"content":"our cross entropy measure okay but so"},{"from":776.65,"to":781,"location":2,"content":"this in general gives you a measure of"},{"from":781,"to":785.22,"location":2,"content":"sort of information between"},{"from":785.22,"to":788.73,"location":2,"content":"distributions but in our particular case"},{"from":788.73,"to":791.95,"location":2,"content":"remember that for each example we've"},{"from":791.95,"to":794.05,"location":2,"content":"sort of assuming that this is a piece of"},{"from":794.05,"to":796.69,"location":2,"content":"label training data so we're saying for"},{"from":796.69,"to":799.48,"location":2,"content":"that example the right answer is class"},{"from":799.48,"to":802.9,"location":2,"content":"seven so therefore our true distribution"},{"from":802.9,"to":806.92,"location":2,"content":"our P is for this example"},{"from":806.92,"to":810.01,"location":2,"content":"class seven with probability one and its"},{"from":810.01,"to":811.21,"location":2,"content":"class"},{"from":811.21,"to":814.27,"location":2,"content":"anything else with probability zero so"},{"from":814.27,"to":816.58,"location":2,"content":"if you think about then what happens"},{"from":816.58,"to":818.44,"location":2,"content":"with this formula you've got this"},{"from":818.44,"to":821.2,"location":2,"content":"summation over all the classes but P of"},{"from":821.2,"to":823.6,"location":2,"content":"C is gonna be either one or zero and"},{"from":823.6,"to":826.12,"location":2,"content":"it's going to be one only for the true"},{"from":826.12,"to":828.94,"location":2,"content":"class here and so what you're left with"},{"from":828.94,"to":831.61,"location":2,"content":"is this is going to equal minus the log"},{"from":831.61,"to":836.5,"location":2,"content":"of you see for the true class which is"},{"from":836.5,"to":839.29,"location":2,"content":"sort of what we were then computing in"},{"from":839.29,"to":843.81,"location":2,"content":"the 
previous slide okay so that's what"},{"from":843.81,"to":846.37,"location":2,"content":"yeah so that's basically where you get"},{"from":846.37,"to":847.27,"location":2,"content":"with cross-entropy"},{"from":847.27,"to":852.28,"location":2,"content":"loss but one other concept to mention so"},{"from":852.28,"to":854.59,"location":2,"content":"when you have a full data set of a whole"},{"from":854.59,"to":857.53,"location":2,"content":"bunch of examples the cross-entropy loss"},{"from":857.53,"to":860.86,"location":2,"content":"is then taking the per example average"},{"from":860.86,"to":862.15,"location":2,"content":"so I guess that's what information"},{"from":862.15,"to":864.04,"location":2,"content":"theory people sometimes call the cross"},{"from":864.04,"to":866.71,"location":2,"content":"entropy rate so additionally factored in"},{"from":866.71,"to":868.51,"location":2,"content":"there if you're training it on n"},{"from":868.51,"to":871.57,"location":2,"content":"examples is that one on n factor that's"},{"from":871.57,"to":877.84,"location":2,"content":"coming in there okay okay so that's"},{"from":877.84,"to":882.21,"location":2,"content":"cross entropy loss that okay yeah"},{"from":882.21,"to":884.53,"location":2,"content":"there's some mixture of the actual"},{"from":884.53,"to":888.79,"location":2,"content":"labels in the ground truth right so the"},{"from":888.79,"to":892.54,"location":2,"content":"simplest case is that your gold data"},{"from":892.54,"to":895.72,"location":2,"content":"someone has hand-annotated it and they've"},{"from":895.72,"to":899.5,"location":2,"content":"labeled one and the rest as zero there"},{"from":899.5,"to":901.66,"location":2,"content":"you can think of cases where that isn't"},{"from":901.66,"to":904.03,"location":2,"content":"the case I mean one case is you could"},{"from":904.03,"to":905.95,"location":2,"content":"believe that human beings sometimes"},{"from":905.95,"to":908.74,"location":2,"content":"don't know the right answer so if human"},{"from":908.74,"to":910.57,"location":2,"content":"beings say um I'm not sure whether this"},{"from":910.57,"to":913.18,"location":2,"content":"should be class three or four you can"},{"from":913.18,"to":915.64,"location":2,"content":"imagine that we can make training data"},{"from":915.64,"to":917.95,"location":2,"content":"where we put probability 1/2 on both of"},{"from":917.95,"to":920.62,"location":2,"content":"them and that wouldn't be a crazy thing"},{"from":920.62,"to":923.8,"location":2,"content":"to do and so then you have a true cross"},{"from":923.8,"to":925.45,"location":2,"content":"entropy loss using more of a"},{"from":925.45,"to":929.83,"location":2,"content":"distribution the case where it's much"},{"from":929.83,"to":933.58,"location":2,"content":"more commonly used in actual practice is"},{"from":933.58,"to":936.67,"location":2,"content":"there are many circumstances in which"},{"from":936.67,"to":938.47,"location":2,"content":"people want to do semi-supervised"},{"from":938.47,"to":940.42,"location":2,"content":"learning so I guess"},{"from":940.42,"to":942.43,"location":2,"content":"a topic that both my group and Chris"},{"from":942.43,"to":944.38,"location":2,"content":"Ré's group have worked on quite a lot"},{"from":944.38,"to":946.66,"location":2,"content":"where we don't actually have fully"},{"from":946.66,"to":949.6,"location":2,"content":"labeled data but we've got some means of"},{"from":949.6,"to":952.06,"location":2,"content":"guessing what the labels of the data 
are"},{"from":952.06,"to":954.64,"location":2,"content":"and if we try to guess labels of data"},{"from":954.64,"to":957.16,"location":2,"content":"well then quite often we'll say here's"},{"from":957.16,"to":959.92,"location":2,"content":"this data item it's two-thirds chance as"},{"from":959.92,"to":961.45,"location":2,"content":"this label but it could be these other"},{"from":961.45,"to":963.97,"location":2,"content":"four labels and we'd use a probability"},{"from":963.97,"to":966.01,"location":2,"content":"distribution and yeah then it's more"},{"from":966.01,"to":972.03,"location":2,"content":"general cross-entropy loss okay right so"},{"from":972.03,"to":975.64,"location":2,"content":"that's cross-entropy loss pretty good"},{"from":975.64,"to":978.04,"location":2,"content":"with this bottom bits a little bit"},{"from":978.04,"to":980.56,"location":2,"content":"different which is the say whoa Nelly"},{"from":980.56,"to":982.63,"location":2,"content":"this is the sort of the full data set"},{"from":982.63,"to":986.2,"location":2,"content":"the other thing to notice when we have a"},{"from":986.2,"to":991.78,"location":2,"content":"fool that we can have a full data set of"},{"from":991.78,"to":996.4,"location":2,"content":"X's and then we have a full set of"},{"from":996.4,"to":1000.63,"location":2,"content":"weights we're here we're working a row"},{"from":1000.63,"to":1003.21,"location":2,"content":"vector for the weights for one class but"},{"from":1003.21,"to":1004.38,"location":2,"content":"we're going to work it out for all"},{"from":1004.38,"to":1007.47,"location":2,"content":"classes so we can sort of simplify what"},{"from":1007.47,"to":1009.15,"location":2,"content":"we're writing here and we can sort of"},{"from":1009.15,"to":1011.64,"location":2,"content":"use matrix notation and just work"},{"from":1011.64,"to":1016.74,"location":2,"content":"directly in terms of the matrix W okay"},{"from":1016.74,"to":1021.24,"location":2,"content":"so for traditional ML optimization our"},{"from":1021.24,"to":1025.98,"location":2,"content":"parameters are these sets of weights for"},{"from":1025.98,"to":1028.08,"location":2,"content":"the different classes so for each of the"},{"from":1028.08,"to":1032.55,"location":2,"content":"classes we have a d-dimensional row"},{"from":1032.55,"to":1034.56,"location":2,"content":"vector of weights because we're going to"},{"from":1034.56,"to":1036.56,"location":2,"content":"sort of dot product with our d"},{"from":1036.56,"to":1040.62,"location":2,"content":"dimensional input vector so we have C"},{"from":1040.62,"to":1046.07,"location":2,"content":"times D items and our W matrix and those"},{"from":1046.07,"to":1049.68,"location":2,"content":"the parameters of our model and so if we"},{"from":1049.68,"to":1052.59,"location":2,"content":"want to learn that model using the ideas"},{"from":1052.59,"to":1056.07,"location":2,"content":"of gradient descents it's the casted"},{"from":1056.07,"to":1058.17,"location":2,"content":"gradient descent we're going to do sort"},{"from":1058.17,"to":1059.85,"location":2,"content":"of what we started to talk about last"},{"from":1059.85,"to":1063.45,"location":2,"content":"time we have these set of parameters we"},{"from":1063.45,"to":1067.62,"location":2,"content":"work out the gradient the partial"},{"from":1067.62,"to":1071.91,"location":2,"content":"derivatives of all of these of the loss"},{"from":1071.91,"to":1073.96,"location":2,"content":"with respect to all of these 
parameter"},{"from":1073.96,"to":1076.78,"location":2,"content":"and we use that to get a gradient update"},{"from":1076.78,"to":1079.84,"location":2,"content":"on our loss function and we move around"},{"from":1079.84,"to":1082.84,"location":2,"content":"the W's and moving around the W's"},{"from":1082.84,"to":1086.08,"location":2,"content":"corresponds to sort of moving this line"},{"from":1086.08,"to":1088.42,"location":2,"content":"that separates between the classes and"},{"from":1088.42,"to":1091.21,"location":2,"content":"we fiddle that around so as to minimize"},{"from":1091.21,"to":1094.45,"location":2,"content":"our loss which corresponds to choosing a"},{"from":1094.45,"to":1097.78,"location":2,"content":"line that best separates between the"},{"from":1097.78,"to":1102.91,"location":2,"content":"items of the classes in some sense okay"},{"from":1102.91,"to":1105.85,"location":2,"content":"so that's a basic classifier so the"},{"from":1105.85,"to":1110.35,"location":2,"content":"first question is well how are things"},{"from":1110.35,"to":1113.29,"location":2,"content":"going to be different with a neural"},{"from":1113.29,"to":1118.03,"location":2,"content":"network classifier and so the essential"},{"from":1118.03,"to":1121.99,"location":2,"content":"observation is that sort of most of the"},{"from":1121.99,"to":1125.17,"location":2,"content":"classic classifiers that people used a"},{"from":1125.17,"to":1127.3,"location":2,"content":"lot of the time so that includes things"},{"from":1127.3,"to":1131.23,"location":2,"content":"like naive Bayes models basic support"},{"from":1131.23,"to":1134.2,"location":2,"content":"vector machines softmax or logistic"},{"from":1134.2,"to":1138.04,"location":2,"content":"regressions they're sort of fairly"},{"from":1138.04,"to":1141.94,"location":2,"content":"simple classifiers in particular those"},{"from":1141.94,"to":1144.31,"location":2,"content":"are all linear classifiers which are"},{"from":1144.31,"to":1147.1,"location":2,"content":"going to classify by drawing a line or"},{"from":1147.1,"to":1148.93,"location":2,"content":"in the high dimensional space by drawing"},{"from":1148.93,"to":1150.94,"location":2,"content":"some kind of plane that separates"},{"from":1150.94,"to":1153.82,"location":2,"content":"examples and having a simple classifier"},{"from":1153.82,"to":1156.84,"location":2,"content":"like that can be useful in certain"},{"from":1156.84,"to":1159.37,"location":2,"content":"circumstances I mean that gives you what"},{"from":1159.37,"to":1161.14,"location":2,"content":"in machine learning as a high biased"},{"from":1161.14,"to":1163.6,"location":2,"content":"classifiers there's lots of talk of in"},{"from":1163.6,"to":1167.02,"location":2,"content":"CS 229 but if you have a data set that's"},{"from":1167.02,"to":1170.41,"location":2,"content":"like this you can't do a very good job"},{"from":1170.41,"to":1173.23,"location":2,"content":"at classifying all the points correctly"},{"from":1173.23,"to":1176.02,"location":2,"content":"if you have a high bias classifier"},{"from":1176.02,"to":1178.27,"location":2,"content":"because you're gonna only draw a line so"},{"from":1178.27,"to":1180.13,"location":2,"content":"you'd like to have a more powerful"},{"from":1180.13,"to":1184.15,"location":2,"content":"classifier and essentially what's been"},{"from":1184.15,"to":1186.55,"location":2,"content":"powering a lot of the use of deep"},{"from":1186.55,"to":1189.82,"location":2,"content":"learning is that in a lot of cases 
when"},{"from":1189.82,"to":1192.13,"location":2,"content":"you have natural signals so those are"},{"from":1192.13,"to":1195.43,"location":2,"content":"things like speech language images and"},{"from":1195.43,"to":1198.28,"location":2,"content":"things like that you have a ton of data"},{"from":1198.28,"to":1200.79,"location":2,"content":"so you could learn a quite sophisticated"},{"from":1200.79,"to":1206.17,"location":2,"content":"classifier but representing the classes"},{"from":1206.17,"to":1207.79,"location":2,"content":"in terms of the input"},{"from":1207.79,"to":1210.4,"location":2,"content":"data is sort of very complex you could"},{"from":1210.4,"to":1212.05,"location":2,"content":"never do it by just drawing a line"},{"from":1212.05,"to":1215.2,"location":2,"content":"between the two classes and so you'd"},{"from":1215.2,"to":1217.63,"location":2,"content":"like to use some more complicated kind"},{"from":1217.63,"to":1221.23,"location":2,"content":"of classifier and so neural networks the"},{"from":1221.23,"to":1223,"location":2,"content":"multi-layer neural networks that were"},{"from":1223,"to":1224.58,"location":2,"content":"going to be staying to get into now"},{"from":1224.58,"to":1228.19,"location":2,"content":"precisely what they do is provide your"},{"from":1228.19,"to":1231.73,"location":2,"content":"way to learn very complex you know"},{"from":1231.73,"to":1235.78,"location":2,"content":"almost limitless in fact classifiers so"},{"from":1235.78,"to":1238.06,"location":2,"content":"that if you look at the decisions that"},{"from":1238.06,"to":1240.07,"location":2,"content":"they're making in terms of the original"},{"from":1240.07,"to":1243.1,"location":2,"content":"space they can be learning cases like"},{"from":1243.1,"to":1248.47,"location":2,"content":"this I put the I put the pointer on a"},{"from":1248.47,"to":1252.31,"location":2,"content":"couple of the slides here this this is a"},{"from":1252.31,"to":1254.68,"location":2,"content":"visualization that was done by andraka"},{"from":1254.68,"to":1257.2,"location":2,"content":"pothi he was a PhD student here until a"},{"from":1257.2,"to":1259.24,"location":2,"content":"couple of years ago so this is a little"},{"from":1259.24,"to":1262.18,"location":2,"content":"JavaScript app that you can find off his"},{"from":1262.18,"to":1264.22,"location":2,"content":"website and it's actually a lot of fun"},{"from":1264.22,"to":1266.31,"location":2,"content":"to play with to see what kind of"},{"from":1266.31,"to":1268.99,"location":2,"content":"decision boundaries you can get a neural"},{"from":1268.99,"to":1275.88,"location":2,"content":"net to come up with okay so for getting"},{"from":1275.88,"to":1279.64,"location":2,"content":"for getting more advanced classification"},{"from":1279.64,"to":1283.57,"location":2,"content":"out of a neural net used for natural"},{"from":1283.57,"to":1287.08,"location":2,"content":"language there are sort of two things"},{"from":1287.08,"to":1289.93,"location":2,"content":"going that you can do that I want to"},{"from":1289.93,"to":1292.3,"location":2,"content":"talk about which are in in some sense"},{"from":1292.3,"to":1295.06,"location":2,"content":"the same thing when it comes down to it"},{"from":1295.06,"to":1297.19,"location":2,"content":"but I'll sort of mention them separately"},{"from":1297.19,"to":1300.66,"location":2,"content":"at the beginning that one of them is"},{"from":1300.66,"to":1305.29,"location":2,"content":"that we have these word vectors and 
then"},{"from":1305.29,"to":1307.36,"location":2,"content":"the second one is that we're going to"},{"from":1307.36,"to":1310.36,"location":2,"content":"build deeper multi-layer networks okay"},{"from":1310.36,"to":1313.42,"location":2,"content":"so at first crucial difference that we"},{"from":1313.42,"to":1317.02,"location":2,"content":"already started to see with what we were"},{"from":1317.02,"to":1320.05,"location":2,"content":"doing last week is rather than sort of"},{"from":1320.05,"to":1322.21,"location":2,"content":"having a word being this is the word"},{"from":1322.21,"to":1326.71,"location":2,"content":"house we instead say house is a vector"},{"from":1326.71,"to":1330.33,"location":2,"content":"of real numbers and what we can do is"},{"from":1330.33,"to":1333.37,"location":2,"content":"change the vector that corresponds to"},{"from":1333.37,"to":1337.57,"location":2,"content":"house in such a way as we can build"},{"from":1337.57,"to":1339.67,"location":2,"content":"better classifiers which means that"},{"from":1339.67,"to":1341.56,"location":2,"content":"we're going to be sort of moving how"},{"from":1341.56,"to":1344.05,"location":2,"content":"as representation around the space to"},{"from":1344.05,"to":1345.76,"location":2,"content":"capture things that were interested in"},{"from":1345.76,"to":1348.1,"location":2,"content":"like word similarity analogies and"},{"from":1348.1,"to":1350.92,"location":2,"content":"things like that so this is actually you"},{"from":1350.92,"to":1353.37,"location":2,"content":"know kind of a weird idea compared to"},{"from":1353.37,"to":1356.68,"location":2,"content":"conventional steps or m/l so rather than"},{"from":1356.68,"to":1360.25,"location":2,"content":"saying we just have the parameters W we"},{"from":1360.25,"to":1363.33,"location":2,"content":"also say that all of these word"},{"from":1363.33,"to":1366.31,"location":2,"content":"representations are also parameters of"},{"from":1366.31,"to":1368.52,"location":2,"content":"our model so we're actually going to"},{"from":1368.52,"to":1372.31,"location":2,"content":"change the representations of words to"},{"from":1372.31,"to":1374.62,"location":2,"content":"allow our classifiers to do better"},{"from":1374.62,"to":1376.57,"location":2,"content":"so we're simultaneously changing the"},{"from":1376.57,"to":1378.46,"location":2,"content":"weights and we're changing the"},{"from":1378.46,"to":1380.56,"location":2,"content":"representation of words and we're"},{"from":1380.56,"to":1382.75,"location":2,"content":"optimizing both of them at once to try"},{"from":1382.75,"to":1385.57,"location":2,"content":"and make our model as good as possible"},{"from":1385.57,"to":1389.29,"location":2,"content":"and so this is the sense in which people"},{"from":1389.29,"to":1390.67,"location":2,"content":"often talk about for deep learning"},{"from":1390.67,"to":1393.55,"location":2,"content":"models that we're doing representation"},{"from":1393.55,"to":1397.21,"location":2,"content":"learning all right I sort of said there"},{"from":1397.21,"to":1399.34,"location":2,"content":"are two ways I was going to mention two"},{"from":1399.34,"to":1402.13,"location":2,"content":"things ones this sort of word vector"},{"from":1402.13,"to":1404.02,"location":2,"content":"representation learning and then the"},{"from":1404.02,"to":1406.12,"location":2,"content":"second one is that we're going to start"},{"from":1406.12,"to":1407.89,"location":2,"content":"looking at deeper multi-layer 
neural"},{"from":1407.89,"to":1411.94,"location":2,"content":"networks sort of hidden over here on the"},{"from":1411.94,"to":1415.24,"location":2,"content":"slide is the observation that really you"},{"from":1415.24,"to":1418.66,"location":2,"content":"can think of would word vector embedding"},{"from":1418.66,"to":1422.05,"location":2,"content":"as just putting your having a model with"},{"from":1422.05,"to":1424.84,"location":2,"content":"one more neural network layer so if you"},{"from":1424.84,"to":1429.1,"location":2,"content":"imagine that each word was a one hot"},{"from":1429.1,"to":1432.49,"location":2,"content":"vector with four the different word"},{"from":1432.49,"to":1434.92,"location":2,"content":"types in your model so you had a you"},{"from":1434.92,"to":1437.74,"location":2,"content":"know 150,000 dimensional vector with"},{"from":1437.74,"to":1439.69,"location":2,"content":"this or one hot encoding of different"},{"from":1439.69,"to":1442.63,"location":2,"content":"words then you could say you have a map"},{"from":1442.63,"to":1446.71,"location":2,"content":"a matrix L which is sort of your lexicon"},{"from":1446.71,"to":1451.06,"location":2,"content":"matrix and you will pass your one hot"},{"from":1451.06,"to":1454.36,"location":2,"content":"vector for a word through a layer of"},{"from":1454.36,"to":1457.12,"location":2,"content":"neural net which multiplies the one hot"},{"from":1457.12,"to":1460.9,"location":2,"content":"vector or L and the one hot vector and"},{"from":1460.9,"to":1463.36,"location":2,"content":"since this is a one hot vector what that"},{"from":1463.36,"to":1466.39,"location":2,"content":"will have the effect of doing is taking"},{"from":1466.39,"to":1471.73,"location":2,"content":"out a column of L and so really we've"},{"from":1471.73,"to":1475.06,"location":2,"content":"got an extra layer of matrix in our new"},{"from":1475.06,"to":1475.45,"location":2,"content":"own"},{"from":1475.45,"to":1478.21,"location":2,"content":"and we're learning the parameters of"},{"from":1478.21,"to":1480.67,"location":2,"content":"that matrix in the same way as we're"},{"from":1480.67,"to":1483.7,"location":2,"content":"learning a deep neural network for other"},{"from":1483.7,"to":1486.31,"location":2,"content":"purposes so mathematically that"},{"from":1486.31,"to":1488.35,"location":2,"content":"completely makes sense and that's sort"},{"from":1488.35,"to":1491.95,"location":2,"content":"of a sensible way to think about what"},{"from":1491.95,"to":1495.01,"location":2,"content":"you're doing with word embeddings and"},{"from":1495.01,"to":1497.86,"location":2,"content":"neural networks and implementation wise"},{"from":1497.86,"to":1500.71,"location":2,"content":"this makes no sense at all and no one"},{"from":1500.71,"to":1502.18,"location":2,"content":"does this because it just doesn't make"},{"from":1502.18,"to":1504.73,"location":2,"content":"sense to do a matrix multiply when the"},{"from":1504.73,"to":1507.1,"location":2,"content":"result of the matrix multiply will be"},{"from":1507.1,"to":1512.2,"location":2,"content":"okay this is word ID seventeen sort of"},{"from":1512.2,"to":1514.33,"location":2,"content":"then constructing a one hot vector of"},{"from":1514.33,"to":1516.97,"location":2,"content":"length 150,000 with the one in position"},{"from":1516.97,"to":1519.13,"location":2,"content":"17 and then doing a matrix multiply it"},{"from":1519.13,"to":1522.04,"location":2,"content":"makes no sense you just take out 
the"},{"from":1522.04,"to":1524.77,"location":2,"content":"column or or the row as we've discussed"},{"from":1524.77,"to":1527.38,"location":2,"content":"seventeen of your matrix and that's what"},{"from":1527.38,"to":1530.29,"location":2,"content":"everyone actually does okay"},{"from":1530.29,"to":1533.71,"location":2,"content":"here's my one obligatory picture of"},{"from":1533.71,"to":1536.62,"location":2,"content":"neurons for the class so don't miss it"},{"from":1536.62,"to":1537.94,"location":2,"content":"I'm not gonna show it again in all class"},{"from":1537.94,"to":1541.83,"location":2,"content":"okay so the origins of neural networks"},{"from":1541.83,"to":1546.61,"location":2,"content":"was in some sense to try and construct"},{"from":1546.61,"to":1550.45,"location":2,"content":"an artificial neuron that seemed to in"},{"from":1550.45,"to":1553.27,"location":2,"content":"some sense kind of capture the kind of"},{"from":1553.27,"to":1557.05,"location":2,"content":"computations that go on in human brains"},{"from":1557.05,"to":1560.8,"location":2,"content":"and it's a very loose analogy for what"},{"from":1560.8,"to":1563.2,"location":2,"content":"was produced but you know our model here"},{"from":1563.2,"to":1566.26,"location":2,"content":"is these are our this is a teeny part of"},{"from":1566.26,"to":1569.05,"location":2,"content":"our human brain so here are neurons this"},{"from":1569.05,"to":1572.53,"location":2,"content":"is a neuron cell here and so what does a"},{"from":1572.53,"to":1576.64,"location":2,"content":"neuron consist of so up the back it's"},{"from":1576.64,"to":1578.86,"location":2,"content":"got these dendrites lots of dendrites"},{"from":1578.86,"to":1581.77,"location":2,"content":"then it's got a cell body and if there's"},{"from":1581.77,"to":1585.25,"location":2,"content":"stuff coming in on the dendrites the"},{"from":1585.25,"to":1588.04,"location":2,"content":"cell body will become active and then"},{"from":1588.04,"to":1590.32,"location":2,"content":"it'll all start spiking down this long"},{"from":1590.32,"to":1593.32,"location":2,"content":"thing which is called the axon and so"},{"from":1593.32,"to":1596.2,"location":2,"content":"then these axons lead to the dendrites"},{"from":1596.2,"to":1598.48,"location":2,"content":"of a different cell or lots of different"},{"from":1598.48,"to":1600.28,"location":2,"content":"cells right this one"},{"from":1600.28,"to":1602.5,"location":2,"content":"that's right shown but some of these are"},{"from":1602.5,"to":1605.29,"location":2,"content":"kind of going to different cells and so"},{"from":1605.29,"to":1608.62,"location":2,"content":"you then have these sort of terminal"},{"from":1608.62,"to":1609.31,"location":2,"content":"buttons"},{"from":1609.31,"to":1611.23,"location":2,"content":"the Exxon which are kind of close to the"},{"from":1611.23,"to":1613.15,"location":2,"content":"dendrites but have a little gap in them"},{"from":1613.15,"to":1615.97,"location":2,"content":"and some minute miracles of biochemistry"},{"from":1615.97,"to":1618.25,"location":2,"content":"happen there and so that's the synapse"},{"from":1618.25,"to":1620.44,"location":2,"content":"across which you'll then have sort of"},{"from":1620.44,"to":1622.81,"location":2,"content":"activation flowing which goes into the"},{"from":1622.81,"to":1626.25,"location":2,"content":"next neuron so that was the starting off"},{"from":1626.25,"to":1628.54,"location":2,"content":"model that people wanted to try 
and"},{"from":1628.54,"to":1631.45,"location":2,"content":"simulate in computation so people came"},{"from":1631.45,"to":1634.03,"location":2,"content":"up with this model of an artificial"},{"from":1634.03,"to":1637.81,"location":2,"content":"neuron so that we have things coming in"},{"from":1637.81,"to":1640.77,"location":2,"content":"from other neurons at some level of"},{"from":1640.77,"to":1644.17,"location":2,"content":"activation so that's a number X 0 X 1 X"},{"from":1644.17,"to":1649.59,"location":2,"content":"2 then sine APS's vary depending on how"},{"from":1649.59,"to":1652.36,"location":2,"content":"excitable they are as to how easily"},{"from":1652.36,"to":1654.7,"location":2,"content":"they'll let signal cross across the"},{"from":1654.7,"to":1657.54,"location":2,"content":"synapse and so that's being modeled by"},{"from":1657.54,"to":1661.72,"location":2,"content":"multiplying them by a weight W 0 W 1 W 2"},{"from":1661.72,"to":1666.01,"location":2,"content":"and then the cell body sort of correctly"},{"from":1666.01,"to":1668.91,"location":2,"content":"is sort of summing this amount of"},{"from":1668.91,"to":1670.78,"location":2,"content":"excitation it's getting from the"},{"from":1670.78,"to":1675.55,"location":2,"content":"different dendrites and then it can have"},{"from":1675.55,"to":1677.74,"location":2,"content":"its own bias as to how likely it is to"},{"from":1677.74,"to":1681.28,"location":2,"content":"fire that's the B so we get that and"},{"from":1681.28,"to":1683.29,"location":2,"content":"then it has some overall kind of"},{"from":1683.29,"to":1686.5,"location":2,"content":"threshold or propensity for firing so we"},{"from":1686.5,"to":1687.97,"location":2,"content":"sort of stick it through an activation"},{"from":1687.97,"to":1691.87,"location":2,"content":"function which is sort of will determine"},{"from":1691.87,"to":1694.48,"location":2,"content":"a firing rate and that will be the"},{"from":1694.48,"to":1696.37,"location":2,"content":"signal that's going out on the output"},{"from":1696.37,"to":1698.77,"location":2,"content":"axon so that was sort of the starting"},{"from":1698.77,"to":1702.79,"location":2,"content":"point of that but you know really for"},{"from":1702.79,"to":1705.01,"location":2,"content":"what we've ended up computing we just"},{"from":1705.01,"to":1706.75,"location":2,"content":"have a little bit of baby math here"},{"from":1706.75,"to":1710.53,"location":2,"content":"which actually looks very familiar to"},{"from":1710.53,"to":1713.77,"location":2,"content":"the kind of baby math you see in linear"},{"from":1713.77,"to":1716.53,"location":2,"content":"algebra and statistics and so it's"},{"from":1716.53,"to":1720.18,"location":2,"content":"really no different so in particular a"},{"from":1720.18,"to":1724.39,"location":2,"content":"neuron can very easily be a binary"},{"from":1724.39,"to":1728.68,"location":2,"content":"logistic regression unit so that this is"},{"from":1728.68,"to":1731.02,"location":2,"content":"sort of for a logistic regression you're"},{"from":1731.02,"to":1732.88,"location":2,"content":"taking for your input X you're"},{"from":1732.88,"to":1735.01,"location":2,"content":"multiplying it by weight vector you're"},{"from":1735.01,"to":1740.23,"location":2,"content":"adding your bias term and then you're"},{"from":1740.23,"to":1742.81,"location":2,"content":"putting it through and"},{"from":1742.81,"to":1746.77,"location":2,"content":"linearity like the logistic function 
and"},{"from":1746.77,"to":1749.5,"location":2,"content":"then so you're calculating a logistic"},{"from":1749.5,"to":1753.58,"location":2,"content":"regression inside this sort of neuron"},{"from":1753.58,"to":1758.17,"location":2,"content":"model and so this is the this is the"},{"from":1758.17,"to":1759.91,"location":2,"content":"difference between the softmax and the"},{"from":1759.91,"to":1761.95,"location":2,"content":"logistic regression as I saying that"},{"from":1761.95,"to":1764.83,"location":2,"content":"whereas the softmax for two classes has"},{"from":1764.83,"to":1767.41,"location":2,"content":"two sets of parameters this sort of just"},{"from":1767.41,"to":1769.9,"location":2,"content":"has one set of parameters z and you're"},{"from":1769.9,"to":1771.91,"location":2,"content":"modeling the two classes by giving the"},{"from":1771.91,"to":1774.91,"location":2,"content":"probability of one class from zero to"},{"from":1774.91,"to":1777.52,"location":2,"content":"one depending on where the input to the"},{"from":1777.52,"to":1779.23,"location":2,"content":"logistic regression is highly negative"},{"from":1779.23,"to":1783.73,"location":2,"content":"or highly positive okay so really we can"},{"from":1783.73,"to":1786.1,"location":2,"content":"just say these artificial neurons are"},{"from":1786.1,"to":1789.34,"location":2,"content":"sort of like binary logistic regression"},{"from":1789.34,"to":1791.98,"location":2,"content":"units or we can make variants of binary"},{"from":1791.98,"to":1794.95,"location":2,"content":"logistic regression units by using some"},{"from":1794.95,"to":1797.35,"location":2,"content":"different F function and we'll come back"},{"from":1797.35,"to":1802.72,"location":2,"content":"to that again pretty soon okay well so"},{"from":1802.72,"to":1805.36,"location":2,"content":"that gives us one neuron so one neuron"},{"from":1805.36,"to":1808.09,"location":2,"content":"is a logistic regression unit for"},{"from":1808.09,"to":1810.64,"location":2,"content":"current purposes so crucially what we're"},{"from":1810.64,"to":1812.35,"location":2,"content":"wanting to do with neural networks to"},{"from":1812.35,"to":1815.56,"location":2,"content":"say well why only run one logistic"},{"from":1815.56,"to":1818.89,"location":2,"content":"regression why don't we run a whole"},{"from":1818.89,"to":1820.78,"location":2,"content":"bunch of logistic regressions at the"},{"from":1820.78,"to":1824.05,"location":2,"content":"same time so you know here our inputs"},{"from":1824.05,"to":1826.15,"location":2,"content":"and here's our little logistic"},{"from":1826.15,"to":1829,"location":2,"content":"regression unit but we could run three"},{"from":1829,"to":1831.88,"location":2,"content":"logistic regressions at the same time or"},{"from":1831.88,"to":1835.42,"location":2,"content":"we can run any number of them well"},{"from":1835.42,"to":1838.6,"location":2,"content":"that's good but sort of for conventional"},{"from":1838.6,"to":1841.63,"location":2,"content":"training of a statistical model we'd"},{"from":1841.63,"to":1844.27,"location":2,"content":"sort of have to determine for those"},{"from":1844.27,"to":1846.82,"location":2,"content":"orange outputs of the logistic"},{"from":1846.82,"to":1849.16,"location":2,"content":"regression you know what we're training"},{"from":1849.16,"to":1851.68,"location":2,"content":"each of them to try and capture with"},{"from":1851.68,"to":1854.32,"location":2,"content":"have to have data to predict 
what"},{"from":1854.32,"to":1857.04,"location":2,"content":"they're going to try and capture and so"},{"from":1857.04,"to":1859.75,"location":2,"content":"the secret of sort of then building"},{"from":1859.75,"to":1863.23,"location":2,"content":"began neural networks is to say we don't"},{"from":1863.23,"to":1865.42,"location":2,"content":"actually want to decide ahead of time"},{"from":1865.42,"to":1869.14,"location":2,"content":"what those little orange logistic"},{"from":1869.14,"to":1871.99,"location":2,"content":"regressions are trying to capture we"},{"from":1871.99,"to":1874.81,"location":2,"content":"want the neural network to self organize"},{"from":1874.81,"to":1875.99,"location":2,"content":"so that"},{"from":1875.99,"to":1879.71,"location":2,"content":"those orange logistic regression units"},{"from":1879.71,"to":1883.43,"location":2,"content":"learn something useful and well what is"},{"from":1883.43,"to":1884.9,"location":2,"content":"something useful"},{"from":1884.9,"to":1887.99,"location":2,"content":"well our idea is to say we do actually"},{"from":1887.99,"to":1891.77,"location":2,"content":"have some tasks that we want to do so we"},{"from":1891.77,"to":1894.83,"location":2,"content":"we have some tasks that we want to do so"},{"from":1894.83,"to":1897.41,"location":2,"content":"maybe we want to sort of decide whether"},{"from":1897.41,"to":1899.39,"location":2,"content":"a movie review is positive or negative"},{"from":1899.39,"to":1901.01,"location":2,"content":"something like sentiment analysis or"},{"from":1901.01,"to":1902.48,"location":2,"content":"something like that there is something"},{"from":1902.48,"to":1905.09,"location":2,"content":"we want to do at the end of the day and"},{"from":1905.09,"to":1908.6,"location":2,"content":"we're going to have a logistic"},{"from":1908.6,"to":1910.58,"location":2,"content":"regression classifier they're telling us"},{"from":1910.58,"to":1914.27,"location":2,"content":"positive or negative but the inputs to"},{"from":1914.27,"to":1916.22,"location":2,"content":"that aren't going to directly be"},{"from":1916.22,"to":1918.05,"location":2,"content":"something like words in the document"},{"from":1918.05,"to":1920.36,"location":2,"content":"they're going to be this intermediate"},{"from":1920.36,"to":1923.39,"location":2,"content":"layer of logistic regression units and"},{"from":1923.39,"to":1926.42,"location":2,"content":"we're going to train this whole thing to"},{"from":1926.42,"to":1929.38,"location":2,"content":"minimize our cross entropy lost and"},{"from":1929.38,"to":1931.79,"location":2,"content":"essentially what we're going to want to"},{"from":1931.79,"to":1933.05,"location":2,"content":"have happen and the backpropagation"},{"from":1933.05,"to":1936.59,"location":2,"content":"algorithm will do for us is to say you"},{"from":1936.59,"to":1939.17,"location":2,"content":"things in the middle it's your job to"},{"from":1939.17,"to":1942.98,"location":2,"content":"find some useful way to calculate values"},{"from":1942.98,"to":1945.47,"location":2,"content":"from the underlying data such that it'll"},{"from":1945.47,"to":1948.59,"location":2,"content":"help our final classifier make a good"},{"from":1948.59,"to":1951.38,"location":2,"content":"decision and I mean in particular you"},{"from":1951.38,"to":1954.77,"location":2,"content":"know back to this picture you know the"},{"from":1954.77,"to":1957.29,"location":2,"content":"final classifier it's just a 
linear"},{"from":1957.29,"to":1959.93,"location":2,"content":"classifier a soft max or a logistic"},{"from":1959.93,"to":1961.73,"location":2,"content":"regression it's going to have a line"},{"from":1961.73,"to":1964.24,"location":2,"content":"like this but if the intermediate"},{"from":1964.24,"to":1967.28,"location":2,"content":"classifiers they are like a word"},{"from":1967.28,"to":1968.99,"location":2,"content":"embedding they can kind of sort of"},{"from":1968.99,"to":1971.3,"location":2,"content":"re-rent the space and shift things"},{"from":1971.3,"to":1974.66,"location":2,"content":"around so they can learn to shift things"},{"from":1974.66,"to":1977.51,"location":2,"content":"around in such a way as you're learning"},{"from":1977.51,"to":1980.21,"location":2,"content":"a highly nonlinear function of the"},{"from":1980.21,"to":1990.08,"location":2,"content":"original input space okay and so at that"},{"from":1990.08,"to":1992.3,"location":2,"content":"point it's simply a matter of saying"},{"from":1992.3,"to":1995.09,"location":2,"content":"well why stop there maybe you'd get even"},{"from":1995.09,"to":1998.57,"location":2,"content":"better if we put in more layers and this"},{"from":1998.57,"to":2001.18,"location":2,"content":"sort of gets us into the area of deep"},{"from":2001.18,"to":2005.82,"location":2,"content":"learning and sort of precisely this is"},{"from":2005.82,"to":2008.98,"location":2,"content":"that sort of there was they've sort of"},{"from":2008.98,"to":2009.73,"location":2,"content":"been three"},{"from":2009.73,"to":2011.77,"location":2,"content":"cummings of neural networks as the first"},{"from":2011.77,"to":2014.56,"location":2,"content":"work in the 50s which is essentially"},{"from":2014.56,"to":2017.74,"location":2,"content":"when people had a model of a single"},{"from":2017.74,"to":2020.38,"location":2,"content":"neuron like this and then only gradually"},{"from":2020.38,"to":2022.9,"location":2,"content":"worked out how it related to more"},{"from":2022.9,"to":2026.55,"location":2,"content":"conventional statistics and there was"},{"from":2026.55,"to":2030.19,"location":2,"content":"the second version of neural networks"},{"from":2030.19,"to":2033.34,"location":2,"content":"which saw the 80s and early 90s where"},{"from":2033.34,"to":2036.13,"location":2,"content":"people built neural networks like this"},{"from":2036.13,"to":2039.12,"location":2,"content":"that had this one hidden layer where a"},{"from":2039.12,"to":2041.08,"location":2,"content":"representation could be learned in the"},{"from":2041.08,"to":2044.77,"location":2,"content":"middle but at that time it really wasn't"},{"from":2044.77,"to":2048.1,"location":2,"content":"effective of or people weren't able to"},{"from":2048.1,"to":2051.79,"location":2,"content":"build deeper networks and get them to do"},{"from":2051.79,"to":2053.8,"location":2,"content":"anything useful so you sort of had these"},{"from":2053.8,"to":2056.2,"location":2,"content":"neural networks at one hidden layers and"},{"from":2056.2,"to":2059.35,"location":2,"content":"so precisely with research that started"},{"from":2059.35,"to":2062.83,"location":2,"content":"in into deep learning that precisely the"},{"from":2062.83,"to":2066.7,"location":2,"content":"motivating question is we believe we'll"},{"from":2066.7,"to":2070.05,"location":2,"content":"be able to do even more sophisticated"},{"from":2070.05,"to":2072.88,"location":2,"content":"classification for more complex 
tasks"},{"from":2072.88,"to":2074.92,"location":2,"content":"things like speech recognition and image"},{"from":2074.92,"to":2077.95,"location":2,"content":"recognition if we could have a deeper"},{"from":2077.95,"to":2080.83,"location":2,"content":"network which will be able to more"},{"from":2080.83,"to":2083.5,"location":2,"content":"effectively learn more sophisticated"},{"from":2083.5,"to":2085.45,"location":2,"content":"functions of the input which will allow"},{"from":2085.45,"to":2088.48,"location":2,"content":"us to do things like recognize sounds of"},{"from":2088.48,"to":2090.88,"location":2,"content":"a language how could we possibly train"},{"from":2090.88,"to":2094.06,"location":2,"content":"such a network so it all works"},{"from":2094.06,"to":2096.4,"location":2,"content":"effectively and that's the kind of thing"},{"from":2096.4,"to":2100.3,"location":2,"content":"we'll go on to more so starting this"},{"from":2100.3,"to":2103.48,"location":2,"content":"lecture more so in the next lecture but"},{"from":2103.48,"to":2107.05,"location":2,"content":"before we get to there just to underline"},{"from":2107.05,"to":2110.2,"location":2,"content":"it again so once we have something like"},{"from":2110.2,"to":2113.92,"location":2,"content":"this as our layer of a neural network we"},{"from":2113.92,"to":2117.4,"location":2,"content":"have a vector of inputs we have a vector"},{"from":2117.4,"to":2121.44,"location":2,"content":"of outputs and everything is connected"},{"from":2121.44,"to":2124.75,"location":2,"content":"so that we've got this sort of weights"},{"from":2124.75,"to":2127.99,"location":2,"content":"along every one of these black lines and"},{"from":2127.99,"to":2131.68,"location":2,"content":"so we can say a1 is you're taking"},{"from":2131.68,"to":2134.95,"location":2,"content":"weights times each component of x1 and"},{"from":2134.95,"to":2139.27,"location":2,"content":"adding a biased and then you're going to"},{"from":2139.27,"to":2141.82,"location":2,"content":"be running which is sort of this part"},{"from":2141.82,"to":2143.62,"location":2,"content":"and then running it through"},{"from":2143.62,"to":2146.53,"location":2,"content":"our non-linearity and that will give us"},{"from":2146.53,"to":2148.54,"location":2,"content":"an output and we're going to do that for"},{"from":2148.54,"to":2153.73,"location":2,"content":"each of a 1 a 2 and a 3 so again we can"},{"from":2153.73,"to":2156.67,"location":2,"content":"kind of regard a is a vector and we can"},{"from":2156.67,"to":2159.07,"location":2,"content":"kind of collapse it into this matrix"},{"from":2159.07,"to":2161.65,"location":2,"content":"notation for working out the fix of"},{"from":2161.65,"to":2163.87,"location":2,"content":"layers so fully connected layers are"},{"from":2163.87,"to":2167.65,"location":2,"content":"effectively matrices of weights and"},{"from":2167.65,"to":2169.96,"location":2,"content":"commonly we write them like this where"},{"from":2169.96,"to":2172.21,"location":2,"content":"we have the bias term as a vector of"},{"from":2172.21,"to":2174.64,"location":2,"content":"bias terms there's sort of a choice"},{"from":2174.64,"to":2176.92,"location":2,"content":"there you can either have an always-on"},{"from":2176.92,"to":2179.95,"location":2,"content":"input and then the bias terms become"},{"from":2179.95,"to":2181.78,"location":2,"content":"part of the weights of our slightly"},{"from":2181.78,"to":2186.28,"location":2,"content":"bigger matrix of one extra one 
extra"},{"from":2186.28,"to":2196.56,"location":2,"content":"either column or row one extra row right"},{"from":2196.56,"to":2198.94,"location":2,"content":"or you can just sort of have them"},{"from":2198.94,"to":2203.77,"location":2,"content":"separately written as bees okay and then"},{"from":2203.77,"to":2207.21,"location":2,"content":"the final note here right so once we've"},{"from":2207.21,"to":2210.67,"location":2,"content":"calculated this part we always put"},{"from":2210.67,"to":2214.12,"location":2,"content":"things through non-linearity which is"},{"from":2214.12,"to":2215.56,"location":2,"content":"referred to as the activation function"},{"from":2215.56,"to":2218.74,"location":2,"content":"and so something like the logistic"},{"from":2218.74,"to":2220.81,"location":2,"content":"transform I showed earlier is an"},{"from":2220.81,"to":2223.27,"location":2,"content":"activation function and this is written"},{"from":2223.27,"to":2228.58,"location":2,"content":"as sort of vector import activation"},{"from":2228.58,"to":2231.7,"location":2,"content":"function giving a vector output and what"},{"from":2231.7,"to":2233.83,"location":2,"content":"this always means is that we apply this"},{"from":2233.83,"to":2237.34,"location":2,"content":"function element-wise so we applying the"},{"from":2237.34,"to":2239.67,"location":2,"content":"logistic function which is sort of a"},{"from":2239.67,"to":2243.43,"location":2,"content":"naturally a 1 input 1 output function"},{"from":2243.43,"to":2246.01,"location":2,"content":"like the little graph I showed before so"},{"from":2246.01,"to":2248.68,"location":2,"content":"when we apply that to a vector we apply"},{"from":2248.68,"to":2251.23,"location":2,"content":"it to each element of the vector element"},{"from":2251.23,"to":2258.73,"location":2,"content":"wise ok we will come back very soon to"},{"from":2258.73,"to":2262.18,"location":2,"content":"sort of saying more about nonlinearities"},{"from":2262.18,"to":2265.48,"location":2,"content":"and what nonlinearities people actually"},{"from":2265.48,"to":2269.74,"location":2,"content":"use but you know something you might be"},{"from":2269.74,"to":2272.41,"location":2,"content":"wondering is well why does he always"},{"from":2272.41,"to":2274.3,"location":2,"content":"have these nonlinearities and say there"},{"from":2274.3,"to":2276.46,"location":2,"content":"has to be an F function there you know"},{"from":2276.46,"to":2277.96,"location":2,"content":"why don't we just"},{"from":2277.96,"to":2281.17,"location":2,"content":"calculate Z equals WX plus B in one"},{"from":2281.17,"to":2282.97,"location":2,"content":"layer and then go on to another layer"},{"from":2282.97,"to":2289.03,"location":2,"content":"that also does z2 equals W to z1 plus B"},{"from":2289.03,"to":2291.58,"location":2,"content":"and keep on going with layers like that"},{"from":2291.58,"to":2294.22,"location":2,"content":"and there's a very precise reason for"},{"from":2294.22,"to":2297.34,"location":2,"content":"that which is if you want to have a"},{"from":2297.34,"to":2300.21,"location":2,"content":"neural network learn anything"},{"from":2300.21,"to":2303.16,"location":2,"content":"interesting you have to stick in some"},{"from":2303.16,"to":2305.97,"location":2,"content":"function f which is a nonlinear function"},{"from":2305.97,"to":2309.34,"location":2,"content":"such as the logistic curve I showed"},{"from":2309.34,"to":2313.98,"location":2,"content":"before and the reason for that is 
that"},{"from":2313.98,"to":2317.46,"location":2,"content":"if you're sort of doing linear"},{"from":2317.46,"to":2321.6,"location":2,"content":"transforms like W X plus B and then W to"},{"from":2321.6,"to":2326.95,"location":2,"content":"z1 plus B w3 z2 plus B and you're doing"},{"from":2326.95,"to":2329.53,"location":2,"content":"a sequence of linear transforms well"},{"from":2329.53,"to":2332.41,"location":2,"content":"multiple linear transforms just composed"},{"from":2332.41,"to":2334.9,"location":2,"content":"to become a linear transform right so"},{"from":2334.9,"to":2338.41,"location":2,"content":"one linear transform is rotating and"},{"from":2338.41,"to":2340.96,"location":2,"content":"stretching the space somehow and you can"},{"from":2340.96,"to":2343.93,"location":2,"content":"rotate them stretch the space again but"},{"from":2343.93,"to":2345.85,"location":2,"content":"the result of that it's just one bigger"},{"from":2345.85,"to":2348.16,"location":2,"content":"rotate and stretch of the space so you"},{"from":2348.16,"to":2350.23,"location":2,"content":"don't get any extra power for a"},{"from":2350.23,"to":2353.08,"location":2,"content":"classifier by simply having multiple"},{"from":2353.08,"to":2356.17,"location":2,"content":"linear transforms but as soon as you"},{"from":2356.17,"to":2359.13,"location":2,"content":"stick in almost any kind of"},{"from":2359.13,"to":2362.53,"location":2,"content":"non-linearity then you get additional"},{"from":2362.53,"to":2366.7,"location":2,"content":"power and so you know for in general"},{"from":2366.7,"to":2369.07,"location":2,"content":"what we're doing when we're doing deep"},{"from":2369.07,"to":2372.61,"location":2,"content":"networks in the middle of them we're not"},{"from":2372.61,"to":2376.12,"location":2,"content":"thinking oh it's really important to"},{"from":2376.12,"to":2379.54,"location":2,"content":"have non-linearity thinking about"},{"from":2379.54,"to":2382.06,"location":2,"content":"probabilities or something like that our"},{"from":2382.06,"to":2385.24,"location":2,"content":"general picture is well we want to be"},{"from":2385.24,"to":2387.7,"location":2,"content":"able to do effective function"},{"from":2387.7,"to":2390.52,"location":2,"content":"approximation or curve fitting we'd like"},{"from":2390.52,"to":2393.07,"location":2,"content":"to learn a space like this and we can"},{"from":2393.07,"to":2395.65,"location":2,"content":"only do that if we're sort of putting in"},{"from":2395.65,"to":2398.29,"location":2,"content":"some nonlinearities which allow us to"},{"from":2398.29,"to":2401.16,"location":2,"content":"learn these kind of curvy decision"},{"from":2401.16,"to":2405.07,"location":2,"content":"patterns and so so if F is used"},{"from":2405.07,"to":2409.36,"location":2,"content":"effectively for doing accurate function"},{"from":2409.36,"to":2411.41,"location":2,"content":"approximation or sort of pattern"},{"from":2411.41,"to":2417.74,"location":2,"content":"matching as you go along okay I think"},{"from":2417.74,"to":2421.04,"location":2,"content":"I'm behind already okay so that was the"},{"from":2421.04,"to":2425.69,"location":2,"content":"intro to baby neural networks all good"},{"from":2425.69,"to":2434.21,"location":2,"content":"any questions yes you have feature one"},{"from":2434.21,"to":2436.85,"location":2,"content":"and feature four if you multiply them"},{"from":2436.85,"to":2438.38,"location":2,"content":"together it's highly indicative of 
like"},{"from":2438.38,"to":2440.48,"location":2,"content":"the label why can you get to that"},{"from":2440.48,"to":2446.63,"location":2,"content":"product relationships yeah good question"},{"from":2446.63,"to":2450.17,"location":2,"content":"so in conventional steps you have your"},{"from":2450.17,"to":2453.92,"location":2,"content":"basic input features and when people are"},{"from":2453.92,"to":2455.63,"location":2,"content":"building something like a logistic"},{"from":2455.63,"to":2459.17,"location":2,"content":"regression model by hand people often"},{"from":2459.17,"to":2461.15,"location":2,"content":"say well something that's really"},{"from":2461.15,"to":2464.33,"location":2,"content":"important for classification is looking"},{"from":2464.33,"to":2467.21,"location":2,"content":"at the pair of feature four and feature"},{"from":2467.21,"to":2470.6,"location":2,"content":"seven the you know if both of those are"},{"from":2470.6,"to":2472.15,"location":2,"content":"true at the same time something"},{"from":2472.15,"to":2474.53,"location":2,"content":"important happens and so that's referred"},{"from":2474.53,"to":2476.63,"location":2,"content":"to normally in stats as an interaction"},{"from":2476.63,"to":2479.8,"location":2,"content":"term and you can by hand air add"},{"from":2479.8,"to":2482.41,"location":2,"content":"interaction terms to your model so"},{"from":2482.41,"to":2485.39,"location":2,"content":"essentially a large part of the secret"},{"from":2485.39,"to":2488.9,"location":2,"content":"here is having these intermediate layers"},{"from":2488.9,"to":2492.44,"location":2,"content":"they can learn build interaction terms"},{"from":2492.44,"to":2496.63,"location":2,"content":"by themselves yeah so it's sort of"},{"from":2496.63,"to":2499.43,"location":2,"content":"automating the search for higher order"},{"from":2499.43,"to":2501.02,"location":2,"content":"terms that you want to put into your"},{"from":2501.02,"to":2510.2,"location":2,"content":"model okay I'll go on other questions"},{"from":2510.2,"to":2516.5,"location":2,"content":"okay so yeah so here's a brief little"},{"from":2516.5,"to":2519.56,"location":2,"content":"interlude on a teeny bit more of NLP"},{"from":2519.56,"to":2522.11,"location":2,"content":"which is sort of a kind of problem we're"},{"from":2522.11,"to":2523.97,"location":2,"content":"going to look at for a moment so this is"},{"from":2523.97,"to":2525.43,"location":2,"content":"the task of named entity recognition"},{"from":2525.43,"to":2528.44,"location":2,"content":"that I very briefly mentioned last time"},{"from":2528.44,"to":2535.04,"location":2,"content":"so if we have some text right wasn't"},{"from":2535.04,"to":2535.49,"location":2,"content":"piripi"},{"from":2535.49,"to":2537.98,"location":2,"content":"okay okay if we"},{"from":2537.98,"to":2540.14,"location":2,"content":"have some text something that in all"},{"from":2540.14,"to":2543.13,"location":2,"content":"sorts of places people want to do is"},{"from":2543.13,"to":2546.95,"location":2,"content":"they'd like to find the names of things"},{"from":2546.95,"to":2551.78,"location":2,"content":"that are mentioned and then normally as"},{"from":2551.78,"to":2553.46,"location":2,"content":"well as finding the names of things"},{"from":2553.46,"to":2556.04,"location":2,"content":"you'd actually like to classify them so"},{"from":2556.04,"to":2557.66,"location":2,"content":"it's like to say some of them are"},{"from":2557.66,"to":2560.71,"location":2,"content":"organization some of them 
are people"},{"from":2560.71,"to":2564.44,"location":2,"content":"some of them are places and so you know"},{"from":2564.44,"to":2567.14,"location":2,"content":"this has lots of users you know people"},{"from":2567.14,"to":2569.06,"location":2,"content":"like to track mentions of companies and"},{"from":2569.06,"to":2570.86,"location":2,"content":"people and newspapers and things like"},{"from":2570.86,"to":2573.73,"location":2,"content":"that people when they do"},{"from":2573.73,"to":2575.45,"location":2,"content":"question-answering that a lot of the"},{"from":2575.45,"to":2577.94,"location":2,"content":"time the answers to questions what we"},{"from":2577.94,"to":2580.28,"location":2,"content":"call named entities the names of people"},{"from":2580.28,"to":2583.76,"location":2,"content":"locations organizations pop songs movie"},{"from":2583.76,"to":2585.68,"location":2,"content":"names all of those kind of things are"},{"from":2585.68,"to":2590.09,"location":2,"content":"named entities and if you want to sort"},{"from":2590.09,"to":2591.65,"location":2,"content":"of start building up a knowledge base"},{"from":2591.65,"to":2593.93,"location":2,"content":"automatically from a lot of text well"},{"from":2593.93,"to":2595.61,"location":2,"content":"what you normally want to do is get out"},{"from":2595.61,"to":2598.85,"location":2,"content":"the named entities and get out relations"},{"from":2598.85,"to":2601.43,"location":2,"content":"between them so this is a common task so"},{"from":2601.43,"to":2605.06,"location":2,"content":"how can we go about doing that and a"},{"from":2605.06,"to":2608.33,"location":2,"content":"common way of doing that is to say well"},{"from":2608.33,"to":2611.51,"location":2,"content":"we're going to go through the words one"},{"from":2611.51,"to":2614.06,"location":2,"content":"at a time and they're going to be words"},{"from":2614.06,"to":2616.64,"location":2,"content":"that are in a context just like they"},{"from":2616.64,"to":2618.71,"location":2,"content":"were forward to back and what we're"},{"from":2618.71,"to":2621.14,"location":2,"content":"going to do is run a classifier and"},{"from":2621.14,"to":2623.75,"location":2,"content":"we're going to assign them a class so"},{"from":2623.75,"to":2625.49,"location":2,"content":"we're going to say first word is"},{"from":2625.49,"to":2627.89,"location":2,"content":"organization second words organization"},{"from":2627.89,"to":2630.71,"location":2,"content":"third word isn't a named entity fourth"},{"from":2630.71,"to":2632.72,"location":2,"content":"word as a person fifth word as a person"},{"from":2632.72,"to":2635.15,"location":2,"content":"and continue down so we're running a"},{"from":2635.15,"to":2638.39,"location":2,"content":"classification of a word within a"},{"from":2638.39,"to":2640.28,"location":2,"content":"position in the text so it's got"},{"from":2640.28,"to":2645.17,"location":2,"content":"surrounding words around it and so to"},{"from":2645.17,"to":2647.99,"location":2,"content":"say what the entities are many entities"},{"from":2647.99,"to":2651.05,"location":2,"content":"are multi word terms and so the simplest"},{"from":2651.05,"to":2653.39,"location":2,"content":"thing you can imagine doing is just say"},{"from":2653.39,"to":2655.01,"location":2,"content":"well take the sequence that are all"},{"from":2655.01,"to":2657.47,"location":2,"content":"classified the same and call that be it"},{"from":2657.47,"to":2660.43,"location":2,"content":"intuition gong or something like 
that"},{"from":2660.43,"to":2662.6,"location":2,"content":"there's a reason why that's slightly"},{"from":2662.6,"to":2664.52,"location":2,"content":"defective and so what people often use"},{"from":2664.52,"to":2667.79,"location":2,"content":"is that Biao encoding that I show on the"},{"from":2667.79,"to":2669.44,"location":2,"content":"right but I'll just going to run ahead"},{"from":2669.44,"to":2672.05,"location":2,"content":"and not do that now"},{"from":2672.05,"to":2674.24,"location":2,"content":"so it might seem at first that named"},{"from":2674.24,"to":2676.61,"location":2,"content":"entity recognition is trivial because"},{"from":2676.61,"to":2679.37,"location":2,"content":"you know you have company names Google"},{"from":2679.37,"to":2681.29,"location":2,"content":"and Facebook or company names and"},{"from":2681.29,"to":2683.63,"location":2,"content":"whenever you see Google or Facebook you"},{"from":2683.63,"to":2686.39,"location":2,"content":"just say company and how could you be"},{"from":2686.39,"to":2688.4,"location":2,"content":"wrong but in practice there's a lot of"},{"from":2688.4,"to":2690.02,"location":2,"content":"subtlety and it's easy to be wrong a"},{"from":2690.02,"to":2691.97,"location":2,"content":"named entity recognition so this is so"},{"from":2691.97,"to":2694.76,"location":2,"content":"just some of the hard cases so it's"},{"from":2694.76,"to":2698.03,"location":2,"content":"often hard to work out the boundaries of"},{"from":2698.03,"to":2700.31,"location":2,"content":"an entity so on this sentence First"},{"from":2700.31,"to":2703.04,"location":2,"content":"National Bank donate donate two vans to"},{"from":2703.04,"to":2705.62,"location":2,"content":"future school of Fort Smith so there's"},{"from":2705.62,"to":2708.14,"location":2,"content":"presumably the name of a bank there but"},{"from":2708.14,"to":2710.9,"location":2,"content":"is it National Bank and the first is"},{"from":2710.9,"to":2712.61,"location":2,"content":"just the first word of a sentence which"},{"from":2712.61,"to":2716.09,"location":2,"content":"is capitalized here like first she"},{"from":2716.09,"to":2718.94,"location":2,"content":"ordered some food or something so kind"},{"from":2718.94,"to":2721.55,"location":2,"content":"of unclear what it is sometimes it's"},{"from":2721.55,"to":2722.99,"location":2,"content":"hard to know whether something's an"},{"from":2722.99,"to":2725.54,"location":2,"content":"entity at all so at the end of this"},{"from":2725.54,"to":2729.17,"location":2,"content":"sentence is future school the name of"},{"from":2729.17,"to":2731.75,"location":2,"content":"some exciting kind of 21st century"},{"from":2731.75,"to":2734,"location":2,"content":"school or is it just meaning it's a"},{"from":2734,"to":2735.83,"location":2,"content":"future school that's going to be built"},{"from":2735.83,"to":2737.99,"location":2,"content":"in this town right is it an entity or"},{"from":2737.99,"to":2741.14,"location":2,"content":"not at all working out the class of an"},{"from":2741.14,"to":2744.17,"location":2,"content":"entity is often difficult so to find out"},{"from":2744.17,"to":2746.81,"location":2,"content":"more about Zig Ziglar and Reid features"},{"from":2746.81,"to":2750.65,"location":2,"content":"by you know what class is exhibiting you"},{"from":2750.65,"to":2752.48,"location":2,"content":"don't know I was actually a person's"},{"from":2752.48,"to":2756.95,"location":2,"content":"name and there are various entities 
that"},{"from":2756.95,"to":2761.18,"location":2,"content":"are ambiguous right so Charles Schwab in"},{"from":2761.18,"to":2765.8,"location":2,"content":"text is 90% of the time and organization"},{"from":2765.8,"to":2767.39,"location":2,"content":"name because there's Charles Schwab"},{"from":2767.39,"to":2769.43,"location":2,"content":"brokerage but in this particular"},{"from":2769.43,"to":2772.52,"location":2,"content":"sentence here in Woodside where Larry"},{"from":2772.52,"to":2774.2,"location":2,"content":"Ellison and Charles Schwab can live"},{"from":2774.2,"to":2777.14,"location":2,"content":"discreetly among wooded estates that is"},{"from":2777.14,"to":2778.76,"location":2,"content":"then a reference to Charles Schwab the"},{"from":2778.76,"to":2781.21,"location":2,"content":"person so it's sort of a fair bit of"},{"from":2781.21,"to":2783.89,"location":2,"content":"understanding variously that's needed to"},{"from":2783.89,"to":2788.42,"location":2,"content":"get it right okay so what are we going"},{"from":2788.42,"to":2792.13,"location":2,"content":"to do with that and so this suggests"},{"from":2792.13,"to":2795.98,"location":2,"content":"what we want to do is build classifiers"},{"from":2795.98,"to":2800.05,"location":2,"content":"for language that work inside a context"},{"from":2800.05,"to":2803.36,"location":2,"content":"so you know in general it's not very"},{"from":2803.36,"to":2805.7,"location":2,"content":"interesting classifying a word out"},{"from":2805.7,"to":2807.56,"location":2,"content":"out of context we don't actually do that"},{"from":2807.56,"to":2810.98,"location":2,"content":"much in NLP but once you're in a context"},{"from":2810.98,"to":2814.37,"location":2,"content":"there it's interesting to do and named"},{"from":2814.37,"to":2816.41,"location":2,"content":"entity recognition as one case there are"},{"from":2816.41,"to":2818.24,"location":2,"content":"lots of other places that comes up I"},{"from":2818.24,"to":2820.04,"location":2,"content":"mean here's a slightly cool one that"},{"from":2820.04,"to":2822.43,"location":2,"content":"there are some words that can mean"},{"from":2822.43,"to":2824.96,"location":2,"content":"themselves and their opposite at the"},{"from":2824.96,"to":2827.48,"location":2,"content":"same time right so to sanction something"},{"from":2827.48,"to":2830.33,"location":2,"content":"can either mean to allow something or it"},{"from":2830.33,"to":2832.67,"location":2,"content":"can mean to punish people who do things"},{"from":2832.67,"to":2836.93,"location":2,"content":"or to seed something can either mean to"},{"from":2836.93,"to":2839.06,"location":2,"content":"plant seeds and things so you're seeding"},{"from":2839.06,"to":2841.43,"location":2,"content":"the soil or it can take seeds out of"},{"from":2841.43,"to":2843.35,"location":2,"content":"something like a watermelon right you"},{"from":2843.35,"to":2845.12,"location":2,"content":"just need to know the context as to"},{"from":2845.12,"to":2849.44,"location":2,"content":"which it is okay so that suggests the"},{"from":2849.44,"to":2852.62,"location":2,"content":"task that we can classify a word in its"},{"from":2852.62,"to":2855.29,"location":2,"content":"context of neighboring words and any"},{"from":2855.29,"to":2857.24,"location":2,"content":"hours an example of that and the"},{"from":2857.24,"to":2859.79,"location":2,"content":"question is how might we do that and a"},{"from":2859.79,"to":2862.28,"location":2,"content":"very simple way to do it might be to 
say"},{"from":2862.28,"to":2865.36,"location":2,"content":"well we have a bunch of words in a row"},{"from":2865.36,"to":2868.37,"location":2,"content":"which each have a word vector from"},{"from":2868.37,"to":2871.1,"location":2,"content":"something like word to Veck maybe we"},{"from":2871.1,"to":2873.62,"location":2,"content":"could just average those word vectors"},{"from":2873.62,"to":2875.93,"location":2,"content":"and then classify the resulting vector"},{"from":2875.93,"to":2878.33,"location":2,"content":"and the problem is that doesn't work"},{"from":2878.33,"to":2881.03,"location":2,"content":"very well because you lose position"},{"from":2881.03,"to":2883.07,"location":2,"content":"information you don't actually know"},{"from":2883.07,"to":2885.95,"location":2,"content":"anymore which of those word vectors is"},{"from":2885.95,"to":2886.91,"location":2,"content":"the one that you're meant to be"},{"from":2886.91,"to":2890.09,"location":2,"content":"classifying so a simple way to do better"},{"from":2890.09,"to":2892.82,"location":2,"content":"than that is to say well why don't we"},{"from":2892.82,"to":2896.63,"location":2,"content":"make a big vector of a word window so"},{"from":2896.63,"to":2899.57,"location":2,"content":"here are words and they each have a word"},{"from":2899.57,"to":2903.23,"location":2,"content":"vector and so to classify the middle"},{"from":2903.23,"to":2905.9,"location":2,"content":"word in a context of here plus or minus"},{"from":2905.9,"to":2907.82,"location":2,"content":"two words we're simply going to"},{"from":2907.82,"to":2910.31,"location":2,"content":"concatenate these five vectors together"},{"from":2910.31,"to":2913.07,"location":2,"content":"and say now we have a bigger vector and"},{"from":2913.07,"to":2916.67,"location":2,"content":"let's build a classifier over that"},{"from":2916.67,"to":2918.65,"location":2,"content":"vector so we're classifying this X"},{"from":2918.65,"to":2922.67,"location":2,"content":"window which is then a vector in our 5d"},{"from":2922.67,"to":2925.46,"location":2,"content":"if we're using d dimensional word"},{"from":2925.46,"to":2930.47,"location":2,"content":"vectors and we can do that in the kind"},{"from":2930.47,"to":2936.13,"location":2,"content":"of way that we did previously which is"},{"from":2936.13,"to":2937.46,"location":2,"content":"that"},{"from":2937.46,"to":2940.67,"location":2,"content":"we could say okay for that big vector"},{"from":2940.67,"to":2943.43,"location":2,"content":"we're going to learn W weights and we're"},{"from":2943.43,"to":2944.93,"location":2,"content":"putting going to put it through a"},{"from":2944.93,"to":2947.72,"location":2,"content":"softmax classifier and then we're going"},{"from":2947.72,"to":2950.75,"location":2,"content":"to do the decisions that's a perfectly"},{"from":2950.75,"to":2955.7,"location":2,"content":"good way to do things and for the"},{"from":2955.7,"to":2957.71,"location":2,"content":"purpose of what I want to get to in the"},{"from":2957.71,"to":2960.62,"location":2,"content":"last part of this is to start looking at"},{"from":2960.62,"to":2964.61,"location":2,"content":"my matrix calculus and you know we could"},{"from":2964.61,"to":2967.97,"location":2,"content":"use this model and do a classifier and"},{"from":2967.97,"to":2970.85,"location":2,"content":"learn the weights of it and indeed for"},{"from":2970.85,"to":2972.95,"location":2,"content":"the handout on the website that we"},{"from":2972.95,"to":2976.67,"location":2,"content":"suggest 
{"from":2910.31,"to":2913.07,"location":2,"content":"and say now we have a bigger vector and"},{"from":2913.07,"to":2916.67,"location":2,"content":"let's build a classifier over that"},{"from":2916.67,"to":2918.65,"location":2,"content":"vector so we're classifying this x"},{"from":2918.65,"to":2922.67,"location":2,"content":"window which is then a vector in R^5d"},{"from":2922.67,"to":2925.46,"location":2,"content":"if we're using d dimensional word"},{"from":2925.46,"to":2930.47,"location":2,"content":"vectors and we can do that in the kind"},{"from":2930.47,"to":2936.13,"location":2,"content":"of way that we did previously which is"},{"from":2936.13,"to":2937.46,"location":2,"content":"that"},{"from":2937.46,"to":2940.67,"location":2,"content":"we could say okay for that big vector"},{"from":2940.67,"to":2943.43,"location":2,"content":"we're going to learn W weights and we're"},{"from":2943.43,"to":2944.93,"location":2,"content":"going to put it through a"},{"from":2944.93,"to":2947.72,"location":2,"content":"softmax classifier and then we're going"},{"from":2947.72,"to":2950.75,"location":2,"content":"to make the decisions that's a perfectly"},{"from":2950.75,"to":2955.7,"location":2,"content":"good way to do things and for the"},{"from":2955.7,"to":2957.71,"location":2,"content":"purpose of what I want to get to in the"},{"from":2957.71,"to":2960.62,"location":2,"content":"last part of this is to start looking at"},{"from":2960.62,"to":2964.61,"location":2,"content":"matrix calculus and you know we could"},{"from":2964.61,"to":2967.97,"location":2,"content":"use this model and do a classifier and"},{"from":2967.97,"to":2970.85,"location":2,"content":"learn the weights of it and indeed for"},{"from":2970.85,"to":2972.95,"location":2,"content":"the handout on the website that we"},{"from":2972.95,"to":2976.67,"location":2,"content":"suggest you look at it does do it with"},{"from":2976.67,"to":2980.15,"location":2,"content":"the softmax classifier of precisely this"},{"from":2980.15,"to":2983.93,"location":2,"content":"kind but for the example I do in class I"},{"from":2983.93,"to":2987.65,"location":2,"content":"try to make it a bit simpler and I want"},{"from":2987.65,"to":2989.48,"location":2,"content":"to do this I think very quickly because"},{"from":2989.48,"to":2992.39,"location":2,"content":"I'm fast running out of time so one of"},{"from":2992.39,"to":2995.39,"location":2,"content":"the famous early papers of neural NLP"},{"from":2995.39,"to":2998.15,"location":2,"content":"was this paper by Collobert and Weston"},{"from":2998.15,"to":3001.03,"location":2,"content":"which was first an ICML paper in 2008"},{"from":3001.03,"to":3002.89,"location":2,"content":"which actually just a couple of weeks"},{"from":3002.89,"to":3007.53,"location":2,"content":"ago won the ICML 2018 test of time award"},{"from":3007.53,"to":3010.54,"location":2,"content":"and then there's a more recent"},{"from":3010.54,"to":3015.46,"location":2,"content":"journal version from 2011 and they use"},{"from":3015.46,"to":3018.79,"location":2,"content":"this idea of window classification to"},{"from":3018.79,"to":3022.15,"location":2,"content":"assign classes like named entity types"},{"from":3022.15,"to":3026.02,"location":2,"content":"to words in context but they did it in a"},{"from":3026.02,"to":3029.56,"location":2,"content":"slightly different way so what they said"},{"from":3029.56,"to":3033.43,"location":2,"content":"is well we've got these windows and this"},{"from":3033.43,"to":3037.12,"location":2,"content":"is one with a location named entity"},{"from":3037.12,"to":3039.43,"location":2,"content":"in the middle and this is one without a"},{"from":3039.43,"to":3042.04,"location":2,"content":"location entity in the middle and so"},{"from":3042.04,"to":3045.85,"location":2,"content":"what we want to do is have a system that"},{"from":3045.85,"to":3048.64,"location":2,"content":"returns a score and it should return a"},{"from":3048.64,"to":3051.16,"location":2,"content":"high score just as a real number in this"},{"from":3051.16,"to":3053.86,"location":2,"content":"case and it should return a low"},{"from":3053.86,"to":3057.85,"location":2,"content":"score if there isn't a location"},{"from":3057.85,"to":3059.44,"location":2,"content":"name in the middle of the window in"},{"from":3059.44,"to":3063.19,"location":2,"content":"this case and so explicitly the model"},{"from":3063.19,"to":3065.8,"location":2,"content":"just returned the score and so if you"},{"from":3065.8,"to":3069.09,"location":2,"content":"had the top level of your neural network"},{"from":3069.09,"to":3071.88,"location":2,"content":"a and you just then dot-product it with"},{"from":3071.88,"to":3074.82,"location":2,"content":"a vector U you then kind of with that"},{"from":3074.82,"to":3078.09,"location":2,"content":"final dot product you just return a real"},{"from":3078.09,"to":3081.27,"location":2,"content":"number and they used that as the basis"},{"from":3081.27,"to":3083.58,"location":2,"content":"of their classifier so in full glory"},{"from":3083.58,"to":3086.79,"location":2,"content":"what you had is you had this window of"},{"from":3086.79,"to":3089.82,"location":2,"content":"words you looked up a word vector for"},{"from":3089.82,"to":3094.23,"location":2,"content":"each word and then you"},{"from":3094.23,"to":3097.05,"location":2,"content":"concatenated the"},{"from":3097.05,"to":3099,"location":2,"content":"word vectors for the window you"},{"from":3099,"to":3101.28,"location":2,"content":"multiplied them by a matrix and added a"},{"from":3101.28,"to":3104.7,"location":2,"content":"bias to get a second hidden layer which"},{"from":3104.7,"to":3107.25,"location":2,"content":"is a and then you multiply that by a"},{"from":3107.25,"to":3109.89,"location":2,"content":"final vector and that gave you a score"},{"from":3109.89,"to":3112.56,"location":2,"content":"for the window and you wanted the score"},{"from":3112.56,"to":3115.29,"location":2,"content":"to be large if it was a location and"},{"from":3115.29,"to":3120.45,"location":2,"content":"small if it wasn't a location so in this"},{"from":3120.45,"to":3122.24,"location":2,"content":"sort of pretend example where we have"},{"from":3122.24,"to":3125.37,"location":2,"content":"four-dimensional word vectors that's"},{"from":3125.37,"to":3128.13,"location":2,"content":"meaning you know for the window this is"},{"from":3128.13,"to":3133.38,"location":2,"content":"a 20 by 1 vector for calculating the"},{"from":3133.38,"to":3135.42,"location":2,"content":"next hidden layer we've got an 8 by 20"},{"from":3135.42,"to":3138.18,"location":2,"content":"matrix plus a bias vector then we've got"},{"from":3138.18,"to":3139.98,"location":2,"content":"this sort of 8 dimensional second hidden"},{"from":3139.98,"to":3142.62,"location":2,"content":"layer and then we're computing a final"},{"from":3142.62,"to":3148.65,"location":2,"content":"real number okay and so crucially this"},{"from":3148.65,"to":3150.24,"location":2,"content":"is an example of what the question was"},{"from":3150.24,"to":3153.45,"location":2,"content":"about we've put in this extra layer here"},{"from":3153.45,"to":3155.22,"location":2,"content":"right we could have just said here's a"},{"from":3155.22,"to":3158.7,"location":2,"content":"word vector a big word vector of context"},{"from":3158.7,"to":3161.7,"location":2,"content":"let's just stick a softmax or logistic"},{"from":3161.7,"to":3164.13,"location":2,"content":"classification on top to say yes or no"},{"from":3164.13,"to":3166.41,"location":2,"content":"for location but by putting in that"},{"from":3166.41,"to":3169.86,"location":2,"content":"extra hidden layer precisely this extra"},{"from":3169.86,"to":3172.53,"location":2,"content":"hidden layer can calculate nonlinear"},{"from":3172.53,"to":3175.08,"location":2,"content":"interactions between the input word"},{"from":3175.08,"to":3177.96,"location":2,"content":"vectors so it can calculate things like"},{"from":3177.96,"to":3181.47,"location":2,"content":"if the first word is a word like Museum"},{"from":3181.47,"to":3183.9,"location":2,"content":"and the second word's a"},{"from":3183.9,"to":3186.98,"location":2,"content":"word like the preposition in or around"},{"from":3186.98,"to":3190.41,"location":2,"content":"then that's a very good signal that this"},{"from":3190.41,"to":3193.2,"location":2,"content":"should be location in the middle"},{"from":3193.2,"to":3196.26,"location":2,"content":"position of the window so extra layers"},{"from":3196.26,"to":3198.09,"location":2,"content":"of a neural network let us calculate"},{"from":3198.09,"to":3200.7,"location":2,"content":"these kind of interaction terms between"},{"from":3200.7,"to":3202.73,"location":2,"content":"our basic features"},
features"},{"from":3202.73,"to":3207.05,"location":2,"content":"okay so there was a few more slides here"},{"from":3207.05,"to":3208.7,"location":2,"content":"that sort of go through the details of"},{"from":3208.7,"to":3211.28,"location":2,"content":"their model but I'm gonna just skip"},{"from":3211.28,"to":3213.14,"location":2,"content":"those for now because I'm a little bit"},{"from":3213.14,"to":3216.05,"location":2,"content":"behind and at the end of it we've just"},{"from":3216.05,"to":3219.02,"location":2,"content":"got this score so this is our model"},{"from":3219.02,"to":3221.15,"location":2,"content":"which is the one that I just outlined"},{"from":3221.15,"to":3223.97,"location":2,"content":"where we're calculating the score and"},{"from":3223.97,"to":3227.12,"location":2,"content":"we're wanting a big score for a location"},{"from":3227.12,"to":3230.47,"location":2,"content":"and so what we're going to want to do is"},{"from":3230.47,"to":3237.38,"location":2,"content":"consider how we can use this model to"},{"from":3237.38,"to":3240.53,"location":2,"content":"learn our parameters and a neural"},{"from":3240.53,"to":3243.74,"location":2,"content":"network so in particular remember it's"},{"from":3243.74,"to":3246.77,"location":2,"content":"the same story we've had before we had a"},{"from":3246.77,"to":3249.89,"location":2,"content":"loss function J and we're wanting to"},{"from":3249.89,"to":3253.43,"location":2,"content":"work out the gradient with respect to"},{"from":3253.43,"to":3256.1,"location":2,"content":"our current theta parameters of the loss"},{"from":3256.1,"to":3258.44,"location":2,"content":"function then we want to sort of"},{"from":3258.44,"to":3262.73,"location":2,"content":"subtract a little multiple of that given"},{"from":3262.73,"to":3264.29,"location":2,"content":"by the learning rate from our current"},{"from":3264.29,"to":3267.23,"location":2,"content":"parameters to get updated parameters and"},{"from":3267.23,"to":3269.54,"location":2,"content":"if we repeatedly do then stochastic"},{"from":3269.54,"to":3271.52,"location":2,"content":"gradient descent will have better and"},{"from":3271.52,"to":3273.89,"location":2,"content":"better parameters which give higher"},{"from":3273.89,"to":3276.08,"location":2,"content":"probability to the things that we're"},{"from":3276.08,"to":3278.54,"location":2,"content":"actually observing in our training data"},{"from":3278.54,"to":3281.9,"location":2,"content":"and so the thing we want to know is well"},{"from":3281.9,"to":3285.07,"location":2,"content":"in general how can we do this"},{"from":3285.07,"to":3287.66,"location":2,"content":"differentiation and work out the"},{"from":3287.66,"to":3291.56,"location":2,"content":"gradient of our loss function and so I"},{"from":3291.56,"to":3293.45,"location":2,"content":"sort of wanted sort of this the"},{"from":3293.45,"to":3295.85,"location":2,"content":"remaining time in this lecture go"},{"from":3295.85,"to":3299.45,"location":2,"content":"through how we can do that by hand using"},{"from":3299.45,"to":3301.88,"location":2,"content":"math and then that'll lead into sort of"},{"from":3301.88,"to":3304.46,"location":2,"content":"discussing and more generally the back"},{"from":3304.46,"to":3307.63,"location":2,"content":"propagation algorithm for the next one"},{"from":3307.63,"to":3311.48,"location":2,"content":"okay so if we're doing gradients by hand"},{"from":3311.48,"to":3314.06,"location":2,"content":"while we're doing multi 
variable"},{"from":3314.06,"to":3317.03,"location":2,"content":"calculus multivariable derivatives but"},{"from":3317.03,"to":3319.79,"location":2,"content":"in particular normally the most useful"},{"from":3319.79,"to":3322.97,"location":2,"content":"way to think about this is as doing"},{"from":3322.97,"to":3325.04,"location":2,"content":"matrix calculus which means we're"},{"from":3325.04,"to":3327.23,"location":2,"content":"directly working with vectors and"},{"from":3327.23,"to":3330.83,"location":2,"content":"matrices to work out our gradients and"},{"from":3330.83,"to":3334.25,"location":2,"content":"that that's normally sort of much faster"},{"from":3334.25,"to":3336.65,"location":2,"content":"and more convenient for some"},{"from":3336.65,"to":3339.14,"location":2,"content":"rising annual Network layers than trying"},{"from":3339.14,"to":3341.93,"location":2,"content":"to do it in a non victories duay but"},{"from":3341.93,"to":3343.76,"location":2,"content":"that doesn't mean that's the only way to"},{"from":3343.76,"to":3345.92,"location":2,"content":"do it if you're sort of confused about"},{"from":3345.92,"to":3348.35,"location":2,"content":"what's going on sometimes thinking it"},{"from":3348.35,"to":3350.81,"location":2,"content":"through in the non vectorized way it can"},{"from":3350.81,"to":3352.25,"location":2,"content":"be a better way to understand what's"},{"from":3352.25,"to":3355.43,"location":2,"content":"going on and make more progress so like"},{"from":3355.43,"to":3359.47,"location":2,"content":"when last time I did the word to Vic"},{"from":3359.47,"to":3362.45,"location":2,"content":"derivatives when I was writing to small"},{"from":3362.45,"to":3365.21,"location":2,"content":"on that board sorry that was doing it in"},{"from":3365.21,"to":3367.76,"location":2,"content":"a non victories way of working out the"},{"from":3367.76,"to":3370.19,"location":2,"content":"weights talking about them individually"},{"from":3370.19,"to":3372.67,"location":2,"content":"but here we're going to do it with"},{"from":3372.67,"to":3375.95,"location":2,"content":"vectors and matrices and again look for"},{"from":3375.95,"to":3377.99,"location":2,"content":"the lecture notes to cover this material"},{"from":3377.99,"to":3381.02,"location":2,"content":"in more detail in particular so that no"},{"from":3381.02,"to":3383.48,"location":2,"content":"one misses ed I'm let me just clarify"},{"from":3383.48,"to":3385.43,"location":2,"content":"what I mean by lecture notes so if you"},{"from":3385.43,"to":3387.47,"location":2,"content":"look at the course syllabus on the"},{"from":3387.47,"to":3389.36,"location":2,"content":"left-hand column"},{"from":3389.36,"to":3391.67,"location":2,"content":"there's the slides that you can download"},{"from":3391.67,"to":3394.49,"location":2,"content":"and I'm straight under the slides it"},{"from":3394.49,"to":3396.44,"location":2,"content":"says lecture notes that's what I'm"},{"from":3396.44,"to":3398.72,"location":2,"content":"meaning by the lecture notes in the in"},{"from":3398.72,"to":3401.21,"location":2,"content":"the middle column it then has some"},{"from":3401.21,"to":3403.19,"location":2,"content":"readings and actually there are some"},{"from":3403.19,"to":3404.99,"location":2,"content":"different additional things there that"},{"from":3404.99,"to":3409.01,"location":2,"content":"cover similar material so there's so"},{"from":3409.01,"to":3410.81,"location":2,"content":"there will they might be helpful as 
well"},{"from":3410.81,"to":3413.45,"location":2,"content":"but that's the thing that's closest to"},{"from":3413.45,"to":3415.25,"location":2,"content":"what I'm about to present it's the"},{"from":3415.25,"to":3416.66,"location":2,"content":"lecture notes that appear immediately"},{"from":3416.66,"to":3420.44,"location":2,"content":"under the slides link okay"},{"from":3420.44,"to":3424.94,"location":2,"content":"so my hope here my hope here is the"},{"from":3424.94,"to":3428.21,"location":2,"content":"following if you can't remember - how to"},{"from":3428.21,"to":3431.48,"location":2,"content":"do single variable calculus sorry you're"},{"from":3431.48,"to":3433.03,"location":2,"content":"basically sunk and may as well leave now"},{"from":3433.03,"to":3435.83,"location":2,"content":"I'm assuming you know how to do single"},{"from":3435.83,"to":3438.26,"location":2,"content":"variable calculus and I'm assuming you"},{"from":3438.26,"to":3442.01,"location":2,"content":"know what a vector and a matrix is but"},{"from":3442.01,"to":3447.23,"location":2,"content":"you know I sort of hope that even if you"},{"from":3447.23,"to":3449.99,"location":2,"content":"never did multivariable calculus or you"},{"from":3449.99,"to":3452.57,"location":2,"content":"can't remember any of it it's sort of"},{"from":3452.57,"to":3454.79,"location":2,"content":"for what we have to do here not that"},{"from":3454.79,"to":3458.57,"location":2,"content":"hard and you can do it so here's what"},{"from":3458.57,"to":3463.43,"location":2,"content":"what you do right so if we have a simple"},{"from":3463.43,"to":3466.64,"location":2,"content":"function f of X equals x cubed right"},{"from":3466.64,"to":3469.76,"location":2,"content":"it's gradient and so the gradient is the"},{"from":3469.76,"to":3470.36,"location":2,"content":"slow"},{"from":3470.36,"to":3471.98,"location":2,"content":"right it's saying how steep or shallow"},{"from":3471.98,"to":3474.53,"location":2,"content":"is the slope of something and then we"},{"from":3474.53,"to":3477.2,"location":2,"content":"and also saw the direction of slope when"},{"from":3477.2,"to":3479.69,"location":2,"content":"we go into multiple dimensions its"},{"from":3479.69,"to":3481.85,"location":2,"content":"gradient is just its derivative so it's"},{"from":3481.85,"to":3485,"location":2,"content":"derivative is three x squared so if"},{"from":3485,"to":3487.4,"location":2,"content":"you're at the point x equals three that"},{"from":3487.4,"to":3490.58,"location":2,"content":"you know there's this 27 of slope eNOS"},{"from":3490.58,"to":3494.54,"location":2,"content":"it's very steep okay so well what if we"},{"from":3494.54,"to":3497.57,"location":2,"content":"have a function with one output but now"},{"from":3497.57,"to":3501.41,"location":2,"content":"it has many inputs so that we're sort of"},{"from":3501.41,"to":3505.55,"location":2,"content":"doing that sort of function that was"},{"from":3505.55,"to":3507.35,"location":2,"content":"like the dot products where we're doing"},{"from":3507.35,"to":3512.21,"location":2,"content":"the sort of U TV or WT x to calculate a"},{"from":3512.21,"to":3514.67,"location":2,"content":"value well then what we're going to"},{"from":3514.67,"to":3518.39,"location":2,"content":"calculate is a gradient which is a"},{"from":3518.39,"to":3520.94,"location":2,"content":"vector of partial derivatives with"},{"from":3520.94,"to":3525.41,"location":2,"content":"respect to each input so you take 
the"},{"from":3525.41,"to":3528.89,"location":2,"content":"slope of the function as you change X 1"},{"from":3528.89,"to":3532.01,"location":2,"content":"the slope of the function as you change"},{"from":3532.01,"to":3535.52,"location":2,"content":"X 2 through the slope of the function as"},{"from":3535.52,"to":3538.22,"location":2,"content":"you changed X N and each of these you"},{"from":3538.22,"to":3540.56,"location":2,"content":"can just calculate as if you're doing"},{"from":3540.56,"to":3543.05,"location":2,"content":"single variable calculus and you just"},{"from":3543.05,"to":3545.3,"location":2,"content":"put them all in a vector and that's then"},{"from":3545.3,"to":3547.55,"location":2,"content":"giving you the gradient and then the"},{"from":3547.55,"to":3551,"location":2,"content":"gradient and multi-dimensional space is"},{"from":3551,"to":3553.37,"location":2,"content":"then giving you the direction and slope"},{"from":3553.37,"to":3555.92,"location":2,"content":"of the sort of a surface that touches"},{"from":3555.92,"to":3560.75,"location":2,"content":"your multi-dimensional F function ok so"},{"from":3560.75,"to":3563.09,"location":2,"content":"that's going a bit scarier but it gets a"},{"from":3563.09,"to":3565.01,"location":2,"content":"little bit scarier than that because if"},{"from":3565.01,"to":3568.31,"location":2,"content":"we have a neural network layer we then"},{"from":3568.31,"to":3571.67,"location":2,"content":"have a function which will have n inputs"},{"from":3571.67,"to":3574.22,"location":2,"content":"which in the input neurons and it will"},{"from":3574.22,"to":3579.29,"location":2,"content":"have M outputs so if that's the case you"},{"from":3579.29,"to":3581.3,"location":2,"content":"then have a matrix of partial"},{"from":3581.3,"to":3583.52,"location":2,"content":"derivatives which is referred to as the"},{"from":3583.52,"to":3588.2,"location":2,"content":"Jacobian so in the Jacobian you're sort"},{"from":3588.2,"to":3591.89,"location":2,"content":"of taking these partial derivatives with"},{"from":3591.89,"to":3595.46,"location":2,"content":"respect to each output along the rows"},{"from":3595.46,"to":3599.42,"location":2,"content":"and with respect to each input down the"},{"from":3599.42,"to":3601.85,"location":2,"content":"columns and so you're getting these M by"},{"from":3601.85,"to":3604.25,"location":2,"content":"n partial derivatives"},{"from":3604.25,"to":3607.19,"location":2,"content":"considering every combination of an"},{"from":3607.19,"to":3611.33,"location":2,"content":"output and an input but again you can"},{"from":3611.33,"to":3613.97,"location":2,"content":"fill in every cell of this matrix just"},{"from":3613.97,"to":3615.5,"location":2,"content":"by doing single variable calculus"},{"from":3615.5,"to":3617.06,"location":2,"content":"providing you don't get yourself"},{"from":3617.06,"to":3622.61,"location":2,"content":"confused okay then we already saw when"},{"from":3622.61,"to":3625.34,"location":2,"content":"we're doing word to vac that sort of a"},{"from":3625.34,"to":3628.52,"location":2,"content":"central tool that we have to use to work"},{"from":3628.52,"to":3633.98,"location":2,"content":"out to work out our derivatives of"},{"from":3633.98,"to":3636.86,"location":2,"content":"something like a neural network model is"},{"from":3636.86,"to":3639.08,"location":2,"content":"we have a sequence of functions that we"},{"from":3639.08,"to":3642.44,"location":2,"content":"run up one after another so in a 
neural"},{"from":3642.44,"to":3643.58,"location":2,"content":"network you're sort of running a"},{"from":3643.58,"to":3645.26,"location":2,"content":"sequence of functions one after another"},{"from":3645.26,"to":3649.46,"location":2,"content":"so we have to use the chain rule to work"},{"from":3649.46,"to":3651.65,"location":2,"content":"out derivatives when we compose"},{"from":3651.65,"to":3654.17,"location":2,"content":"functions so if we have one variable"},{"from":3654.17,"to":3658.82,"location":2,"content":"functions so we have Z equals 3 y and y"},{"from":3658.82,"to":3662.98,"location":2,"content":"equals x squared if we want to work out"},{"from":3662.98,"to":3667.64,"location":2,"content":"the derivative of Z with respect to X we"},{"from":3667.64,"to":3669.98,"location":2,"content":"say AHA that's a composition of two"},{"from":3669.98,"to":3673.58,"location":2,"content":"functions so I use the chain rule and so"},{"from":3673.58,"to":3677.63,"location":2,"content":"that means what I do is I multiply the"},{"from":3677.63,"to":3684.37,"location":2,"content":"derivative so I take DZ dy so that's 2x"},{"from":3684.37,"to":3688.94,"location":2,"content":"wait sorry I said that wrong right"},{"from":3688.94,"to":3691.67,"location":2,"content":"it's my example wrong oh yeah right DZ"},{"from":3691.67,"to":3694.28,"location":2,"content":"dy so ya DZ dy it's just three that's"},{"from":3694.28,"to":3695.75,"location":2,"content":"all right that's the derivative of the"},{"from":3695.75,"to":3700.4,"location":2,"content":"top line and then dy DX is 2x and I"},{"from":3700.4,"to":3702.74,"location":2,"content":"multiply those together and I get the"},{"from":3702.74,"to":3706.13,"location":2,"content":"answer but the derivative of Z with"},{"from":3706.13,"to":3711.83,"location":2,"content":"respect to X is 6x ok this bit bin gets"},{"from":3711.83,"to":3713.9,"location":2,"content":"a little bit freakier but it's true if"},{"from":3713.9,"to":3717.8,"location":2,"content":"you have lots of variables at once you"},{"from":3717.8,"to":3721.01,"location":2,"content":"simply multiply the jacobians and you"},{"from":3721.01,"to":3723.29,"location":2,"content":"get the right answer so if we're now"},{"from":3723.29,"to":3725.84,"location":2,"content":"imagining our neural net well sort of"},{"from":3725.84,"to":3728.03,"location":2,"content":"this is our typical neural net right so"},{"from":3728.03,"to":3731.18,"location":2,"content":"we're doing the neural net layer where"},{"from":3731.18,"to":3733.34,"location":2,"content":"we have our weight matrix multiplied"},{"from":3733.34,"to":3736.43,"location":2,"content":"their input vector plus the bias and"},{"from":3736.43,"to":3737.84,"location":2,"content":"then we're putting it through"},{"from":3737.84,"to":3740.48,"location":2,"content":"and non-linearity and then if we want to"},{"from":3740.48,"to":3743.6,"location":2,"content":"know what's the partials of H with"},{"from":3743.6,"to":3746.06,"location":2,"content":"respect to X we just say huh it's a"},{"from":3746.06,"to":3748.19,"location":2,"content":"function composition so this is easy to"},{"from":3748.19,"to":3751.22,"location":2,"content":"do we work out our first Jacobian which"},{"from":3751.22,"to":3753.5,"location":2,"content":"is the pass rules of H with respect to Z"},{"from":3753.5,"to":3755.6,"location":2,"content":"and then we just multiply it by the"},{"from":3755.6,"to":3758.39,"location":2,"content":"partials of Z with respect to X and 
we"},{"from":3758.39,"to":3762.43,"location":2,"content":"get the right answer"},{"from":3762.43,"to":3764.42,"location":2,"content":"easy"},{"from":3764.42,"to":3769.67,"location":2,"content":"so here's sort of an example Jacobian"},{"from":3769.67,"to":3771.65,"location":2,"content":"which is a service special case that"},{"from":3771.65,"to":3774.77,"location":2,"content":"comes up a lot so it's just good to"},{"from":3774.77,"to":3778.1,"location":2,"content":"realize this one which we'll see with"},{"from":3778.1,"to":3780.32,"location":2,"content":"our neural net so well one of the things"},{"from":3780.32,"to":3781.79,"location":2,"content":"that we have of these element-wise"},{"from":3781.79,"to":3784.67,"location":2,"content":"activation functions so we have h equals"},{"from":3784.67,"to":3789.71,"location":2,"content":"f of z so what is the partial derivative"},{"from":3789.71,"to":3795.5,"location":2,"content":"of H with respect to Z well the thing"},{"from":3795.5,"to":3797.45,"location":2,"content":"remember that we sort of apply this"},{"from":3797.45,"to":3800,"location":2,"content":"element-wise they're actually saying H I"},{"from":3800,"to":3804.89,"location":2,"content":"equals F of Z I so you know formally"},{"from":3804.89,"to":3808.19,"location":2,"content":"this function has n inputs and an output"},{"from":3808.19,"to":3810.92,"location":2,"content":"so it's partial derivatives are going to"},{"from":3810.92,"to":3813.74,"location":2,"content":"be an N by n Jacobian but if we think"},{"from":3813.74,"to":3818.57,"location":2,"content":"about what's happening there what we're"},{"from":3818.57,"to":3820.85,"location":2,"content":"actually going to find is sort of when"},{"from":3820.85,"to":3823.58,"location":2,"content":"we're working out the terms of this so"},{"from":3823.58,"to":3826.42,"location":2,"content":"we're working out how does f of Z I"},{"from":3826.42,"to":3833.42,"location":2,"content":"change as you change Z J well if J is"},{"from":3833.42,"to":3836.06,"location":2,"content":"not equal to I it's going to make no"},{"from":3836.06,"to":3838.28,"location":2,"content":"difference at all right so if my F"},{"from":3838.28,"to":3839.78,"location":2,"content":"function is something like putting it"},{"from":3839.78,"to":3841.31,"location":2,"content":"through the logistic function or"},{"from":3841.31,"to":3843.92,"location":2,"content":"anything else absolute valuing a number"},{"from":3843.92,"to":3846.2,"location":2,"content":"it's going to make no difference to the"},{"from":3846.2,"to":3849.71,"location":2,"content":"calculation of F of Z I if I change Z J"},{"from":3849.71,"to":3851.69,"location":2,"content":"because it's just not in the equation"},{"from":3851.69,"to":3854.93,"location":2,"content":"and so therefore the only terms that are"},{"from":3854.93,"to":3857.72,"location":2,"content":"actually going to occur and be nonzero"},{"from":3857.72,"to":3862.58,"location":2,"content":"are the terms where I equals J so for"},{"from":3862.58,"to":3864.62,"location":2,"content":"working out these partial derivatives if"},{"from":3864.62,"to":3869.33,"location":2,"content":"I does not equal J it's 0 if I does"},{"from":3869.33,"to":3870.79,"location":2,"content":"equal J"},{"from":3870.79,"to":3873.4,"location":2,"content":"we have to work out as single variable"},{"from":3873.4,"to":3877.72,"location":2,"content":"calculus what's the derivative of the"},{"from":3877.72,"to":3884.26,"location":2,"content":"activation function for and so this 
is"},{"from":3884.26,"to":3887.92,"location":2,"content":"what our Jacobian looks like for an"},{"from":3887.92,"to":3889.33,"location":2,"content":"activation function"},{"from":3889.33,"to":3892.3,"location":2,"content":"it's a diagonal matrix everything else"},{"from":3892.3,"to":3895.03,"location":2,"content":"is zero and we've got this activation"},{"from":3895.03,"to":3897.37,"location":2,"content":"function we work out its derivative and"},{"from":3897.37,"to":3899.17,"location":2,"content":"then we calculate that for the"},{"from":3899.17,"to":3901.75,"location":2,"content":"difference we have it for the different"},{"from":3901.75,"to":3908.52,"location":2,"content":"kind of zi values okay"},{"from":3908.52,"to":3913.36,"location":2,"content":"so that's a jacobians for an activation"},{"from":3913.36,"to":3915.48,"location":2,"content":"function what are the other main cases"},{"from":3915.48,"to":3918.19,"location":2,"content":"that we need for a neural network and"},{"from":3918.19,"to":3920.44,"location":2,"content":"these are gone through a little bit more"},{"from":3920.44,"to":3924.01,"location":2,"content":"slowly in these same lecture notes but"},{"from":3924.01,"to":3925.72,"location":2,"content":"they're kind of similar to what we saw"},{"from":3925.72,"to":3928.48,"location":2,"content":"in the very first class so if we are"},{"from":3928.48,"to":3930.04,"location":2,"content":"wanting to work out the partial"},{"from":3930.04,"to":3932.92,"location":2,"content":"derivatives of W X plus B with respect"},{"from":3932.92,"to":3942.22,"location":2,"content":"to X what we get is W and if we want to"},{"from":3942.22,"to":3944.29,"location":2,"content":"work out the partial derivative of W X"},{"from":3944.29,"to":3948.49,"location":2,"content":"plus B with respect to B that's means"},{"from":3948.49,"to":3951.91,"location":2,"content":"that we get an identity matrix because B"},{"from":3951.91,"to":3954.31,"location":2,"content":"is sort of like a 1b right it's this"},{"from":3954.31,"to":3956.68,"location":2,"content":"almost always on vector so you're just"},{"from":3956.68,"to":3958.45,"location":2,"content":"getting the ones coming out to preserve"},{"from":3958.45,"to":3965.29,"location":2,"content":"the B this was the case that we saw when"},{"from":3965.29,"to":3967.87,"location":2,"content":"we were doing the word vectors that if"},{"from":3967.87,"to":3971.77,"location":2,"content":"you have a vector dot product of U and"},{"from":3971.77,"to":3974.2,"location":2,"content":"age and you say what's the partial"},{"from":3974.2,"to":3977.11,"location":2,"content":"derivatives of that with respect to U"},{"from":3977.11,"to":3982.75,"location":2,"content":"then you get out H transpose if you"},{"from":3982.75,"to":3985.51,"location":2,"content":"haven't seen those before"},{"from":3985.51,"to":3989.14,"location":2,"content":"look at the lecture note handouts and"},{"from":3989.14,"to":3991.06,"location":2,"content":"see if you can compute them and they"},{"from":3991.06,"to":3995.35,"location":2,"content":"make sense at home but for the moment"},{"from":3995.35,"to":3997.75,"location":2,"content":"we're going to believe those and use"},{"from":3997.75,"to":4000.87,"location":2,"content":"those to see how we can then work out"},{"from":4000.87,"to":4004,"location":2,"content":"derivatives inside in your network"},{"from":4004,"to":4007.34,"location":2,"content":"okay so here's this same neural network"},{"from":4007.34,"to":4010.28,"location":2,"content":"we saw before 
so we have a window of"},{"from":4010.28,"to":4013.31,"location":2,"content":"words we're looking up word vectors we're"},{"from":4013.31,"to":4015.02,"location":2,"content":"putting it through a hidden layer and"},{"from":4015.02,"to":4017.02,"location":2,"content":"then we're just doing a"},{"from":4017.02,"to":4020.39,"location":2,"content":"vector dot product to get this final score"},{"from":4020.39,"to":4023.63,"location":2,"content":"and so what we want to do to be able to"},{"from":4023.63,"to":4026.57,"location":2,"content":"train our neural network is we want to"},{"from":4026.57,"to":4032.42,"location":2,"content":"find out how s changes depending on"},{"from":4032.42,"to":4035.87,"location":2,"content":"all the parameters of the model the X"},{"from":4035.87,"to":4040.07,"location":2,"content":"the W the B the U and so we want to"},{"from":4040.07,"to":4043.82,"location":2,"content":"work out partial derivatives of s with"},{"from":4043.82,"to":4047,"location":2,"content":"respect to each of those because we can"},{"from":4047,"to":4050.78,"location":2,"content":"then work out okay if you move B up the"},{"from":4050.78,"to":4054.05,"location":2,"content":"score gets better which is good if there's"},{"from":4054.05,"to":4056.3,"location":2,"content":"actually a Paris in the middle and"},{"from":4056.3,"to":4058.24,"location":2,"content":"therefore we will want to nudge up"},{"from":4058.24,"to":4063.71,"location":2,"content":"elements of B appropriately okay and so"},{"from":4063.71,"to":4065.06,"location":2,"content":"I'm just doing the gradient with respect"},{"from":4065.06,"to":4067.01,"location":2,"content":"to the score here and I skipped over"},{"from":4067.01,"to":4070.58,"location":2,"content":"those couple of slides so if you're just"},{"from":4070.58,"to":4072.68,"location":2,"content":"sort of staring at this picture and say"},{"from":4072.68,"to":4074.54,"location":2,"content":"well how do I work out the partial"},{"from":4074.54,"to":4078.37,"location":2,"content":"derivatives of s with respect to B"},{"from":4078.37,"to":4081.32,"location":2,"content":"probably doesn't look obvious so the"},{"from":4081.32,"to":4083.27,"location":2,"content":"first thing that you want to do is"},{"from":4083.27,"to":4086.03,"location":2,"content":"sort of break up the equations into"},{"from":4086.03,"to":4088.01,"location":2,"content":"simple pieces that compose together"},{"from":4088.01,"to":4092.51,"location":2,"content":"right so you have the input X and then"},{"from":4092.51,"to":4096.38,"location":2,"content":"that goes into Z equals W X plus B and"},{"from":4096.38,"to":4099.14,"location":2,"content":"then you compose that with the next"},{"from":4099.14,"to":4102.26,"location":2,"content":"thing so H equals F of Z our activation"},{"from":4102.26,"to":4104.69,"location":2,"content":"function and then this H goes into the"},{"from":4104.69,"to":4107.99,"location":2,"content":"next thing of s equals u transpose h so we've got"},{"from":4107.99,"to":4110.87,"location":2,"content":"this sequence of functions and pretty"},{"from":4110.87,"to":4114.29,"location":2,"content":"much you want to break things up as much"},{"from":4114.29,"to":4116.33,"location":2,"content":"as you can I mean I could have broken"},{"from":4116.33,"to":4119.06,"location":2,"content":"this up even further I could have said Z"},{"from":4119.06,"to":4124.25,"location":2,"content":"1 equals W X Z equals Z 1 plus B it"},{"from":4124.25,"to":4126.65,"location":2,"content":"turns out that if you've 
just got things"},{"from":4126.65,"to":4128.93,"location":2,"content":"added and subtracted you can sort of do"},{"from":4128.93,"to":4130.7,"location":2,"content":"that in one step because there's sort of"},{"from":4130.7,"to":4133.31,"location":2,"content":"pathways separate and there when doing"},{"from":4133.31,"to":4134.87,"location":2,"content":"the derivatives but sort of anything"},{"from":4134.87,"to":4137.09,"location":2,"content":"else that composes together you want to"},{"from":4137.09,"to":4137.57,"location":2,"content":"pull it"},{"from":4137.57,"to":4140.9,"location":2,"content":"for the pieces okay so now a neural net"},{"from":4140.9,"to":4144.11,"location":2,"content":"is doing a sequence of function"},{"from":4144.11,"to":4147.11,"location":2,"content":"compositions and when we say okay we"},{"from":4147.11,"to":4149.6,"location":2,"content":"know how to do that the chain rule so if"},{"from":4149.6,"to":4152.36,"location":2,"content":"you want to work out the partial of s"},{"from":4152.36,"to":4155.06,"location":2,"content":"with respect to B it's just going to be"},{"from":4155.06,"to":4158.18,"location":2,"content":"the product of the derivatives of each"},{"from":4158.18,"to":4161.05,"location":2,"content":"step along the way so it's going to be"},{"from":4161.05,"to":4163.97,"location":2,"content":"the partial of this with respect to H"},{"from":4163.97,"to":4166.91,"location":2,"content":"times H with respect to Z times Z with"},{"from":4166.91,"to":4169.31,"location":2,"content":"respect to B and that will give us the"},{"from":4169.31,"to":4172.31,"location":2,"content":"right answer so then all we have to do"},{"from":4172.31,"to":4178.13,"location":2,"content":"is actually compute that so I think this"},{"from":4178.13,"to":4179.84,"location":2,"content":"just sort of shows okay we're taking the"},{"from":4179.84,"to":4181.57,"location":2,"content":"partial sweet step of that composition"},{"from":4181.57,"to":4185.3,"location":2,"content":"okay so now we want to compute that and"},{"from":4185.3,"to":4187.07,"location":2,"content":"so this is where I'm going to sort of"},{"from":4187.07,"to":4190.19,"location":2,"content":"use the jacobians that I sort of"},{"from":4190.19,"to":4192.95,"location":2,"content":"asserted without much proof on the"},{"from":4192.95,"to":4198.83,"location":2,"content":"preceding slide okay so first of all we"},{"from":4198.83,"to":4202.4,"location":2,"content":"have D sdh well that's the dot product"},{"from":4202.4,"to":4208.85,"location":2,"content":"of two vectors so the the Jacobian for"},{"from":4208.85,"to":4211.94,"location":2,"content":"that is just H transpose okay that's a"},{"from":4211.94,"to":4216.2,"location":2,"content":"start then we have H equals F of Z well"},{"from":4216.2,"to":4220.3,"location":2,"content":"that's the activation function so the"},{"from":4220.3,"to":4223.16,"location":2,"content":"Jacobian of that is this diagonal matrix"},{"from":4223.16,"to":4227,"location":2,"content":"made of the element wise derivative of"},{"from":4227,"to":4231.29,"location":2,"content":"the function f and then we have the"},{"from":4231.29,"to":4233.78,"location":2,"content":"partial of Z with respect to B and"},{"from":4233.78,"to":4235.58,"location":2,"content":"that's the bit that comes out as the"},{"from":4235.58,"to":4238.04,"location":2,"content":"identity matrix and so that's then"},{"from":4238.04,"to":4243.44,"location":2,"content":"giving us our calculation of the 
partial"},{"from":4243.44,"to":4248.75,"location":2,"content":"of s with respect to B and so we can see"},{"from":4248.75,"to":4251.6,"location":2,"content":"that the the identity matrix sort of"},{"from":4251.6,"to":4254.3,"location":2,"content":"goes away so we end up with this"},{"from":4254.3,"to":4257.62,"location":2,"content":"composition of H T times F prime of Z"},{"from":4257.62,"to":4260.9,"location":2,"content":"okay suppose we then want to go on and"},{"from":4260.9,"to":4264.44,"location":2,"content":"compute now the partial of s with"},{"from":4264.44,"to":4267.8,"location":2,"content":"respect to W well our starting off point"},{"from":4267.8,"to":4270.74,"location":2,"content":"is exactly the same chain rule that we"},{"from":4270.74,"to":4271.28,"location":2,"content":"work"},{"from":4271.28,"to":4276.4,"location":2,"content":"each of the stages so that first of all"},{"from":4276.4,"to":4281.78,"location":2,"content":"you're working at the Z from the W X"},{"from":4281.78,"to":4283.4,"location":2,"content":"part then putting it through the"},{"from":4283.4,"to":4286.91,"location":2,"content":"non-linearity then doing the dot product"},{"from":4286.91,"to":4289.25,"location":2,"content":"of the vectors so that part is the same"},{"from":4289.25,"to":4293.18,"location":2,"content":"and what you should notice is that if"},{"from":4293.18,"to":4295.79,"location":2,"content":"you compare the partial of s with"},{"from":4295.79,"to":4299.8,"location":2,"content":"respect to W versus s with respect to B"},{"from":4299.8,"to":4302.93,"location":2,"content":"most of them are the same and it's only"},{"from":4302.93,"to":4305.66,"location":2,"content":"the part at the end that's different and"},{"from":4305.66,"to":4307.94,"location":2,"content":"that sort of makes sense in terms of our"},{"from":4307.94,"to":4311.33,"location":2,"content":"neural net right that when we had our"},{"from":4311.33,"to":4316.16,"location":2,"content":"neural net that the W and the B were"},{"from":4316.16,"to":4318.53,"location":2,"content":"coming in here and once you've sort of"},{"from":4318.53,"to":4320.75,"location":2,"content":"done some stuff with them you're putting"},{"from":4320.75,"to":4323.06,"location":2,"content":"things through the same activation"},{"from":4323.06,"to":4325.64,"location":2,"content":"function and doing the same dot product"},{"from":4325.64,"to":4327.77,"location":2,"content":"to create a score so you're sort of"},{"from":4327.77,"to":4329.87,"location":2,"content":"doing the same calculations that you are"},{"from":4329.87,"to":4331.88,"location":2,"content":"then composing with so it sort of makes"},{"from":4331.88,"to":4333.92,"location":2,"content":"sense that you should be getting the"},{"from":4333.92,"to":4337.16,"location":2,"content":"same derivatives that are occur same"},{"from":4337.16,"to":4338.96,"location":2,"content":"partial derivatives that occurring at"},{"from":4338.96,"to":4346.13,"location":2,"content":"that point and so effectively you know"},{"from":4346.13,"to":4349.85,"location":2,"content":"these partial derivatives correspond to"},{"from":4349.85,"to":4352.16,"location":2,"content":"the computations in the neural network"},{"from":4352.16,"to":4356.3,"location":2,"content":"that are above where W and B are and so"},{"from":4356.3,"to":4360.47,"location":2,"content":"those are commonly referred to as Delta"},{"from":4360.47,"to":4362.42,"location":2,"content":"note Delta which is different 
from"},{"from":4362.42,"to":4365.72,"location":2,"content":"partial derivative D and so Delta is"},{"from":4365.72,"to":4368.21,"location":2,"content":"referred to as the error signal a neural"},{"from":4368.21,"to":4370.97,"location":2,"content":"network torque so it's the what you're"},{"from":4370.97,"to":4373.82,"location":2,"content":"calculating as the partial derivatives"},{"from":4373.82,"to":4376.88,"location":2,"content":"above the parameters that you're working"},{"from":4376.88,"to":4378.41,"location":2,"content":"out the partial derivatives with respect"},{"from":4378.41,"to":4383.27,"location":2,"content":"to so a lot of the secret as we'll see"},{"from":4383.27,"to":4387.62,"location":2,"content":"next time a lot of the secret of what"},{"from":4387.62,"to":4392.09,"location":2,"content":"happens with back propagation is just we"},{"from":4392.09,"to":4395.3,"location":2,"content":"want to do efficient computation in the"},{"from":4395.3,"to":4397.07,"location":2,"content":"sort of way that's computer science"},{"from":4397.07,"to":4398.72,"location":2,"content":"people like to do efficient computation"},{"from":4398.72,"to":4401.81,"location":2,"content":"and so precisely what we want to notice"},{"from":4401.81,"to":4404.87,"location":2,"content":"is that there is one error signal that"},{"from":4404.87,"to":4405.14,"location":2,"content":"come"},{"from":4405.14,"to":4407.6,"location":2,"content":"from above and we want to compute at"},{"from":4407.6,"to":4410.15,"location":2,"content":"once and then reuse that when"},{"from":4410.15,"to":4412.91,"location":2,"content":"calculating both partial derivatives"},{"from":4412.91,"to":4418.6,"location":2,"content":"with respect to W and with B okay so"},{"from":4418.6,"to":4422.83,"location":2,"content":"there are sort of two things to still do"},{"from":4422.83,"to":4428.45,"location":2,"content":"so one is well it'd be kind of useful to"},{"from":4428.45,"to":4430.31,"location":2,"content":"know what the partial derivative of s"},{"from":4430.31,"to":4433.25,"location":2,"content":"with respect to W actually looks like I"},{"from":4433.25,"to":4436.48,"location":2,"content":"mean is that a number vector a matrix a"},{"from":4436.48,"to":4438.89,"location":2,"content":"three-dimensional tensor and then we"},{"from":4438.89,"to":4441.89,"location":2,"content":"actually want to work out its values and"},{"from":4441.89,"to":4445.07,"location":2,"content":"to work out its values we're going to"},{"from":4445.07,"to":4446.69,"location":2,"content":"still have to work out the partial"},{"from":4446.69,"to":4450.05,"location":2,"content":"derivative of Z with respect to W but if"},{"from":4450.05,"to":4452.36,"location":2,"content":"first of all we just try and work out"},{"from":4452.36,"to":4456.47,"location":2,"content":"its shape what kind of shape does it"},{"from":4456.47,"to":4459.05,"location":2,"content":"have and this is actually sort of a bit"},{"from":4459.05,"to":4462.26,"location":2,"content":"tricky and is sort of a dirty underbelly"},{"from":4462.26,"to":4465.53,"location":2,"content":"of doing this kind of matrix calculus so"},{"from":4465.53,"to":4469.1,"location":2,"content":"since our weight vector is an N by M"},{"from":4469.1,"to":4474.56,"location":2,"content":"matrix the end result of the partial of"},{"from":4474.56,"to":4476.6,"location":2,"content":"s with respect to W is we have a"},{"from":4476.6,"to":4480.53,"location":2,"content":"function with n times M inputs all 
of"},{"from":4480.53,"to":4483.89,"location":2,"content":"the elements of W and simply one output"},{"from":4483.89,"to":4487.07,"location":2,"content":"which is our score so that makes it"},{"from":4487.07,"to":4488.81,"location":2,"content":"sound like according to what I said"},{"from":4488.81,"to":4492.97,"location":2,"content":"before we should have a 1 by n times m"},{"from":4492.97,"to":4495.65,"location":2,"content":"Jacobian but it turns out that's not"},{"from":4495.65,"to":4497.33,"location":2,"content":"really what we want right because what"},{"from":4497.33,"to":4501.14,"location":2,"content":"we wanted to do is use what we calculate"},{"from":4501.14,"to":4504.82,"location":2,"content":"inside this stochastic gradient descent"},{"from":4504.82,"to":4509.69,"location":2,"content":"update algorithm and if we're doing this"},{"from":4509.69,"to":4513.74,"location":2,"content":"we'd sort of like to have the old white"},{"from":4513.74,"to":4516.8,"location":2,"content":"matrix and we'd like to subtract a bit"},{"from":4516.8,"to":4519.41,"location":2,"content":"format to get a new weight matrix so"},{"from":4519.41,"to":4522.02,"location":2,"content":"it'd be kind of nice if the shape of our"},{"from":4522.02,"to":4527.74,"location":2,"content":"Jacobian was the same shape as W and so"},{"from":4527.74,"to":4531.41,"location":2,"content":"we we and in general what you always"},{"from":4531.41,"to":4534.44,"location":2,"content":"want to do with neural nets is follow"},{"from":4534.44,"to":4537.38,"location":2,"content":"what we call the shape convention which"},{"from":4537.38,"to":4538.94,"location":2,"content":"is we're going to sort of"},{"from":4538.94,"to":4542.69,"location":2,"content":"represent the jacobian so it's in the"},{"from":4542.69,"to":4547.19,"location":2,"content":"same shape as the inputs and this whole"},{"from":4547.19,"to":4552.68,"location":2,"content":"thing is kind of the bad part of the bad"},{"from":4552.68,"to":4556.28,"location":2,"content":"part of doing matrix calculus like"},{"from":4556.28,"to":4558.38,"location":2,"content":"there's a lot of inconsistency as to how"},{"from":4558.38,"to":4561.02,"location":2,"content":"people represent matrix calculus that in"},{"from":4561.02,"to":4562.25,"location":2,"content":"general if you just go to different"},{"from":4562.25,"to":4564.32,"location":2,"content":"fields like economics and physics some"},{"from":4564.32,"to":4566.12,"location":2,"content":"people use a numerator convention some"},{"from":4566.12,"to":4567.98,"location":2,"content":"people use a denominator convention"},{"from":4567.98,"to":4569.6,"location":2,"content":"we're using neither of those we're going"},{"from":4569.6,"to":4571.85,"location":2,"content":"to use this shape convention so we match"},{"from":4571.85,"to":4573.74,"location":2,"content":"the shape of the input so it makes it"},{"from":4573.74,"to":4579.19,"location":2,"content":"easy to do our weight updates ok so"},{"from":4579.19,"to":4581.9,"location":2,"content":"right so that's what we want the answer"},{"from":4581.9,"to":4584.42,"location":2,"content":"to look like so then the final thing we"},{"from":4584.42,"to":4587.15,"location":2,"content":"need to do to work out and the partial"},{"from":4587.15,"to":4589.22,"location":2,"content":"of s with respect to W is we have the"},{"from":4589.22,"to":4590.75,"location":2,"content":"error signal Delta that's going to be"},{"from":4590.75,"to":4592.82,"location":2,"content":"part of the answer and then we want 
to"},{"from":4592.82,"to":4595.67,"location":2,"content":"work out the partial of Z with respect"},{"from":4595.67,"to":4604.67,"location":2,"content":"to W well what's that going to be well"},{"from":4604.67,"to":4608.03,"location":2,"content":"it turns out and I'm about to be Saved"},{"from":4608.03,"to":4610.37,"location":2,"content":"by the Bell here since I'm down to two"},{"from":4610.37,"to":4613.46,"location":2,"content":"minutes left it turns out that what we"},{"from":4613.46,"to":4617.75,"location":2,"content":"end up with for that is we take the"},{"from":4617.75,"to":4621.23,"location":2,"content":"product of the partial the product of"},{"from":4621.23,"to":4624.38,"location":2,"content":"Delta times X so effectively we've got a"},{"from":4624.38,"to":4628.04,"location":2,"content":"local Eris's signal above W and then we"},{"from":4628.04,"to":4630.59,"location":2,"content":"have the inputs X and we're working out"},{"from":4630.59,"to":4635.9,"location":2,"content":"an outer product of them and the sort of"},{"from":4635.9,"to":4638.18,"location":2,"content":"way to think about this is sort of for"},{"from":4638.18,"to":4641.42,"location":2,"content":"the W's you know we've got the elements"},{"from":4641.42,"to":4644.27,"location":2,"content":"of the W matrix are these different"},{"from":4644.27,"to":4647.21,"location":2,"content":"connections between our neurons and so"},{"from":4647.21,"to":4649.94,"location":2,"content":"each one of these is connecting one"},{"from":4649.94,"to":4653.66,"location":2,"content":"output to one input and so we're going"},{"from":4653.66,"to":4656.78,"location":2,"content":"to be sort of making this in by M matrix"},{"from":4656.78,"to":4659.15,"location":2,"content":"of our partial derivatives there's going"},{"from":4659.15,"to":4662.18,"location":2,"content":"to be the product of the error signal"},{"from":4662.18,"to":4664.63,"location":2,"content":"for the but the appropriate output"},{"from":4664.63,"to":4667.61,"location":2,"content":"multiplied by our input and those goes"},{"from":4667.61,"to":4672.44,"location":2,"content":"give us the partial derivatives"},{"from":4672.44,"to":4674.93,"location":2,"content":"I'm skipping ahead quickly in my last 1"},{"from":4674.93,"to":4680.48,"location":2,"content":"minute okay so right so this is sort of"},{"from":4680.48,"to":4682.01,"location":2,"content":"what I said have used the shape can"},{"from":4682.01,"to":4684.43,"location":2,"content":"convince and I'm going to skip that"},{"from":4684.43,"to":4688.97,"location":2,"content":"okay so I've I ran out of time a teeny"},{"from":4688.97,"to":4691.04,"location":2,"content":"bit at the end but I mean I think"},{"from":4691.04,"to":4694.52,"location":2,"content":"hopefully it's conveyed most of the idea"},{"from":4694.52,"to":4697.55,"location":2,"content":"of how you can sort of use the chain"},{"from":4697.55,"to":4700.82,"location":2,"content":"rule and work out the derivatives and"},{"from":4700.82,"to":4703.46,"location":2,"content":"work them out in terms of these vector"},{"from":4703.46,"to":4707.45,"location":2,"content":"and matrix derivatives and essentially"},{"from":4707.45,"to":4709.28,"location":2,"content":"what we want to do for back propagation"},{"from":4709.28,"to":4712.22,"location":2,"content":"is to say how can we do and get a"},{"from":4712.22,"to":4715.19,"location":2,"content":"computer to do this automatically for us"},{"from":4715.19,"to":4717.83,"location":2,"content":"and to do it efficiently and that's 
what"},{"from":4717.83,"to":4719.42,"location":2,"content":"sort of the deep learning frameworks"},{"from":4719.42,"to":4722.27,"location":2,"content":"like tensor flow and PI torch do and how"},{"from":4722.27,"to":4723.92,"location":2,"content":"you can do that we will look at more"},{"from":4723.92,"to":4726.28,"location":2,"content":"next time"}]} \ No newline at end of file diff --git a/bcc-en/4.bcc b/bcc-en/4.bcc new file mode 100644 index 0000000000000000000000000000000000000000..25f11cee05ddba32e9f6142d211b00d7936172cb --- /dev/null +++ b/bcc-en/4.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":5.1,"to":9.24,"location":2,"content":"okay so great to see everyone back for"},{"from":9.24,"to":13.96,"location":2,"content":"lecture four of the class so for lect"},{"from":13.96,"to":17.32,"location":2,"content":"for today's lecture what I want to do"},{"from":17.32,"to":20.26,"location":2,"content":"for most of the time is actually get"},{"from":20.26,"to":23.68,"location":2,"content":"into the heart of these ideas of having"},{"from":23.68,"to":25.39,"location":2,"content":"the back propagation algorithm for"},{"from":25.39,"to":28.02,"location":2,"content":"neural nets and how we can construct"},{"from":28.02,"to":30.34,"location":2,"content":"computation graphs that allow us"},{"from":30.34,"to":33.88,"location":2,"content":"efficiently to do back propagation your"},{"from":33.88,"to":38.37,"location":2,"content":"nets to train the neural Nets so overall"},{"from":38.37,"to":40.75,"location":2,"content":"this is sort of what I plan to do it"},{"from":40.75,"to":43.12,"location":2,"content":"today so at the end of last lecture I"},{"from":43.12,"to":45.76,"location":2,"content":"slightly ran out of time and I started"},{"from":45.76,"to":48.12,"location":2,"content":"mumbling and waving my hands about the"},{"from":48.12,"to":50.59,"location":2,"content":"doing the derivatives with respect to"},{"from":50.59,"to":52.33,"location":2,"content":"the weight gradient so I kind of wanted"},{"from":52.33,"to":55.24,"location":2,"content":"to do that bit again so hopefully it"},{"from":55.24,"to":58.3,"location":2,"content":"actually communicates slightly better so"},{"from":58.3,"to":60.67,"location":2,"content":"we'll do that and talk a bit more about"},{"from":60.67,"to":62.77,"location":2,"content":"sort of just tips for doing matrix"},{"from":62.77,"to":65.8,"location":2,"content":"gradients and a particular issue that"},{"from":65.8,"to":68.35,"location":2,"content":"comes up with word vectors and so then"},{"from":68.35,"to":70.24,"location":2,"content":"the main part of the class will be"},{"from":70.24,"to":71.59,"location":2,"content":"talking about the back propagation"},{"from":71.59,"to":74.17,"location":2,"content":"algorithm and how it runs over"},{"from":74.17,"to":77.26,"location":2,"content":"computation graphs and then for the last"},{"from":77.26,"to":80.83,"location":2,"content":"part of the class is I'm not going to"},{"from":80.83,"to":81.91,"location":2,"content":"hide it"},{"from":81.91,"to":84.75,"location":2,"content":"this is sort of just a grab bag of"},{"from":84.75,"to":86.95,"location":2,"content":"miscellaneous stuff you should know"},{"from":86.95,"to":90.13,"location":2,"content":"about neural networks and training"},{"from":90.13,"to":94.6,"location":2,"content":"neural networks like I think you know we"},{"from":94.6,"to":96.19,"location":2,"content":"dream of a 
future of artificial"},{"from":96.19,"to":98.68,"location":2,"content":"intelligence where our machines are"},{"from":98.68,"to":100.87,"location":2,"content":"really intelligent and you can just say"},{"from":100.87,"to":102.7,"location":2,"content":"to them this is the data and this is my"},{"from":102.7,"to":105.55,"location":2,"content":"problem go and train me a model and it"},{"from":105.55,"to":108.67,"location":2,"content":"might work and in some future world"},{"from":108.67,"to":111.16,"location":2,"content":"maybe that's what comes along it's something"},{"from":111.16,"to":112.96,"location":2,"content":"that's certainly being actively"},{"from":112.96,"to":115.27,"location":2,"content":"researched at the moment under the topic"},{"from":115.27,"to":117.91,"location":2,"content":"of AutoML I guess the question is"},{"from":117.91,"to":120.4,"location":2,"content":"whether it turns out that AutoML is a"},{"from":120.4,"to":123.12,"location":2,"content":"scalable solution or the climate change"},{"from":123.12,"to":126.1,"location":2,"content":"consequences of AutoML techniques are"},{"from":126.1,"to":128.17,"location":2,"content":"sufficiently bad that someone actually"},{"from":128.17,"to":131.85,"location":2,"content":"decides that these much lower power"},{"from":131.85,"to":134.77,"location":2,"content":"neural systems might actually be better"},{"from":134.77,"to":136.42,"location":2,"content":"still for doing some parts of the"},{"from":136.42,"to":137.95,"location":2,"content":"problem but"},{"from":137.95,"to":139.6,"location":2,"content":"either way we're not really there yet"},{"from":139.6,"to":142.75,"location":2,"content":"and the fact of the matter is when"},{"from":142.75,"to":144.46,"location":2,"content":"you're training neural networks there's"},{"from":144.46,"to":147.04,"location":2,"content":"just a whole bunch of stuff you have to"},{"from":147.04,"to":148.9,"location":2,"content":"know about initialization and"},{"from":148.9,"to":151,"location":2,"content":"nonlinearities and learning rates and so"},{"from":151,"to":154.15,"location":2,"content":"on and you know when I taught this class"},{"from":154.15,"to":159.34,"location":2,"content":"last time I somehow thought that people"},{"from":159.34,"to":161.8,"location":2,"content":"would pick this up by osmosis that if we"},{"from":161.8,"to":165.66,"location":2,"content":"gave starter code to people and"},{"from":165.66,"to":169.86,"location":2,"content":"in our starter code we initialized our"},{"from":169.86,"to":172.42,"location":2,"content":"matrices and we set our learning rates"},{"from":172.42,"to":174.79,"location":2,"content":"that by osmosis people would understand"},{"from":174.79,"to":178.78,"location":2,"content":"that's what you have to do and do it and"},{"from":178.78,"to":181.81,"location":2,"content":"didn't really sort of teach in class the"},{"from":181.81,"to":184.09,"location":2,"content":"practical tips and tricks enough but it"},{"from":184.09,"to":185.71,"location":2,"content":"was perfectly obvious that when we got"},{"from":185.71,"to":189.04,"location":2,"content":"to final project time that at least for"},{"from":189.04,"to":191.44,"location":2,"content":"quite a few people osmosis hadn't worked"},{"from":191.44,"to":195.01,"location":2,"content":"so this time I'm at least willing to"},{"from":195.01,"to":197.5,"location":2,"content":"spend a few minutes on that and at least"},{"from":197.5,"to":199.27,"location":2,"content":"point out some of the things that 
are"},{"from":199.27,"to":202.93,"location":2,"content":"important and I mean just in general you"},{"from":202.93,"to":206.59,"location":2,"content":"know the reality of 2018 deep learning"},{"from":206.59,"to":210.64,"location":2,"content":"paths now wait 2019 now 2019 deep"},{"from":210.64,"to":213.43,"location":2,"content":"learning is deep learning is still kind"},{"from":213.43,"to":215.47,"location":2,"content":"of a craft there's quite a bit you have"},{"from":215.47,"to":218.38,"location":2,"content":"to know of techniques of doing things"},{"from":218.38,"to":220.9,"location":2,"content":"that lead neural net training to work"},{"from":220.9,"to":223.47,"location":2,"content":"successfully as opposed to your models"},{"from":223.47,"to":227.08,"location":2,"content":"failing to work successfully okay one"},{"from":227.08,"to":229.93,"location":2,"content":"final announcement and I go in through"},{"from":229.93,"to":233.41,"location":2,"content":"it so we've sort of been doing some"},{"from":233.41,"to":237.46,"location":2,"content":"further working on office our placement"},{"from":237.46,"to":239.05,"location":2,"content":"and I guess there are sort of multiple"},{"from":239.05,"to":241.75,"location":2,"content":"issues which include opportunities for"},{"from":241.75,"to":244.24,"location":2,"content":"local SCPD students without Stanford IDs"},{"from":244.24,"to":248.41,"location":2,"content":"we have to get to office hours so for"},{"from":248.41,"to":250.72,"location":2,"content":"the Thursday night office hour"},{"from":250.72,"to":253.39,"location":2,"content":"that's after this class if you'd like to"},{"from":253.39,"to":255.9,"location":2,"content":"go and talk about the second homework"},{"from":255.9,"to":258.64,"location":2,"content":"the Thursday night office hour is going"},{"from":258.64,"to":262.06,"location":2,"content":"to be in thought at Thornton 110 now I"},{"from":262.06,"to":264.7,"location":2,"content":"didn't know where Thornton was it made"},{"from":264.7,"to":267.19,"location":2,"content":"more sense to me when I translated that"},{"from":267.19,"to":269.44,"location":2,"content":"as that's the old Terman annex but"},{"from":269.44,"to":271.27,"location":2,"content":"that's probably just showing my age"},{"from":271.27,"to":273.76,"location":2,"content":"since probably none of you remember when"},{"from":273.76,"to":274.93,"location":2,"content":"there used to be a building called"},{"from":274.93,"to":276.7,"location":2,"content":"Thurmond so that probably doesn't help"},{"from":276.7,"to":279.28,"location":2,"content":"you either but you know if you're"},{"from":279.28,"to":281.17,"location":2,"content":"heading I don't know which direction"},{"from":281.17,"to":283.63,"location":2,"content":"we're facing if you're heading that way"},{"from":283.63,"to":286.75,"location":2,"content":"I guess and if you know where the pop"},{"from":286.75,"to":289.3,"location":2,"content":"your new guinea sculpture garden is the"},{"from":289.3,"to":292.24,"location":2,"content":"the sort of open grassy area before you"},{"from":292.24,"to":293.95,"location":2,"content":"get to the Papua New Guinea sculpture"},{"from":293.95,"to":294.55,"location":2,"content":"garden"},{"from":294.55,"to":296.62,"location":2,"content":"that's where Turman used to be and the"},{"from":296.62,"to":298.45,"location":2,"content":"building that still stands in there is"},{"from":298.45,"to":303.01,"location":2,"content":"thornton thornton 110 tonight i think 
it"},{"from":303.01,"to":308.44,"location":2,"content":"starts at 6:30 right 639 okay right so"},{"from":308.44,"to":310.33,"location":2,"content":"let me just finish off where we were"},{"from":310.33,"to":312.52,"location":2,"content":"last time so remember we had this window"},{"from":312.52,"to":315.43,"location":2,"content":"of five words and then we were putting"},{"from":315.43,"to":318.64,"location":2,"content":"it through a new net layer of Z equals W"},{"from":318.64,"to":321.43,"location":2,"content":"X plus B non-linearity of H equals f of"},{"from":321.43,"to":324.61,"location":2,"content":"X and then we are going to just get a"},{"from":324.61,"to":327.58,"location":2,"content":"score as to whether this has in its"},{"from":327.58,"to":331.39,"location":2,"content":"centre named entity like Paris which is"},{"from":331.39,"to":333.55,"location":2,"content":"sort of taking this dot product of a"},{"from":333.55,"to":335.35,"location":2,"content":"vector times the hidden layer so this"},{"from":335.35,"to":337.69,"location":2,"content":"was our model and then we were wanting"},{"from":337.69,"to":340.72,"location":2,"content":"to work out partial derivatives of s"},{"from":340.72,"to":343.15,"location":2,"content":"with respect to all of our variables and"},{"from":343.15,"to":346.18,"location":2,"content":"we did various of the cases but one we"},{"from":346.18,"to":348.28,"location":2,"content":"handy had done is the weights and the"},{"from":348.28,"to":349.84,"location":2,"content":"weights or all of this neural net layer"},{"from":349.84,"to":350.5,"location":2,"content":"here"},{"from":350.5,"to":354.37,"location":2,"content":"okay so chain rule the partial of the"},{"from":354.37,"to":360.07,"location":2,"content":"SDW of is the s times h d the h DZ times"},{"from":360.07,"to":364.03,"location":2,"content":"DZ d w and well if you remember last"},{"from":364.03,"to":367.21,"location":2,"content":"time we'd sort of done some computation"},{"from":367.21,"to":370.69,"location":2,"content":"of what those first two partial"},{"from":370.69,"to":373.33,"location":2,"content":"derivatives were and we said that we"},{"from":373.33,"to":376.54,"location":2,"content":"could just call those delta which is our"},{"from":376.54,"to":379.9,"location":2,"content":"error signal coming from above and that"},{"from":379.9,"to":381.79,"location":2,"content":"concept of having an error signal coming"},{"from":381.79,"to":384.1,"location":2,"content":"from above is something I'll get back to"},{"from":384.1,"to":385.54,"location":2,"content":"in the main part of the lecture and a"},{"from":385.54,"to":387.61,"location":2,"content":"sort of a central notion but the bit we"},{"from":387.61,"to":391.03,"location":2,"content":"haven't dealt with is this DZ DW and we"},{"from":391.03,"to":393.85,"location":2,"content":"started to look at it and I made the"},{"from":393.85,"to":397.72,"location":2,"content":"argument based on our shape convention"},{"from":397.72,"to":400.15,"location":2,"content":"that the shape of that should be the"},{"from":400.15,"to":402.73,"location":2,"content":"same shape as our W matrix so it should"},{"from":402.73,"to":404.98,"location":2,"content":"be the same in time"},{"from":404.98,"to":407.77,"location":2,"content":"in shape as this w matrix so we want to"},{"from":407.77,"to":412.99,"location":2,"content":"work out the partial of Z by W which is"},{"from":412.99,"to":419.92,"location":2,"content":"the same as this ZW x + B DW and so 
we"},{"from":419.92,"to":422.46,"location":2,"content":"want to work out what that derivative is"},{"from":422.46,"to":425.38,"location":2,"content":"and if that's not obvious one way to"},{"from":425.38,"to":428.2,"location":2,"content":"think about it is to go back to this"},{"from":428.2,"to":430.18,"location":2,"content":"elements of the matrix and actually"},{"from":430.18,"to":432.94,"location":2,"content":"first off work it out element wise and"},{"from":432.94,"to":435.31,"location":2,"content":"think out what it should be and then"},{"from":435.31,"to":436.69,"location":2,"content":"once you've thought out what it should"},{"from":436.69,"to":440.38,"location":2,"content":"be to rewrite it back in matrix form to"},{"from":440.38,"to":442.93,"location":2,"content":"give the compact answer so what we have"},{"from":442.93,"to":445.63,"location":2,"content":"is we have these inputs here and a bias"},{"from":445.63,"to":448.72,"location":2,"content":"term and we're going to do the matrix"},{"from":448.72,"to":451.06,"location":2,"content":"multiply of this vector to produce these"},{"from":451.06,"to":453.73,"location":2,"content":"and if you think about what's happening"},{"from":453.73,"to":456.37,"location":2,"content":"there so we've got this matrix of"},{"from":456.37,"to":459.37,"location":2,"content":"weights and for a particular weight a"},{"from":459.37,"to":462.22,"location":2,"content":"weight is first index is going to"},{"from":462.22,"to":465.37,"location":2,"content":"correspond to a position in the hidden"},{"from":465.37,"to":468.97,"location":2,"content":"layer and it's second index is going to"},{"from":468.97,"to":472.15,"location":2,"content":"correspond to a position in the input"},{"from":472.15,"to":475.96,"location":2,"content":"vector and one ways in the matrix ends"},{"from":475.96,"to":478.87,"location":2,"content":"up being part of what's used to compute"},{"from":478.87,"to":481.57,"location":2,"content":"one element of the hidden layer so the"},{"from":481.57,"to":483.49,"location":2,"content":"one element of the hidden layer you're"},{"from":483.49,"to":487.27,"location":2,"content":"taking a row of the matrix and you're"},{"from":487.27,"to":489.25,"location":2,"content":"multiplying it by the components of this"},{"from":489.25,"to":491.38,"location":2,"content":"vector so they sum together and the bias"},{"from":491.38,"to":493.51,"location":2,"content":"is added on but one element of the"},{"from":493.51,"to":495.94,"location":2,"content":"matrix is sort of only being used in a"},{"from":495.94,"to":498.93,"location":2,"content":"computation between one element of the"},{"from":498.93,"to":501.07,"location":2,"content":"important one element of the hidden"},{"from":501.07,"to":505.45,"location":2,"content":"vector okay so well that means if we're"},{"from":505.45,"to":507.04,"location":2,"content":"thinking about what's the partial"},{"from":507.04,"to":510.16,"location":2,"content":"derivative with respect to W IJ"},{"from":510.16,"to":516.45,"location":2,"content":"well it's only contributing to Zi and"},{"from":516.45,"to":521.04,"location":2,"content":"it's only it's only doing anything with"},{"from":521.04,"to":525.22,"location":2,"content":"XJ so that we end up with when we're"},{"from":525.22,"to":527.07,"location":2,"content":"getting the partial with respect to W IJ"},{"from":527.07,"to":530.11,"location":2,"content":"we can work that out with respect to"},{"from":530.11,"to":534.67,"location":2,"content":"just respect Zi and when we're 
going to"},{"from":534.67,"to":537.19,"location":2,"content":"look at this multiplication here what"},{"from":537.19,"to":538.81,"location":2,"content":"we're ending up is this sort of"},{"from":538.81,"to":542.95,"location":2,"content":"of terms wik times XK where there's sort"},{"from":542.95,"to":544.87,"location":2,"content":"of weights in that row of the matrix"},{"from":544.87,"to":547.39,"location":2,"content":"going across the positions of the vector"},{"from":547.39,"to":551.53,"location":2,"content":"so the only position in which W IJ is"},{"from":551.53,"to":557.62,"location":2,"content":"used is multiplying by XJ and at that"},{"from":557.62,"to":560.29,"location":2,"content":"point what we have in terms of sort of"},{"from":560.29,"to":563.14,"location":2,"content":"you know basic one variable doing a"},{"from":563.14,"to":565.48,"location":2,"content":"differentiation this is just like we"},{"from":565.48,"to":568.84,"location":2,"content":"have three X and we say what's the"},{"from":568.84,"to":571.51,"location":2,"content":"derivative of 3x max just X is confusing"},{"from":571.51,"to":573.34,"location":2,"content":"sorry I shouldn't say that it's like we"},{"from":573.34,"to":576.04,"location":2,"content":"have three W and what's the derivative"},{"from":576.04,"to":579.01,"location":2,"content":"of three W with respect to W it's three"},{"from":579.01,"to":582.22,"location":2,"content":"right so that we've have a term here"},{"from":582.22,"to":585.85,"location":2,"content":"which is will have been W will be W IJ"},{"from":585.85,"to":589.21,"location":2,"content":"times XJ and its derivative with respect"},{"from":589.21,"to":594.01,"location":2,"content":"to W IJ is just XJ that makes sense if"},{"from":594.01,"to":597.22,"location":2,"content":"you want to leave it fingers crossed"},{"from":597.22,"to":601.03,"location":2,"content":"okay so so for one element of this"},{"from":601.03,"to":604.57,"location":2,"content":"matrix we're just getting out XJ and at"},{"from":604.57,"to":608.77,"location":2,"content":"that point we say well of course we want"},{"from":608.77,"to":612.04,"location":2,"content":"to know what the Jacobian is for the"},{"from":612.04,"to":614.95,"location":2,"content":"full matrix W well if you start thinking"},{"from":614.95,"to":617.65,"location":2,"content":"about it this argument applies to every"},{"from":617.65,"to":622.33,"location":2,"content":"cell so that for every cell of the"},{"from":622.33,"to":626.68,"location":2,"content":"Jacobian for W it's going to be XJ so"},{"from":626.68,"to":631.72,"location":2,"content":"that means we're just going to be able"},{"from":631.72,"to":634.18,"location":2,"content":"to make use of that in calculating our"},{"from":634.18,"to":636.73,"location":2,"content":"Jacobian so the derivative for a single"},{"from":636.73,"to":641.23,"location":2,"content":"W IJ is Delta I XJ and that's true for"},{"from":641.23,"to":645.04,"location":2,"content":"all cells so we want to have a matrix"},{"from":645.04,"to":650.35,"location":2,"content":"for our Jacobian which has Delta I XJ in"},{"from":650.35,"to":652.48,"location":2,"content":"every cell Evert and the way we can"},{"from":652.48,"to":655.66,"location":2,"content":"create that is by using an outer product"},{"from":655.66,"to":659.89,"location":2,"content":"so if we have a row vector of the deltas"},{"from":659.89,"to":662.23,"location":2,"content":"the error signals from above and a"},{"from":662.23,"to":666.34,"location":2,"content":"column wait I say 
that wrong sorry if we"},{"from":666.34,"to":671.08,"location":2,"content":"have a column of the Delta error signals"},{"from":671.08,"to":672.7,"location":2,"content":"from above"},{"from":672.7,"to":678.01,"location":2,"content":"we have a row of X transpose vector when"},{"from":678.01,"to":680.83,"location":2,"content":"we multiply those together we get the"},{"from":680.83,"to":683.8,"location":2,"content":"outer product and we get Delta I XJ in"},{"from":683.8,"to":686.23,"location":2,"content":"each cell and that is our Jacobian the"},{"from":686.23,"to":691.21,"location":2,"content":"answer for working out the Delta s Delta"},{"from":691.21,"to":692.95,"location":2,"content":"W that we started off with at the"},{"from":692.95,"to":697.42,"location":2,"content":"beginning okay and we get this"},{"from":697.42,"to":700.15,"location":2,"content":"form where it's a multiplication of an"},{"from":700.15,"to":703.12,"location":2,"content":"error signal from above and our computed"},{"from":703.12,"to":705.4,"location":2,"content":"local gradient signal and that's the"},{"from":705.4,"to":707.32,"location":2,"content":"pattern that we're going to see over and"},{"from":707.32,"to":709.6,"location":2,"content":"over again and that we'll exploit in our"},{"from":709.6,"to":714.7,"location":2,"content":"computation graphs okay all good okay"},{"from":714.7,"to":720.73,"location":2,"content":"so here's just you know homework two"},{"from":720.73,"to":722.88,"location":2,"content":"you're meant to do some of this stuff"},{"from":722.88,"to":725.98,"location":2,"content":"here it's sort of a couple of collected"},{"from":725.98,"to":730.33,"location":2,"content":"tips which I hope will help I mean"},{"from":730.33,"to":733.45,"location":2,"content":"keeping track here of your variables and"},{"from":733.45,"to":735.82,"location":2,"content":"their dimensionality is really useful"},{"from":735.82,"to":737.17,"location":2,"content":"because if you can just work out what"},{"from":737.17,"to":738.67,"location":2,"content":"the dimensionality of things should be"},{"from":738.67,"to":741.28,"location":2,"content":"you're often kind of halfway there"},{"from":741.28,"to":743.53,"location":2,"content":"I mean basically what you're doing is"},{"from":743.53,"to":746.26,"location":2,"content":"sort of applying the chain rule over and"},{"from":746.26,"to":749.23,"location":2,"content":"over again it always looks like this but"},{"from":749.23,"to":751.66,"location":2,"content":"doing it in this sort of matrix calculus"},{"from":751.66,"to":755.83,"location":2,"content":"sense of the chain rule in the homework"},{"from":755.83,"to":757.87,"location":2,"content":"you have to do a softmax which we"},{"from":757.87,"to":760.27,"location":2,"content":"haven't done in class something that I"},{"from":760.27,"to":762.79,"location":2,"content":"think you'll find useful if you want to"},{"from":762.79,"to":765.49,"location":2,"content":"break apart the softmax is to consider"},{"from":765.49,"to":769.24,"location":2,"content":"two cases one case is when you're"},{"from":769.24,"to":771.58,"location":2,"content":"working it out for the correct class and"},{"from":771.58,"to":774.64,"location":2,"content":"then the other case is for all the other"},{"from":774.64,"to":779.8,"location":2,"content":"incorrect classes yeah in the"},{"from":779.8,"to":782.5,"location":2,"content":"little derivation I did before I said"},{"from":782.5,"to":784.54,"location":2,"content":"well let's work out an 
element-wise"},{"from":784.54,"to":786.82,"location":2,"content":"partial derivative because that should"},{"from":786.82,"to":788.53,"location":2,"content":"give me some sense of what's going on"},{"from":788.53,"to":790.66,"location":2,"content":"what the answer is I think that can be a"},{"from":790.66,"to":792.4,"location":2,"content":"really good thing to do if you're"},{"from":792.4,"to":794.62,"location":2,"content":"getting confused by matrix calculus and"},{"from":794.62,"to":800.02,"location":2,"content":"I sort of slightly skip past another"},{"from":800.02,"to":802.42,"location":2,"content":"slide last time I was talking about the"},{"from":802.42,"to":804.1,"location":2,"content":"shape convention that I talked about it"},{"from":804.1,"to":805.9,"location":2,"content":"for a moment but for"},{"from":805.9,"to":809.65,"location":2,"content":"or the homeworks you can work out your"},{"from":809.65,"to":812.14,"location":2,"content":"answer however you want you can work it"},{"from":812.14,"to":814.36,"location":2,"content":"out in terms of you know numerator"},{"from":814.36,"to":816.73,"location":2,"content":"ordered jacobians if that seems best to"},{"from":816.73,"to":819.25,"location":2,"content":"you but we'd like you to give the final"},{"from":819.25,"to":821.35,"location":2,"content":"answer to your assignment questions"},{"from":821.35,"to":824.26,"location":2,"content":"following the shape convention so that"},{"from":824.26,"to":826.81,"location":2,"content":"the derivatives should be shaped in a"},{"from":826.81,"to":829.9,"location":2,"content":"vector or matrix in the same way as the"},{"from":829.9,"to":832.03,"location":2,"content":"variable with respect to which you're"},{"from":832.03,"to":835.08,"location":2,"content":"working out your derivatives"},{"from":835.08,"to":838.78,"location":2,"content":"okay the last little bit for finishing"},{"from":838.78,"to":840.55,"location":2,"content":"up this example from last time I want to"},{"from":840.55,"to":842.89,"location":2,"content":"say a little bit about is what happens"},{"from":842.89,"to":847.27,"location":2,"content":"with words and one answer is nothing"},{"from":847.27,"to":850,"location":2,"content":"different but another answer is they are"},{"from":850,"to":851.62,"location":2,"content":"a little bit of a special case here"},{"from":851.62,"to":855.19,"location":2,"content":"because you know really we have a matrix"},{"from":855.19,"to":857.41,"location":2,"content":"of word vectors right we have a vector"},{"from":857.41,"to":860.71,"location":2,"content":"for each word and so then you can think"},{"from":860.71,"to":862.93,"location":2,"content":"of that as sort of this matrix of word"},{"from":862.93,"to":864.79,"location":2,"content":"vectors which row as a different word"},{"from":864.79,"to":867.64,"location":2,"content":"but we're not actually kind of"},{"from":867.64,"to":871,"location":2,"content":"connecting up that matrix directly to"},{"from":871,"to":873.79,"location":2,"content":"our classifier system instead of that"},{"from":873.79,"to":875.44,"location":2,"content":"what we connect connecting up to the"},{"from":875.44,"to":878.56,"location":2,"content":"classifier system is this window and the"},{"from":878.56,"to":880.93,"location":2,"content":"window we'll have it in at five words"},{"from":880.93,"to":883.24,"location":2,"content":"most commonly they're different words"},{"from":883.24,"to":885.13,"location":2,"content":"but you know occasionally the same 
word"},{"from":885.13,"to":888.16,"location":2,"content":"might appear in two positions in that"},{"from":888.16,"to":892.09,"location":2,"content":"window and so we can nevertheless do"},{"from":892.09,"to":894.37,"location":2,"content":"exactly the same thing and continue our"},{"from":894.37,"to":898.51,"location":2,"content":"gradients down and say okay let's work"},{"from":898.51,"to":902.5,"location":2,"content":"out the gradients of this word window"},{"from":902.5,"to":906.58,"location":2,"content":"vector and if these are of dimension D"},{"from":906.58,"to":909.52,"location":2,"content":"we'll have this sort of five D vector"},{"from":909.52,"to":912.73,"location":2,"content":"but you know then what do we do about it"},{"from":912.73,"to":915.1,"location":2,"content":"and the answer to what we do about it is"},{"from":915.1,"to":918.67,"location":2,"content":"we can just sort of split this window"},{"from":918.67,"to":921.97,"location":2,"content":"vector into five pieces and say aha"},{"from":921.97,"to":924.76,"location":2,"content":"we have five updates to word vectors"},{"from":924.76,"to":926.59,"location":2,"content":"we're just going to go off and apply"},{"from":926.59,"to":931.51,"location":2,"content":"them to the words vector matrix and you"},{"from":931.51,"to":934.47,"location":2,"content":"know if we if the same word occurs twice"},{"from":934.47,"to":938.14,"location":2,"content":"in that window we literally apply both"},{"from":938.14,"to":939.16,"location":2,"content":"of the updates so"},{"from":939.16,"to":941.62,"location":2,"content":"it is it's updated twice or maybe"},{"from":941.62,"to":943.15,"location":2,"content":"actually you want to sum them first and"},{"from":943.15,"to":944.89,"location":2,"content":"then do the update once but yeah that's"},{"from":944.89,"to":949.51,"location":2,"content":"a technical issue so so what that"},{"from":949.51,"to":951.87,"location":2,"content":"actually means is that we're extremely"},{"from":951.87,"to":956.38,"location":2,"content":"sparsely updating the word vector matrix"},{"from":956.38,"to":958.21,"location":2,"content":"because most of the word vector matrix"},{"from":958.21,"to":961.06,"location":2,"content":"will be unchanged and just a few rows of"},{"from":961.06,"to":964.81,"location":2,"content":"it will be being updated and if soon"},{"from":964.81,"to":966.37,"location":2,"content":"we're going to be you know doing stuff"},{"from":966.37,"to":969.4,"location":2,"content":"with PI torch and if you poke around PI"},{"from":969.4,"to":972.43,"location":2,"content":"torch even has some special stuff look"},{"from":972.43,"to":975.37,"location":2,"content":"for things like sparse SGD for meaning"},{"from":975.37,"to":977.05,"location":2,"content":"that you're sort of doing a very sparse"},{"from":977.05,"to":981.1,"location":2,"content":"updating like that but there's one other"},{"from":981.1,"to":983.77,"location":2,"content":"sort of interesting thing that you"},{"from":983.77,"to":986.65,"location":2,"content":"should know about for a lot of things"},{"from":986.65,"to":988.42,"location":2,"content":"that you do is just what actually"},{"from":988.42,"to":991.3,"location":2,"content":"happens if we push down these gradients"},{"from":991.3,"to":994.78,"location":2,"content":"into our word vectors well the idea is"},{"from":994.78,"to":996.94,"location":2,"content":"you know if we do that would be just"},{"from":996.94,"to":999.34,"location":2,"content":"like all other neural net learning 
that"},{"from":999.34,"to":1003.27,"location":2,"content":"we will sort of in principle say move"},{"from":1003.27,"to":1006.57,"location":2,"content":"the word vectors around in such a way as"},{"from":1006.57,"to":1009.51,"location":2,"content":"they're more useful in helping determine"},{"from":1009.51,"to":1012.21,"location":2,"content":"named entity classification in this case"},{"from":1012.21,"to":1014.21,"location":2,"content":"because that was our motivating example"},{"from":1014.21,"to":1017.13,"location":2,"content":"so you know it might for example learn"},{"from":1017.13,"to":1019.77,"location":2,"content":"that the word in is a very good"},{"from":1019.77,"to":1023.28,"location":2,"content":"indicator of named into default I'm"},{"from":1023.28,"to":1025.53,"location":2,"content":"sorry the place name following so after"},{"from":1025.53,"to":1028.35,"location":2,"content":"in you often get London Paris etc right"},{"from":1028.35,"to":1030,"location":2,"content":"so it's sort of got a special behavior"},{"from":1030,"to":1032.67,"location":2,"content":"that other prepositions don't as being a"},{"from":1032.67,"to":1035.16,"location":2,"content":"good location indicator and so it could"},{"from":1035.16,"to":1038.49,"location":2,"content":"sort of move its location around and say"},{"from":1038.49,"to":1041.25,"location":2,"content":"here are words that are good location"},{"from":1041.25,"to":1043.41,"location":2,"content":"indicators and therefore help our"},{"from":1043.41,"to":1046.71,"location":2,"content":"classifier work even better so on"},{"from":1046.71,"to":1048.99,"location":2,"content":"principle that's good and it's a good"},{"from":1048.99,"to":1052.14,"location":2,"content":"thing to do to update word vectors to"},{"from":1052.14,"to":1054.69,"location":2,"content":"help you perform better on a supervised"},{"from":1054.69,"to":1057.39,"location":2,"content":"tasks such as this named entity"},{"from":1057.39,"to":1060.27,"location":2,"content":"recognition classification but there's a"},{"from":1060.27,"to":1063.51,"location":2,"content":"catch which is that it doesn't always"},{"from":1063.51,"to":1066.12,"location":2,"content":"work actually and so why doesn't it"},{"from":1066.12,"to":1068.46,"location":2,"content":"always work well suppose that we're"},{"from":1068.46,"to":1071.79,"location":2,"content":"training a classifier you know it could"},{"from":1071.79,"to":1072.61,"location":2,"content":"be the"},{"from":1072.61,"to":1075.4,"location":2,"content":"one I just did or a softmax or logistic"},{"from":1075.4,"to":1079.11,"location":2,"content":"regression and we're wanting to classify"},{"from":1079.11,"to":1081.88,"location":2,"content":"movie reviews sentiment for positive or"},{"from":1081.88,"to":1085.69,"location":2,"content":"negative well you know if we have"},{"from":1085.69,"to":1088.3,"location":2,"content":"trained our word vectors we've got some"},{"from":1088.3,"to":1090.97,"location":2,"content":"word vector space and maybe in the word"},{"from":1090.97,"to":1094.87,"location":2,"content":"vector space TV telly and television are"},{"from":1094.87,"to":1097.15,"location":2,"content":"all very close together because they"},{"from":1097.15,"to":1100,"location":2,"content":"mean basically the same thing so that's"},{"from":1100,"to":1103.18,"location":2,"content":"great our word vectors are good but well"},{"from":1103.18,"to":1106.12,"location":2,"content":"suppose it was the case that in 
our"},{"from":1106.12,"to":1109.09,"location":2,"content":"training data for our classifier so this"},{"from":1109.09,"to":1111.76,"location":2,"content":"is our training data for movie sentiment"},{"from":1111.76,"to":1115.81,"location":2,"content":"review we had the word TV and telly but"},{"from":1115.81,"to":1118.72,"location":2,"content":"we didn't have the word television well"},{"from":1118.72,"to":1121.32,"location":2,"content":"then what's going to happen is well"},{"from":1121.32,"to":1124.27,"location":2,"content":"while we try and train our sentiment"},{"from":1124.27,"to":1127.69,"location":2,"content":"classifier if we push gradient back down"},{"from":1127.69,"to":1130.36,"location":2,"content":"into the word vectors what's likely to"},{"from":1130.36,"to":1133.9,"location":2,"content":"happen is that it will move around the"},{"from":1133.9,"to":1136.63,"location":2,"content":"word vectors of the words we saw in the"},{"from":1136.63,"to":1139.12,"location":2,"content":"training data but necessarily"},{"from":1139.12,"to":1141.22,"location":2,"content":"televisions not moving right because"},{"from":1141.22,"to":1143.38,"location":2,"content":"we're only pushing gradient down to"},{"from":1143.38,"to":1145.6,"location":2,"content":"words or in our training data so this"},{"from":1145.6,"to":1147.91,"location":2,"content":"word goes nowhere so it just stays where"},{"from":1147.91,"to":1150.79,"location":2,"content":"it was all along so if the result of our"},{"from":1150.79,"to":1153.91,"location":2,"content":"training is words get moved around so"},{"from":1153.91,"to":1155.5,"location":2,"content":"here are good words for indicating"},{"from":1155.5,"to":1159.19,"location":2,"content":"negative sentiment will actually if a"},{"from":1159.19,"to":1162.1,"location":2,"content":"test time when we're running our model"},{"from":1162.1,"to":1164.2,"location":2,"content":"if we evaluate on a sentence with"},{"from":1164.2,"to":1166.12,"location":2,"content":"television in it it's actually going to"},{"from":1166.12,"to":1168.55,"location":2,"content":"give the wrong answer whereas if we"},{"from":1168.55,"to":1171.13,"location":2,"content":"haven't changed the word vectors at all"},{"from":1171.13,"to":1175.09,"location":2,"content":"and it just left them where our word"},{"from":1175.09,"to":1177.82,"location":2,"content":"embedding learning system put them then"},{"from":1177.82,"to":1179.86,"location":2,"content":"it would have said television that's a"},{"from":1179.86,"to":1182.08,"location":2,"content":"word that means about the same as TV or"},{"from":1182.08,"to":1183.97,"location":2,"content":"tele I should treat it the same in my"},{"from":1183.97,"to":1185.98,"location":2,"content":"sentiment classifier and it would"},{"from":1185.98,"to":1189.06,"location":2,"content":"actually do a better job so it's sort of"},{"from":1189.06,"to":1191.86,"location":2,"content":"two-sided whether you've gained by"},{"from":1191.86,"to":1195.79,"location":2,"content":"training word vectors and so this is a"},{"from":1195.79,"to":1198.91,"location":2,"content":"summary that says that it's two-sided"},{"from":1198.91,"to":1202.03,"location":2,"content":"and practically what you should do so"},{"from":1202.03,"to":1204.84,"location":2,"content":"the first choice is gee"},{"from":1204.84,"to":1208.02,"location":2,"content":"it's a good idea to use pre-trained word"},{"from":1208.02,"to":1210.15,"location":2,"content":"vectors like the word to vech vech 
ters"},{"from":1210.15,"to":1213.09,"location":2,"content":"that used an assignment one or using the"},{"from":1213.09,"to":1215.19,"location":2,"content":"training methods that you're doing right"},{"from":1215.19,"to":1218.16,"location":2,"content":"now for homework two and the answer that"},{"from":1218.16,"to":1221.7,"location":2,"content":"is almost always yes and the reason for"},{"from":1221.7,"to":1223.98,"location":2,"content":"that is these word vector training"},{"from":1223.98,"to":1227.82,"location":2,"content":"methods are extremely easy to run on"},{"from":1227.82,"to":1230.22,"location":2,"content":"billions and words of text so we you"},{"from":1230.22,"to":1232.02,"location":2,"content":"know train these models like glove or"},{"from":1232.02,"to":1234.57,"location":2,"content":"word thick on billions or tens of"},{"from":1234.57,"to":1236.76,"location":2,"content":"billions of words and it's easy to do"},{"from":1236.76,"to":1239.28,"location":2,"content":"that for two reasons firstly because the"},{"from":1239.28,"to":1241.14,"location":2,"content":"training algorithms are very simple"},{"from":1241.14,"to":1243.72,"location":2,"content":"right that the word defect training"},{"from":1243.72,"to":1246,"location":2,"content":"algorithm skip Graham's very simple"},{"from":1246,"to":1249.36,"location":2,"content":"algorithm secondly because we don't need"},{"from":1249.36,"to":1251.43,"location":2,"content":"any expensive resources all we need is a"},{"from":1251.43,"to":1253.56,"location":2,"content":"big pile of text documents and we can"},{"from":1253.56,"to":1256.35,"location":2,"content":"run it on them so really easy to run it"},{"from":1256.35,"to":1258.77,"location":2,"content":"on you know five or fifty billion words"},{"from":1258.77,"to":1261.9,"location":2,"content":"whereas you know we can't do that for"},{"from":1261.9,"to":1263.61,"location":2,"content":"most of the classifiers that we want to"},{"from":1263.61,"to":1264.87,"location":2,"content":"build because if it's something like a"},{"from":1264.87,"to":1266.64,"location":2,"content":"sentiment classifier or a named Indy"},{"from":1266.64,"to":1269.67,"location":2,"content":"recognizer we need label training data"},{"from":1269.67,"to":1272.73,"location":2,"content":"to train our classifier and then we ask"},{"from":1272.73,"to":1274.56,"location":2,"content":"someone or how many words of label"},{"from":1274.56,"to":1276.42,"location":2,"content":"training data do you have for named"},{"from":1276.42,"to":1278.7,"location":2,"content":"entity recognition and they give us back"},{"from":1278.7,"to":1280.44,"location":2,"content":"a number like three hundred thousand"},{"from":1280.44,"to":1282.18,"location":2,"content":"words or 1 million words right it's"},{"from":1282.18,"to":1286.56,"location":2,"content":"orders of magnitude smaller ok so"},{"from":1286.56,"to":1289.47,"location":2,"content":"therefore we can gain using pre trained"},{"from":1289.47,"to":1291.75,"location":2,"content":"word vectors because they know about all"},{"from":1291.75,"to":1293.85,"location":2,"content":"the words that aren't in our supervised"},{"from":1293.85,"to":1296.19,"location":2,"content":"classifiers training data and they also"},{"from":1296.19,"to":1297.72,"location":2,"content":"know much more about the words that"},{"from":1297.72,"to":1299.61,"location":2,"content":"actually are in the training data but"},{"from":1299.61,"to":1302.37,"location":2,"content":"only rarely so the exception to that 
is"},{"from":1302.37,"to":1304.32,"location":2,"content":"if you have hundreds of millions of"},{"from":1304.32,"to":1306.45,"location":2,"content":"words of data then you can start off"},{"from":1306.45,"to":1309.48,"location":2,"content":"with random word vectors and go from"},{"from":1309.48,"to":1311.25,"location":2,"content":"there and so a case where this is"},{"from":1311.25,"to":1313.53,"location":2,"content":"actually commonly done is for machine"},{"from":1313.53,"to":1315.36,"location":2,"content":"translation which we do later in the"},{"from":1315.36,"to":1318.77,"location":2,"content":"class it's relatively easy for large"},{"from":1318.77,"to":1321.63,"location":2,"content":"languages to get hundreds of millions of"},{"from":1321.63,"to":1323.7,"location":2,"content":"words of translated text if you want to"},{"from":1323.7,"to":1326.64,"location":2,"content":"build something like German English or"},{"from":1326.64,"to":1328.89,"location":2,"content":"Chinese English machine translation"},{"from":1328.89,"to":1331.62,"location":2,"content":"system not hard to get a hundred and"},{"from":1331.62,"to":1334.2,"location":2,"content":"fifty million words of translated text"},{"from":1334.2,"to":1336,"location":2,"content":"and so that's sort of sufficiently much"},{"from":1336,"to":1338.55,"location":2,"content":"data that people commonly just"},{"from":1338.55,"to":1341.88,"location":2,"content":"start with word vectors being randomly"},{"from":1341.88,"to":1345,"location":2,"content":"initialized and start training their"},{"from":1345,"to":1348.78,"location":2,"content":"translation system okay so in the second"},{"from":1348.78,"to":1350.79,"location":2,"content":"question is okay I'm using pre trained"},{"from":1350.79,"to":1354.54,"location":2,"content":"word vectors when I train my supervised"},{"from":1354.54,"to":1357.42,"location":2,"content":"classifier should I push gradients down"},{"from":1357.42,"to":1359.64,"location":2,"content":"into the word vectors and up and update"},{"from":1359.64,"to":1361.38,"location":2,"content":"them which is often referred to as"},{"from":1361.38,"to":1365.31,"location":2,"content":"fine-tuning the word vectors or should I"},{"from":1365.31,"to":1367.56,"location":2,"content":"not should I just sort of throw away"},{"from":1367.56,"to":1369.06,"location":2,"content":"those gradients and not push them down"},{"from":1369.06,"to":1371.97,"location":2,"content":"into the word vectors and you know the"},{"from":1371.97,"to":1374.01,"location":2,"content":"answer that is it depends and it just"},{"from":1374.01,"to":1376.44,"location":2,"content":"depends on the size so if you only have"},{"from":1376.44,"to":1379.82,"location":2,"content":"a small training data set"},{"from":1379.82,"to":1382.89,"location":2,"content":"typically it's best to just treat the"},{"from":1382.89,"to":1386.46,"location":2,"content":"pre trained word vectors as fixed and"},{"from":1386.46,"to":1388.77,"location":2,"content":"not do any updating of them at all if"},{"from":1388.77,"to":1392.22,"location":2,"content":"you have a large data set then you can"},{"from":1392.22,"to":1395.31,"location":2,"content":"normally gain by doing fine-tuning of"},{"from":1395.31,"to":1397.38,"location":2,"content":"the word vectors and of course the"},{"from":1397.38,"to":1400.53,"location":2,"content":"answer here is what counts as large you"},{"from":1400.53,"to":1402.63,"location":2,"content":"know if certainly if you're down in 
the"},{"from":1402.63,"to":1404.37,"location":2,"content":"regime of a hundred thousand words a"},{"from":1404.37,"to":1406.02,"location":2,"content":"couple hundred thousand words you're"},{"from":1406.02,"to":1409.23,"location":2,"content":"small if your time to be over a million"},{"from":1409.23,"to":1411.21,"location":2,"content":"words then maybe you're large but you"},{"from":1411.21,"to":1412.65,"location":2,"content":"know and practice people do it both ways"},{"from":1412.65,"to":1414.18,"location":2,"content":"and see which number is higher and"},{"from":1414.18,"to":1419.28,"location":2,"content":"that's what they stick with ya then the"},{"from":1419.28,"to":1422.13,"location":2,"content":"sort of there's the sort of point here"},{"from":1422.13,"to":1424.92,"location":2,"content":"that's just worth underlining is yeah so"},{"from":1424.92,"to":1427.83,"location":2,"content":"in principle we can back propagate this"},{"from":1427.83,"to":1431.66,"location":2,"content":"gradient to every variable in our model"},{"from":1431.66,"to":1435.75,"location":2,"content":"it's actually a theorem that we can"},{"from":1435.75,"to":1438.81,"location":2,"content":"arbitrarily decide to throw any subset"},{"from":1438.81,"to":1443.88,"location":2,"content":"of those gradients away and we're still"},{"from":1443.88,"to":1446.73,"location":2,"content":"improving the log likelihood of our"},{"from":1446.73,"to":1448.71,"location":2,"content":"model right it kind of can't be"},{"from":1448.71,"to":1450.99,"location":2,"content":"inconsistent you can just so pick some"},{"from":1450.99,"to":1453.6,"location":2,"content":"subset and say only train those 37 and"},{"from":1453.6,"to":1455.37,"location":2,"content":"throw away all the rest and the"},{"from":1455.37,"to":1457.95,"location":2,"content":"algorithm will still improve the log"},{"from":1457.95,"to":1459.75,"location":2,"content":"likelihood of the model perhaps not by"},{"from":1459.75,"to":1461.49,"location":2,"content":"as much as if you train the rest of the"},{"from":1461.49,"to":1464.58,"location":2,"content":"variables as well but yeah it can't"},{"from":1464.58,"to":1466.2,"location":2,"content":"actually do any harm not to train"},{"from":1466.2,"to":1468.75,"location":2,"content":"anything that's one of the reasons why"},{"from":1468.75,"to":1470.91,"location":2,"content":"often people don't notice bugs in their"},{"from":1470.91,"to":1472.62,"location":2,"content":"code as well is"},{"from":1472.62,"to":1474.39,"location":2,"content":"because if your code is kind of broken"},{"from":1474.39,"to":1476.73,"location":2,"content":"and only half of the variables are being"},{"from":1476.73,"to":1479.04,"location":2,"content":"updated it'll still seem to be training"},{"from":1479.04,"to":1481.35,"location":2,"content":"something and improving it's just not"},{"from":1481.35,"to":1483.36,"location":2,"content":"doing as well as it could be doing if"},{"from":1483.36,"to":1487.08,"location":2,"content":"you'd code it correctly okay"},{"from":1487.08,"to":1491.34,"location":2,"content":"so at this point that sort of almost"},{"from":1491.34,"to":1494.04,"location":2,"content":"shown you back propagation right so back"},{"from":1494.04,"to":1496.8,"location":2,"content":"propagation is really taking derivatives"},{"from":1496.8,"to":1499.35,"location":2,"content":"with a generalized chain role with the"},{"from":1499.35,"to":1501.63,"location":2,"content":"one further trick which we sort 
of"},{"from":1501.63,"to":1504.51,"location":2,"content":"represented with that Delta which is G"},{"from":1504.51,"to":1508.47,"location":2,"content":"you want to be clever in doing this so"},{"from":1508.47,"to":1510.81,"location":2,"content":"you minimize computation by reusing"},{"from":1510.81,"to":1514.77,"location":2,"content":"shared stuff but now what I want to move"},{"from":1514.77,"to":1516.51,"location":2,"content":"on is to sort of look at how we can do"},{"from":1516.51,"to":1519.21,"location":2,"content":"that much more systematically which is"},{"from":1519.21,"to":1521.37,"location":2,"content":"this idea we have a computation graph"},{"from":1521.37,"to":1522.75,"location":2,"content":"and we're going to run a back"},{"from":1522.75,"to":1524.31,"location":2,"content":"propagation algorithm through the"},{"from":1524.31,"to":1529.5,"location":2,"content":"computation graph so this is kind of"},{"from":1529.5,"to":1534.36,"location":2,"content":"like an abstract syntax tree expression"},{"from":1534.36,"to":1536.49,"location":2,"content":"tree that you might see in a compilers"},{"from":1536.49,"to":1538.56,"location":2,"content":"class or something like that right so"},{"from":1538.56,"to":1542.31,"location":2,"content":"when we have an arithmetic expression of"},{"from":1542.31,"to":1544.35,"location":2,"content":"the kind that we're going to compute we"},{"from":1544.35,"to":1546.57,"location":2,"content":"can make this tipped over on its side"},{"from":1546.57,"to":1549.51,"location":2,"content":"tree representation so we've got the X"},{"from":1549.51,"to":1551.88,"location":2,"content":"and W variables we're going to multiply"},{"from":1551.88,"to":1554.13,"location":2,"content":"them there's the B variable we're going"},{"from":1554.13,"to":1556.11,"location":2,"content":"to add it to the previous partial result"},{"from":1556.11,"to":1557.58,"location":2,"content":"we're going to stick it through our"},{"from":1557.58,"to":1559.89,"location":2,"content":"non-linearity F and then we're going to"},{"from":1559.89,"to":1561.87,"location":2,"content":"multiply it by U and that was the"},{"from":1561.87,"to":1564.18,"location":2,"content":"computation that we're doing in our"},{"from":1564.18,"to":1567.84,"location":2,"content":"neural network so the source nodes are"},{"from":1567.84,"to":1570.6,"location":2,"content":"inputs the interior nodes of this tree"},{"from":1570.6,"to":1573.42,"location":2,"content":"are operations and then we've got these"},{"from":1573.42,"to":1576.12,"location":2,"content":"edges that pass along the results of our"},{"from":1576.12,"to":1578.04,"location":2,"content":"computation and so this is the"},{"from":1578.04,"to":1580.11,"location":2,"content":"computation graph for precisely the"},{"from":1580.11,"to":1582.27,"location":2,"content":"example I've been doing for the last"},{"from":1582.27,"to":1585.96,"location":2,"content":"lecture and electron or Eilish okay so"},{"from":1585.96,"to":1587.64,"location":2,"content":"there are two things that we want to be"},{"from":1587.64,"to":1590.46,"location":2,"content":"able to do the first one is we want to"},{"from":1590.46,"to":1592.23,"location":2,"content":"be able to start with these variables"},{"from":1592.23,"to":1595.02,"location":2,"content":"and do this computation and calculate"},{"from":1595.02,"to":1597.54,"location":2,"content":"what s is that's the part that's dead"},{"from":1597.54,"to":1600.84,"location":2,"content":"simple that's referred to as 
forward"},{"from":1600.84,"to":1603.3,"location":2,"content":"propagation so forward propagation is"},{"from":1603.3,"to":1606.54,"location":2,"content":"just expression evaluation as you do"},{"from":1606.54,"to":1608.76,"location":2,"content":"in any programming language interpreter"},{"from":1608.76,"to":1612.72,"location":2,"content":"that's not hard at all but the"},{"from":1612.72,"to":1614.94,"location":2,"content":"difference here is hey we want to do a"},{"from":1614.94,"to":1617.25,"location":2,"content":"learning algorithm so we're going to do"},{"from":1617.25,"to":1619.56,"location":2,"content":"the opposite of that as well"},{"from":1619.56,"to":1622.35,"location":2,"content":"what we want to be able to do is also"},{"from":1622.35,"to":1625.02,"location":2,"content":"backward propagation or back propagation"},{"from":1625.02,"to":1627.06,"location":2,"content":"or just back prop it's commonly called"},{"from":1627.06,"to":1630.78,"location":2,"content":"which is we want to be able to go from"},{"from":1630.78,"to":1634.2,"location":2,"content":"the final part the final part here and"},{"from":1634.2,"to":1637.08,"location":2,"content":"then at each step we want to be"},{"from":1637.08,"to":1639.9,"location":2,"content":"calculating these partial derivatives"},{"from":1639.9,"to":1642.15,"location":2,"content":"and passing them back through the graph"},{"from":1642.15,"to":1644.61,"location":2,"content":"and so this was sort of the notion"},{"from":1644.61,"to":1647.04,"location":2,"content":"before that we had an error signal right"},{"from":1647.04,"to":1649.05,"location":2,"content":"so starting from up here we've"},{"from":1649.05,"to":1652.35,"location":2,"content":"calculated a partial of s by Z which is"},{"from":1652.35,"to":1655.47,"location":2,"content":"this with respect to that and so that's"},{"from":1655.47,"to":1657.87,"location":2,"content":"sort of our calculated error signal up"},{"from":1657.87,"to":1659.79,"location":2,"content":"to here and then we want to pass that"},{"from":1659.79,"to":1663.83,"location":2,"content":"further back to start computing our"},{"from":1663.83,"to":1666.72,"location":2,"content":"gradients further back right and we"},{"from":1666.72,"to":1670.29,"location":2,"content":"started off right here with the partial"},{"from":1670.29,"to":1673.05,"location":2,"content":"of s by s what's the partial of s but is"},{"from":1673.05,"to":1677.52,"location":2,"content":"going to be one okay yes so the rate at"},{"from":1677.52,"to":1679.32,"location":2,"content":"which s changes the rate at which s"},{"from":1679.32,"to":1681.69,"location":2,"content":"changes so we just start off with one"},{"from":1681.69,"to":1684.15,"location":2,"content":"and then we want to work out how this"},{"from":1684.15,"to":1689.91,"location":2,"content":"gradient changes as we go along so what"},{"from":1689.91,"to":1692.46,"location":2,"content":"we're doing here is when we're working"},{"from":1692.46,"to":1695.73,"location":2,"content":"out things for one node that a node is"},{"from":1695.73,"to":1697.44,"location":2,"content":"going to have passed into it it's"},{"from":1697.44,"to":1699.54,"location":2,"content":"upstream gradient which is its error"},{"from":1699.54,"to":1702.9,"location":2,"content":"signal so that's the partial of our"},{"from":1702.9,"to":1708.21,"location":2,"content":"final final result which was our loss by"},{"from":1708.21,"to":1711.54,"location":2,"content":"the variable that was the output of 
this"},{"from":1711.54,"to":1713.58,"location":2,"content":"computation node so that's the partial"},{"from":1713.58,"to":1717.75,"location":2,"content":"of s by H here and then we did some"},{"from":1717.75,"to":1720.36,"location":2,"content":"operation here here's the non-linearity"},{"from":1720.36,"to":1723.18,"location":2,"content":"but it might be something else and so"},{"from":1723.18,"to":1725.1,"location":2,"content":"what we wanted then work out is a"},{"from":1725.1,"to":1727.98,"location":2,"content":"downstream gradient which is the partial"},{"from":1727.98,"to":1730.8,"location":2,"content":"of s by Z which was the input of this"},{"from":1730.8,"to":1733.08,"location":2,"content":"function and well then the question is"},{"from":1733.08,"to":1736.74,"location":2,"content":"how do we do that and the answer that is"},{"from":1736.74,"to":1739.26,"location":2,"content":"we use the chain rule of course right so"},{"from":1739.26,"to":1740.12,"location":2,"content":"at"},{"from":1740.12,"to":1742.94,"location":2,"content":"we have a concept of a local gradient so"},{"from":1742.94,"to":1748.67,"location":2,"content":"here's H is the output z is the input so"},{"from":1748.67,"to":1750.62,"location":2,"content":"this function here and this is our"},{"from":1750.62,"to":1752.6,"location":2,"content":"non-linearity right so this is whatever"},{"from":1752.6,"to":1754.97,"location":2,"content":"we're using as our non-linearity like a"},{"from":1754.97,"to":1757.13,"location":2,"content":"logistic or at an age we're calculating"},{"from":1757.13,"to":1760.22,"location":2,"content":"H in terms of Z and we can work out the"},{"from":1760.22,"to":1762.35,"location":2,"content":"partial of H by Z so that's our local"},{"from":1762.35,"to":1765.26,"location":2,"content":"gradient and so then if we have both the"},{"from":1765.26,"to":1768.05,"location":2,"content":"upstream gradient and the local gradient"},{"from":1768.05,"to":1770.81,"location":2,"content":"we can then work out the downstream"},{"from":1770.81,"to":1773.48,"location":2,"content":"gradient because we know the partial of"},{"from":1773.48,"to":1778.4,"location":2,"content":"s by Z is going to be DSD a times D H DZ"},{"from":1778.4,"to":1781.07,"location":2,"content":"and so then we'll be able to pass down"},{"from":1781.07,"to":1784.78,"location":2,"content":"the downstream gradient to the next node"},{"from":1784.78,"to":1788.78,"location":2,"content":"ok so our basic rule which is just the"},{"from":1788.78,"to":1792.25,"location":2,"content":"chain rule written in different terms is"},{"from":1792.25,"to":1795.38,"location":2,"content":"downstream gradient equals upstream"},{"from":1795.38,"to":1799.01,"location":2,"content":"gradient times local gradient easy as"},{"from":1799.01,"to":1804.95,"location":2,"content":"that ok so this was the very simplest"},{"from":1804.95,"to":1807.65,"location":2,"content":"case where we have a node with one input"},{"from":1807.65,"to":1811.34,"location":2,"content":"and one output so that's a function like"},{"from":1811.34,"to":1813.83,"location":2,"content":"our logistic function but we also want"},{"from":1813.83,"to":1815.48,"location":2,"content":"to have things work out for a general"},{"from":1815.48,"to":1817.76,"location":2,"content":"computation graph so how are we going to"},{"from":1817.76,"to":1821.84,"location":2,"content":"do that well the next case is what about"},{"from":1821.84,"to":1824.66,"location":2,"content":"if we have multiple inputs so if 
we're"},{"from":1824.66,"to":1828.62,"location":2,"content":"calculating something like Z equals W"},{"from":1828.62,"to":1832.94,"location":2,"content":"times X we're actually yes Z and X of"},{"from":1832.94,"to":1838.7,"location":2,"content":"themselves vectors and W is a matrix but"},{"from":1838.7,"to":1841.55,"location":2,"content":"we're treating X as only important W as"},{"from":1841.55,"to":1844.82,"location":2,"content":"an input and Z is our output right we"},{"from":1844.82,"to":1846.02,"location":2,"content":"kind of group vectors and matrices"},{"from":1846.02,"to":1849.89,"location":2,"content":"together well if you have multiple"},{"from":1849.89,"to":1852.95,"location":2,"content":"inputs you then end up with multiple"},{"from":1852.95,"to":1855.77,"location":2,"content":"local gradients so you can work out the"},{"from":1855.77,"to":1858.26,"location":2,"content":"partial of Z with respect to X or the"},{"from":1858.26,"to":1861.26,"location":2,"content":"partial of Z u with respect to W and so"},{"from":1861.26,"to":1864.29,"location":2,"content":"you essentially you take the upstream"},{"from":1864.29,"to":1867.8,"location":2,"content":"gradient you multiply it by each of the"},{"from":1867.8,"to":1870.8,"location":2,"content":"local gradients and you pass it down the"},{"from":1870.8,"to":1873.47,"location":2,"content":"respective path and we calculate these"},{"from":1873.47,"to":1874.04,"location":2,"content":"differ"},{"from":1874.04,"to":1877.33,"location":2,"content":"and downstream gradients to pass along"},{"from":1877.33,"to":1881.26,"location":2,"content":"that making sense"},{"from":1881.26,"to":1885.8,"location":2,"content":"yeah okay I'll chug"},{"from":1885.8,"to":1889.43,"location":2,"content":"okay so let's sort of look at an example"},{"from":1889.43,"to":1891.44,"location":2,"content":"of this and then we'll see one another"},{"from":1891.44,"to":1894.02,"location":2,"content":"case so here's a little baby example"},{"from":1894.02,"to":1896.18,"location":2,"content":"this isn't kind of really looking like a"},{"from":1896.18,"to":1899.78,"location":2,"content":"neural net but we've got three imports x"},{"from":1899.78,"to":1902.81,"location":2,"content":"y and z and x and y get added together"},{"from":1902.81,"to":1906.68,"location":2,"content":"and y and z get maxed and then we take"},{"from":1906.68,"to":1908.96,"location":2,"content":"the results of those two operations and"},{"from":1908.96,"to":1911.6,"location":2,"content":"we multiply them together so overall"},{"from":1911.6,"to":1914.63,"location":2,"content":"what we're calculating is X plus y times"},{"from":1914.63,"to":1918.56,"location":2,"content":"the max of y plus C but you know we have"},{"from":1918.56,"to":1921.89,"location":2,"content":"here a general technique and we can"},{"from":1921.89,"to":1926.81,"location":2,"content":"apply it in any cases okay so if we want"},{"from":1926.81,"to":1928.76,"location":2,"content":"to have this graph and we want to run it"},{"from":1928.76,"to":1931.85,"location":2,"content":"forward well we need to know the values"},{"from":1931.85,"to":1935.42,"location":2,"content":"of x y and z so for my example x equals"},{"from":1935.42,"to":1941.75,"location":2,"content":"1 y equals to z equals 0 so we take the"},{"from":1941.75,"to":1944.18,"location":2,"content":"values of those variables and push them"},{"from":1944.18,"to":1947.24,"location":2,"content":"on to the calculations for the 
forward"},{"from":1947.24,"to":1950.21,"location":2,"content":"arrows and then well the first thing we"},{"from":1950.21,"to":1952.01,"location":2,"content":"do is add and the result of that is"},{"from":1952.01,"to":1953.93,"location":2,"content":"three and so we can put that onto the"},{"from":1953.93,"to":1956.45,"location":2,"content":"arrow that's the output of ad max it's 2"},{"from":1956.45,"to":1959.27,"location":2,"content":"is the output of the value of air x is 6"},{"from":1959.27,"to":1961.46,"location":2,"content":"and so the forward pass we have"},{"from":1961.46,"to":1964.52,"location":2,"content":"evaluated the expression it's value is 6"},{"from":1964.52,"to":1967.4,"location":2,"content":"that wasn't hard ok so then the next"},{"from":1967.4,"to":1970.73,"location":2,"content":"step is we then want to run that"},{"from":1970.73,"to":1975.71,"location":2,"content":"propagation to work out gradients and so"},{"from":1975.71,"to":1979.93,"location":2,"content":"we sort of want to know how to sort of"},{"from":1979.93,"to":1987.11,"location":2,"content":"work out these local gradients so a is"},{"from":1987.11,"to":1990.68,"location":2,"content":"out right a is the result of some so"},{"from":1990.68,"to":1992.86,"location":2,"content":"here's a as the result of some so a"},{"from":1992.86,"to":1996.47,"location":2,"content":"equals x plus y so if you're taking da"},{"from":1996.47,"to":2001.66,"location":2,"content":"DX that's just 1 and da dy is also 1"},{"from":2001.66,"to":2003.33,"location":2,"content":"that makes sense"},{"from":2003.33,"to":2007.33,"location":2,"content":"the max is slightly trickier because"},{"from":2007.33,"to":2010.96,"location":2,"content":"where there's some slope some gradient"},{"from":2010.96,"to":2012.85,"location":2,"content":"for the max depends on which one's"},{"from":2012.85,"to":2017.52,"location":2,"content":"bigger so if y is bigger than Z D Delta"},{"from":2017.52,"to":2022.06,"location":2,"content":"the partial of B by Z partial B by Y is"},{"from":2022.06,"to":2027.31,"location":2,"content":"1 otherwise at 0 and conversely for the"},{"from":2027.31,"to":2030.46,"location":2,"content":"partial of B by Z so now that one's a"},{"from":2030.46,"to":2034.21,"location":2,"content":"little bit dependent and then we do the"},{"from":2034.21,"to":2039.49,"location":2,"content":"multiplication case at the end and work"},{"from":2039.49,"to":2043.66,"location":2,"content":"out its partials with respect to a and B"},{"from":2043.66,"to":2047.71,"location":2,"content":"and since that's a B which has the"},{"from":2047.71,"to":2050.44,"location":2,"content":"values 2 and 3 if you're taking the"},{"from":2050.44,"to":2052.69,"location":2,"content":"partial of F by a it equals B which is 2"},{"from":2052.69,"to":2056.11,"location":2,"content":"and vice versa okay so that means we can"},{"from":2056.11,"to":2058.63,"location":2,"content":"work out the local gradients at each"},{"from":2058.63,"to":2062.65,"location":2,"content":"node and so then we want to use those to"},{"from":2062.65,"to":2064.96,"location":2,"content":"calculate our gradients backwards in the"},{"from":2064.96,"to":2067.42,"location":2,"content":"back propagation path so we start at the"},{"from":2067.42,"to":2069.85,"location":2,"content":"top the partial of F with respect to F"},{"from":2069.85,"to":2074.05,"location":2,"content":"is 1 because if you move if you know by"},{"from":2074.05,"to":2077.5,"location":2,"content":"1/10 then you've moved the F by 1/10 
so"},{"from":2077.5,"to":2080.26,"location":2,"content":"that's a Counsel's out as 1 okay so then"},{"from":2080.26,"to":2083.77,"location":2,"content":"we want to pass backwards so the first"},{"from":2083.77,"to":2085.5,"location":2,"content":"thing that we have is this sort of"},{"from":2085.5,"to":2088.33,"location":2,"content":"multiply node and so we work we know"},{"from":2088.33,"to":2090.64,"location":2,"content":"it's local gradients the partial of F by"},{"from":2090.64,"to":2096.45,"location":2,"content":"a is 2 and the partial of F by B is 3"},{"from":2096.45,"to":2100.06,"location":2,"content":"and so we get those values so formally"},{"from":2100.06,"to":2103.15,"location":2,"content":"we're taking the local gradients"},{"from":2103.15,"to":2104.58,"location":2,"content":"multiplying them by the upstream"},{"from":2104.58,"to":2107.62,"location":2,"content":"gradients and getting our 3 and 2 and"},{"from":2107.62,"to":2109.81,"location":2,"content":"notice the fact that sort of effectively"},{"from":2109.81,"to":2111.4,"location":2,"content":"what happened is the values on the two"},{"from":2111.4,"to":2114.79,"location":2,"content":"arcs swaps and then we sort of continue"},{"from":2114.79,"to":2117.88,"location":2,"content":"back okay there's a max node so our"},{"from":2117.88,"to":2120.61,"location":2,"content":"upstream gradient is now 3 and then we"},{"from":2120.61,"to":2123.25,"location":2,"content":"want to multiply by the local gradient"},{"from":2123.25,"to":2126.7,"location":2,"content":"and since the max of these two is 2"},{"from":2126.7,"to":2130.24,"location":2,"content":"there's a slope of 1 on this side so we"},{"from":2130.24,"to":2132.91,"location":2,"content":"get 3 there's no gradient on this side"},{"from":2132.91,"to":2136.05,"location":2,"content":"and we get 0 and then we do the similar"},{"from":2136.05,"to":2139.33,"location":2,"content":"calculation on the other side where we"},{"from":2139.33,"to":2141.04,"location":2,"content":"have local gradients of 1"},{"from":2141.04,"to":2144.22,"location":2,"content":"and so both of them come out of two and"},{"from":2144.22,"to":2146.89,"location":2,"content":"then the one other thing to do is we"},{"from":2146.89,"to":2148.87,"location":2,"content":"notice well wait a minute"},{"from":2148.87,"to":2151.39,"location":2,"content":"there are two arcs that started from the"},{"from":2151.39,"to":2154.66,"location":2,"content":"Y both of which we've back propagated"},{"from":2154.66,"to":2157.57,"location":2,"content":"some gradient on and so what do we do"},{"from":2157.57,"to":2161.08,"location":2,"content":"about that what we do about that is we"},{"from":2161.08,"to":2165.1,"location":2,"content":"sum so the partial of F by X is 2 the"},{"from":2165.1,"to":2167.98,"location":2,"content":"partial of F by Z is 0 but the partial"},{"from":2167.98,"to":2172.32,"location":2,"content":"of F by Y is the sum of the two and five"},{"from":2172.32,"to":2175.24,"location":2,"content":"right and so this isn't complete voodoo"},{"from":2175.24,"to":2178.38,"location":2,"content":"this is something that should make sense"},{"from":2178.38,"to":2182.02,"location":2,"content":"in terms of what gradients are right so"},{"from":2182.02,"to":2184.48,"location":2,"content":"that what we're saying is what we're"},{"from":2184.48,"to":2187.24,"location":2,"content":"calculating is if you wiggle X a little"},{"from":2187.24,"to":2190.15,"location":2,"content":"bit how big an effect does that have 
on"},{"from":2190.15,"to":2192.64,"location":2,"content":"the outcome of the whole thing and so"},{"from":2192.64,"to":2194.17,"location":2,"content":"you know we should be able to work this"},{"from":2194.17,"to":2197.59,"location":2,"content":"out so our X started off as one but"},{"from":2197.59,"to":2200.14,"location":2,"content":"let's suppose we wiggle it up a bit to"},{"from":2200.14,"to":2203.35,"location":2,"content":"make it one point one well according to"},{"from":2203.35,"to":2206.41,"location":2,"content":"this our output should change by about"},{"from":2206.41,"to":2208.81,"location":2,"content":"zero point two it should be magnified by"},{"from":2208.81,"to":2210.76,"location":2,"content":"two and we should be able to work that"},{"from":2210.76,"to":2213.66,"location":2,"content":"out right so it's then one point one"},{"from":2213.66,"to":2217.69,"location":2,"content":"plus two so that's then three point one"},{"from":2217.69,"to":2220.66,"location":2,"content":"and then we've got the two here that"},{"from":2220.66,"to":2223.33,"location":2,"content":"multiplies by it and it's six point two"},{"from":2223.33,"to":2225.4,"location":2,"content":"and lo and behold it went up by point to"},{"from":2225.4,"to":2227.83,"location":2,"content":"right so that seems correct and if we"},{"from":2227.83,"to":2230.68,"location":2,"content":"try and do the same for well let's do"},{"from":2230.68,"to":2232.96,"location":2,"content":"the Z it's easy so if we wiggle the Z"},{"from":2232.96,"to":2235.96,"location":2,"content":"which had a value of the zero by 0.1"},{"from":2235.96,"to":2239.56,"location":2,"content":"this is zero point one when we max it if"},{"from":2239.56,"to":2242.2,"location":2,"content":"this is still two and so a calculated"},{"from":2242.2,"to":2245.11,"location":2,"content":"value doesn't change it's still six so"},{"from":2245.11,"to":2247.36,"location":2,"content":"the gradient here is zero we doing this"},{"from":2247.36,"to":2251.49,"location":2,"content":"does nothing and then the final one is y"},{"from":2251.49,"to":2255.7,"location":2,"content":"so it's starting off value as two so if"},{"from":2255.7,"to":2257.35,"location":2,"content":"we wiggle it a little and make a two"},{"from":2257.35,"to":2261.52,"location":2,"content":"point one our claim is that the results"},{"from":2261.52,"to":2264.61,"location":2,"content":"sort of change by about 0.5 it should be"},{"from":2264.61,"to":2267.34,"location":2,"content":"multiplied by five times so if we make"},{"from":2267.34,"to":2270.49,"location":2,"content":"this two point one we then have two"},{"from":2270.49,"to":2273.49,"location":2,"content":"point one plus one would be three point"},{"from":2273.49,"to":2274.32,"location":2,"content":"one"},{"from":2274.32,"to":2278.67,"location":2,"content":"when we get the max here it also be 2.1"},{"from":2278.67,"to":2281.91,"location":2,"content":"and so it have to point one times three"},{"from":2281.91,"to":2284.46,"location":2,"content":"point one and that's to harder rithmetic"},{"from":2284.46,"to":2288.86,"location":2,"content":"for me to do in my head that if we take"},{"from":2288.86,"to":2294,"location":2,"content":"two point one times three point one it"},{"from":2294,"to":2296.79,"location":2,"content":"comes out of six point five one so"},{"from":2296.79,"to":2299.34,"location":2,"content":"basically it's gone up by half right if"},{"from":2299.34,"to":2301.53,"location":2,"content":"we don't expect the answers to be 
exact"},{"from":2301.53,"to":2303.24,"location":2,"content":"of course right because you know that's"},{"from":2303.24,"to":2305.07,"location":2,"content":"not the way calculus works right well"},{"from":2305.07,"to":2307.74,"location":2,"content":"I'm just that that it's showing that"},{"from":2307.74,"to":2310.2,"location":2,"content":"we're getting the gradients right okay"},{"from":2310.2,"to":2313.65,"location":2,"content":"so this actually works so what are the"},{"from":2313.65,"to":2317.76,"location":2,"content":"techniques that we need to know so we've"},{"from":2317.76,"to":2319.71,"location":2,"content":"sort of already seen them all so you"},{"from":2319.71,"to":2322.38,"location":2,"content":"know we discussed when there are"},{"from":2322.38,"to":2324.75,"location":2,"content":"multiple incoming arcs how he saw work"},{"from":2324.75,"to":2327.48,"location":2,"content":"out the different local derivatives"},{"from":2327.48,"to":2329.91,"location":2,"content":"their main other case that we need to"},{"from":2329.91,"to":2333.9,"location":2,"content":"know is if in the function computation"},{"from":2333.9,"to":2336.42,"location":2,"content":"there's a branch outward the result of"},{"from":2336.42,"to":2339.12,"location":2,"content":"something is used in multiple places and"},{"from":2339.12,"to":2341.37,"location":2,"content":"so this was like the case here I mean"},{"from":2341.37,"to":2343.65,"location":2,"content":"here this was an initial variable but"},{"from":2343.65,"to":2345.09,"location":2,"content":"you know it could have been computed by"},{"from":2345.09,"to":2346.95,"location":2,"content":"something further back so if this thing"},{"from":2346.95,"to":2350.37,"location":2,"content":"is used in multiple places and you have"},{"from":2350.37,"to":2353.1,"location":2,"content":"the computation going out in different"},{"from":2353.1,"to":2356.19,"location":2,"content":"ways it's just this simple rule that"},{"from":2356.19,"to":2358.17,"location":2,"content":"when you do back propagation backwards"},{"from":2358.17,"to":2361.11,"location":2,"content":"use some the gradients that you get from"},{"from":2361.11,"to":2364.11,"location":2,"content":"the different outward branches okay so"},{"from":2364.11,"to":2367.77,"location":2,"content":"if a equals x plus y and well that's one"},{"from":2367.77,"to":2369.48,"location":2,"content":"we showed you before that we're doing"},{"from":2369.48,"to":2372.15,"location":2,"content":"this sum operation to work out the total"},{"from":2372.15,"to":2377.1,"location":2,"content":"partial of F by Y okay and if you sort"},{"from":2377.1,"to":2380.16,"location":2,"content":"of think about it just a little bit more"},{"from":2380.16,"to":2383,"location":2,"content":"there are sort of these obvious patterns"},{"from":2383,"to":2386.55,"location":2,"content":"which we saw in this very simple example"},{"from":2386.55,"to":2391.8,"location":2,"content":"so if you've got a plus that really the"},{"from":2391.8,"to":2394.8,"location":2,"content":"upstream gradient is going to be sort of"},{"from":2394.8,"to":2397.44,"location":2,"content":"heading down every one of these grant"},{"from":2397.44,"to":2400.38,"location":2,"content":"branches when you have multiple branches"},{"from":2400.38,"to":2403.32,"location":2,"content":"or things being summed now in this case"},{"from":2403.32,"to":2406.74,"location":2,"content":"it just is copied unchanged but that's"},{"from":2406.74,"to":2408.09,"location":2,"content":"because our 
comp"},{"from":2408.09,"to":2410.73,"location":2,"content":"tation was x plus y you know it could be"},{"from":2410.73,"to":2412.65,"location":2,"content":"more complicated but we're passing it"},{"from":2412.65,"to":2415.41,"location":2,"content":"down down each of those branches so plus"},{"from":2415.41,"to":2420.03,"location":2,"content":"distributes upstream gradient when you"},{"from":2420.03,"to":2422.22,"location":2,"content":"have a max that's kind of like a"},{"from":2422.22,"to":2425.1,"location":2,"content":"ralphing operation because max is going"},{"from":2425.1,"to":2427.71,"location":2,"content":"to be sending the gradient to in the"},{"from":2427.71,"to":2429.87,"location":2,"content":"direction that's the max and other"},{"from":2429.87,"to":2431.52,"location":2,"content":"things are going to get no gradient"},{"from":2431.52,"to":2434.82,"location":2,"content":"being passed down to them and then when"},{"from":2434.82,"to":2438.51,"location":2,"content":"you have a multiplication this has this"},{"from":2438.51,"to":2440.94,"location":2,"content":"kind of fun effect that what you do is"},{"from":2440.94,"to":2442.98,"location":2,"content":"switch the gradient ride and so this"},{"from":2442.98,"to":2445.41,"location":2,"content":"reflects the fact that when you have u"},{"from":2445.41,"to":2449.37,"location":2,"content":"times V regardless of whether u and V"},{"from":2449.37,"to":2452.97,"location":2,"content":"are vectors or just scalars that the"},{"from":2452.97,"to":2455.1,"location":2,"content":"derivative of the result with respect to"},{"from":2455.1,"to":2457.41,"location":2,"content":"u is V and the derivative of a spot"},{"from":2457.41,"to":2460.82,"location":2,"content":"result with respect to V is U and so the"},{"from":2460.82,"to":2465.45,"location":2,"content":"gradient signal is the flip of the two"},{"from":2465.45,"to":2469.95,"location":2,"content":"numbers on the different sides okay so"},{"from":2469.95,"to":2474,"location":2,"content":"this is sort of most of how we have"},{"from":2474,"to":2477.27,"location":2,"content":"these computation graphs and we can work"},{"from":2477.27,"to":2479.43,"location":2,"content":"out back propagation backwards in them"},{"from":2479.43,"to":2482.67,"location":2,"content":"there's sort of one more part of this to"},{"from":2482.67,"to":2486.39,"location":2,"content":"do which is to say gee we want to do"},{"from":2486.39,"to":2489.81,"location":2,"content":"this efficiently so there's a bad way to"},{"from":2489.81,"to":2492,"location":2,"content":"do this which is to say oh well we"},{"from":2492,"to":2494.25,"location":2,"content":"wanted to calculate the partial vez by B"},{"from":2494.25,"to":2497.19,"location":2,"content":"and so we can calculate that part roll"},{"from":2497.19,"to":2499.17,"location":2,"content":"which was essentially what I was doing"},{"from":2499.17,"to":2504,"location":2,"content":"on last time slides we say partial of F"},{"from":2504,"to":2508.74,"location":2,"content":"by B equals the partial of s by H times"},{"from":2508.74,"to":2511.71,"location":2,"content":"the partial of H by Z times the partial"},{"from":2511.71,"to":2514.35,"location":2,"content":"of Z by B and we have all of those"},{"from":2514.35,"to":2516.15,"location":2,"content":"partials we work them all out and"},{"from":2516.15,"to":2518.88,"location":2,"content":"multiply them together and then someone"},{"from":2518.88,"to":2522.75,"location":2,"content":"says what's the partial of s by W and 
we"},{"from":2522.75,"to":2524.31,"location":2,"content":"say huh that's the chain rule again I'll"},{"from":2524.31,"to":2528.32,"location":2,"content":"do it all again it's the partial of F by"},{"from":2528.32,"to":2531.87,"location":2,"content":"H times the partial of H by Z times the"},{"from":2531.87,"to":2539.73,"location":2,"content":"partial of Z by X no mo right lost it"},{"from":2539.73,"to":2541.89,"location":2,"content":"but you do a big long list of them"},{"from":2541.89,"to":2543.6,"location":2,"content":"and you calculate it all again that's"},{"from":2543.6,"to":2546.12,"location":2,"content":"not what we want to do instead we want"},{"from":2546.12,"to":2548.52,"location":2,"content":"to say I look there's this shared stuff"},{"from":2548.52,"to":2550.92,"location":2,"content":"there's this error signal coming from"},{"from":2550.92,"to":2554.25,"location":2,"content":"above and we can work out the error"},{"from":2554.25,"to":2556.35,"location":2,"content":"signal the upstream gradient for this"},{"from":2556.35,"to":2558.93,"location":2,"content":"node we can use it to calculate the"},{"from":2558.93,"to":2561.21,"location":2,"content":"upstream gradient for this node we can"},{"from":2561.21,"to":2563.19,"location":2,"content":"use this to calculate the upstream"},{"from":2563.19,"to":2566.4,"location":2,"content":"gradient for this node and then using"},{"from":2566.4,"to":2568.44,"location":2,"content":"the local gradients of which they're too"},{"from":2568.44,"to":2570.75,"location":2,"content":"calculated this node we can then"},{"from":2570.75,"to":2575.19,"location":2,"content":"calculate this one and that one and then"},{"from":2575.19,"to":2579.09,"location":2,"content":"from here having knowing this upstream"},{"from":2579.09,"to":2581.31,"location":2,"content":"gradient we can use the local gradients"},{"from":2581.31,"to":2583.77,"location":2,"content":"at this node to compute this one and"},{"from":2583.77,"to":2586.92,"location":2,"content":"that one and so we're sort of doing this"},{"from":2586.92,"to":2589.05,"location":2,"content":"efficient computer science like"},{"from":2589.05,"to":2591.84,"location":2,"content":"computation where we don't do any"},{"from":2591.84,"to":2595.53,"location":2,"content":"repeated work that makes sense yeah okay"},{"from":2595.53,"to":2600.75,"location":2,"content":"and so if that is the whole of backprop"},{"from":2600.75,"to":2604.91,"location":2,"content":"so here's sort of a slightly sketchy"},{"from":2604.91,"to":2607.31,"location":2,"content":"graph which is sort of just"},{"from":2607.31,"to":2610.59,"location":2,"content":"recapitulating this thing so if you have"},{"from":2610.59,"to":2615.35,"location":2,"content":"any computation that you want to perform"},{"from":2615.35,"to":2620.94,"location":2,"content":"well the hope is that you can sort your"},{"from":2620.94,"to":2623.7,"location":2,"content":"nodes into what's called a topological"},{"from":2623.7,"to":2627.12,"location":2,"content":"sort which means the things that are"},{"from":2627.12,"to":2629.67,"location":2,"content":"arguments variables that are arguments"},{"from":2629.67,"to":2632.25,"location":2,"content":"are sorted before variables that are"},{"from":2632.25,"to":2634.98,"location":2,"content":"results that depend on that argument you"},{"from":2634.98,"to":2636.45,"location":2,"content":"know providing you have something"},{"from":2636.45,"to":2638.37,"location":2,"content":"there's an acyclic graph you'll be 
able"},{"from":2638.37,"to":2640.98,"location":2,"content":"to do that if you have a psychic graph"},{"from":2640.98,"to":2642.26,"location":2,"content":"you're in trouble"},{"from":2642.26,"to":2644.73,"location":2,"content":"well I mean they're actually techniques"},{"from":2644.73,"to":2646.5,"location":2,"content":"people use to roll out those graphs but"},{"from":2646.5,"to":2647.82,"location":2,"content":"I'm not going to go into that now"},{"from":2647.82,"to":2650.19,"location":2,"content":"so we've sorted the nodes which is kind"},{"from":2650.19,"to":2652.5,"location":2,"content":"of loosely represented here from bottom"},{"from":2652.5,"to":2652.98,"location":2,"content":"to top"},{"from":2652.98,"to":2657.09,"location":2,"content":"in a topological sort area sort ok so"},{"from":2657.09,"to":2659.85,"location":2,"content":"then for the forward prop we sort of go"},{"from":2659.85,"to":2661.74,"location":2,"content":"through the nodes and they're"},{"from":2661.74,"to":2666.36,"location":2,"content":"topological sort order and we if it's a"},{"from":2666.36,"to":2668.34,"location":2,"content":"variable we just said its value to what"},{"from":2668.34,"to":2670.74,"location":2,"content":"it's very about variable value is if"},{"from":2670.74,"to":2673.05,"location":2,"content":"it's computed from other variables their"},{"from":2673.05,"to":2675.34,"location":2,"content":"values must have been set already"},{"from":2675.34,"to":2676.99,"location":2,"content":"because there earlier in the topological"},{"from":2676.99,"to":2680.26,"location":2,"content":"sort and then we compute the value of"},{"from":2680.26,"to":2682.54,"location":2,"content":"those nodes according to their"},{"from":2682.54,"to":2685.57,"location":2,"content":"predecessors and we pass it up and work"},{"from":2685.57,"to":2687.94,"location":2,"content":"out the final output the loss function"},{"from":2687.94,"to":2690.31,"location":2,"content":"of our neural network and that is our"},{"from":2690.31,"to":2693.34,"location":2,"content":"forward pass okay so then after that we"},{"from":2693.34,"to":2695.11,"location":2,"content":"do our backward pass and so for the"},{"from":2695.11,"to":2698.89,"location":2,"content":"backward pass we initialize the output"},{"from":2698.89,"to":2700.93,"location":2,"content":"gradient with one the top thing is"},{"from":2700.93,"to":2703.06,"location":2,"content":"always one the partial of Z with respect"},{"from":2703.06,"to":2706.69,"location":2,"content":"to Z and then we now sort of go through"},{"from":2706.69,"to":2709.48,"location":2,"content":"the nodes in Reverse topological sort"},{"from":2709.48,"to":2712.6,"location":2,"content":"and so therefore each of them will all"},{"from":2712.6,"to":2716.16,"location":2,"content":"read anything that's anything that's"},{"from":2716.16,"to":2718.39,"location":2,"content":"playing with its complex and I think"},{"from":2718.39,"to":2720.19,"location":2,"content":"it's above it everything that we"},{"from":2720.19,"to":2722.25,"location":2,"content":"calculated based on it in terms of"},{"from":2722.25,"to":2725.16,"location":2,"content":"forward pass will already have had"},{"from":2725.16,"to":2730.51,"location":2,"content":"calculated its its gradient as a product"},{"from":2730.51,"to":2732.55,"location":2,"content":"of upstream gradient times local"},{"from":2732.55,"to":2735.75,"location":2,"content":"gradient and then we can use that to"},{"from":2735.75,"to":2739.72,"location":2,"content":"compute the next thing down 
and so"},{"from":2739.72,"to":2743.23,"location":2,"content":"basically that what the overall role is"},{"from":2743.23,"to":2746.71,"location":2,"content":"for any node you work out its set of"},{"from":2746.71,"to":2749.02,"location":2,"content":"successes the things that are above it"},{"from":2749.02,"to":2751.66,"location":2,"content":"that it that depend on it and then you"},{"from":2751.66,"to":2754.24,"location":2,"content":"say okay the partial of Z with respect"},{"from":2754.24,"to":2759.04,"location":2,"content":"to X is simply the sum over the set of"},{"from":2759.04,"to":2763,"location":2,"content":"successes of the local gradient that you"},{"from":2763,"to":2766.27,"location":2,"content":"calculate at the node times the upstream"},{"from":2766.27,"to":2769.21,"location":2,"content":"gradient of that node and in the"},{"from":2769.21,"to":2771.67,"location":2,"content":"examples that I gave before there was"},{"from":2771.67,"to":2774.88,"location":2,"content":"never never multiple upstream gradients"},{"from":2774.88,"to":2777.16,"location":2,"content":"but if you imagine a a general big graph"},{"from":2777.16,"to":2779.11,"location":2,"content":"they could actually be sort of different"},{"from":2779.11,"to":2780.85,"location":2,"content":"upstream gradients that are being used"},{"from":2780.85,"to":2785.68,"location":2,"content":"in for the various successes so we apply"},{"from":2785.68,"to":2788.65,"location":2,"content":"that backwards and then we've worked out"},{"from":2788.65,"to":2792.46,"location":2,"content":"in back propagation the gradient of"},{"from":2792.46,"to":2796.18,"location":2,"content":"every the gradient of the final results"},{"from":2796.18,"to":2798.16,"location":2,"content":"Z with respect to every node in our"},{"from":2798.16,"to":2801.67,"location":2,"content":"graph and the thing to notice about this"},{"from":2801.67,"to":2804.37,"location":2,"content":"is if you're doing it right and"},{"from":2804.37,"to":2806.92,"location":2,"content":"efficiently the big o order of"},{"from":2806.92,"to":2809.37,"location":2,"content":"complexity of doing that proper"},{"from":2809.37,"to":2812.55,"location":2,"content":"is exactly the same as doing forward"},{"from":2812.55,"to":2815.58,"location":2,"content":"propagation eye expression evaluation so"},{"from":2815.58,"to":2818.88,"location":2,"content":"it's not some super expensive complex"},{"from":2818.88,"to":2820.35,"location":2,"content":"procedure that you couldn't imagine"},{"from":2820.35,"to":2824.12,"location":2,"content":"doing and scaling up you're actually in"},{"from":2824.12,"to":2827.97,"location":2,"content":"exactly the same complexity order okay"},{"from":2827.97,"to":2831.39,"location":2,"content":"so as I presented here this procedure"},{"from":2831.39,"to":2834.33,"location":2,"content":"you could just think of something that"},{"from":2834.33,"to":2838.4,"location":2,"content":"you're running on an arbitrary graph and"},{"from":2838.4,"to":2840.66,"location":2,"content":"calculating this forward pass and the"},{"from":2840.66,"to":2843.24,"location":2,"content":"backwards pass I mean almost without"},{"from":2843.24,"to":2845.52,"location":2,"content":"exception that the kind of neural nets"},{"from":2845.52,"to":2848.19,"location":2,"content":"that we actually use have a regular"},{"from":2848.19,"to":2850.47,"location":2,"content":"layer like structure and that's then"},{"from":2850.47,"to":2853.26,"location":2,"content":"precisely why it makes sense to work 
out"},{"from":2853.26,"to":2856.65,"location":2,"content":"these gradients in terms of vectors"},{"from":2856.65,"to":2859.02,"location":2,"content":"matrices and jacobians as the kind we"},{"from":2859.02,"to":2862.11,"location":2,"content":"were before okay"},{"from":2862.11,"to":2864.9,"location":2,"content":"so since we have this sort of really"},{"from":2864.9,"to":2867.99,"location":2,"content":"nice algorithm now this sort of means"},{"from":2867.99,"to":2871.23,"location":2,"content":"that we can do this just computationally"},{"from":2871.23,"to":2873.75,"location":2,"content":"and so we don't have to think or know"},{"from":2873.75,"to":2876.66,"location":2,"content":"how to do math and we can just have our"},{"from":2876.66,"to":2880.56,"location":2,"content":"computers do all of us with this so that"},{"from":2880.56,"to":2884.12,"location":2,"content":"using this graph structure we can just"},{"from":2884.12,"to":2888.11,"location":2,"content":"automatically work out how to apply"},{"from":2888.11,"to":2890.79,"location":2,"content":"backprop and there's sort of two cases"},{"from":2890.79,"to":2894.72,"location":2,"content":"of this right so if what was calculated"},{"from":2894.72,"to":2898.62,"location":2,"content":"at each node is given as a symbolic"},{"from":2898.62,"to":2901.68,"location":2,"content":"expression we could actually have our"},{"from":2901.68,"to":2905.45,"location":2,"content":"computer work out for us what the"},{"from":2905.45,"to":2908.1,"location":2,"content":"derivative of that symbolic expression"},{"from":2908.1,"to":2910.83,"location":2,"content":"is so it could actually calculate the"},{"from":2910.83,"to":2912.78,"location":2,"content":"gradient of that node and that's"},{"from":2912.78,"to":2915.33,"location":2,"content":"referred to as often as automatic"},{"from":2915.33,"to":2917.84,"location":2,"content":"differentiation so this is kind of like"},{"from":2917.84,"to":2920.07,"location":2,"content":"Mathematica Wolfram Alpha if you know"},{"from":2920.07,"to":2921.99,"location":2,"content":"how you do your math homework on it you"},{"from":2921.99,"to":2923.55,"location":2,"content":"just type in your expression say what's"},{"from":2923.55,"to":2925.35,"location":2,"content":"the derivative and it gives it back to"},{"from":2925.35,"to":2928.23,"location":2,"content":"you right it's working doing symbolic"},{"from":2928.23,"to":2930.24,"location":2,"content":"computation and working out the"},{"from":2930.24,"to":2933.54,"location":2,"content":"derivative for you so that so that"},{"from":2933.54,"to":2935.22,"location":2,"content":"method could be used to work out the"},{"from":2935.22,"to":2938.04,"location":2,"content":"local gradients and then we can use the"},{"from":2938.04,"to":2941.55,"location":2,"content":"graph structure and our rule upstream"},{"from":2941.55,"to":2942.57,"location":2,"content":"gradient times"},{"from":2942.57,"to":2944.61,"location":2,"content":"local gradient gives downstream gradient"},{"from":2944.61,"to":2947.76,"location":2,"content":"ie the chain rule to then propagate it"},{"from":2947.76,"to":2949.41,"location":2,"content":"through the graph and do the whole"},{"from":2949.41,"to":2952.79,"location":2,"content":"backward pass completely automatically"},{"from":2952.79,"to":2957.72,"location":2,"content":"and so that sounds great"},{"from":2957.72,"to":2961.23,"location":2,"content":"slight disappointment current deep"},{"from":2961.23,"to":2963.12,"location":2,"content":"learning frameworks don't quite 
give you"},{"from":2963.12,"to":2965.01,"location":2,"content":"that there was actually a famous"},{"from":2965.01,"to":2966.75,"location":2,"content":"framework that attempted to give you"},{"from":2966.75,"to":2969.18,"location":2,"content":"that so the theano framework that was"},{"from":2969.18,"to":2971.27,"location":2,"content":"developed at the University of Montreal"},{"from":2971.27,"to":2973.68,"location":2,"content":"those they've now abandoned in the"},{"from":2973.68,"to":2976.08,"location":2,"content":"modern era of large technology"},{"from":2976.08,"to":2977.79,"location":2,"content":"corporation deep learning frameworks"},{"from":2977.79,"to":2980.7,"location":2,"content":"Tiano did precisely that it did the full"},{"from":2980.7,"to":2984.03,"location":2,"content":"thing of automatic differentiation for"},{"from":2984.03,"to":2986.4,"location":2,"content":"reasons that we could either think of"},{"from":2986.4,"to":2988.41,"location":2,"content":"good or bad current deep learning"},{"from":2988.41,"to":2990.81,"location":2,"content":"frameworks like tensor flow or PI torch"},{"from":2990.81,"to":2993.03,"location":2,"content":"actually do a little bit less than that"},{"from":2993.03,"to":2996.24,"location":2,"content":"so what they do is say well for an"},{"from":2996.24,"to":2998.49,"location":2,"content":"individual for the computations at an"},{"from":2998.49,"to":3001.4,"location":2,"content":"individual node you have to do the"},{"from":3001.4,"to":3003.8,"location":2,"content":"calculus for yourself for this"},{"from":3003.8,"to":3006.17,"location":2,"content":"individual node you have to write the"},{"from":3006.17,"to":3009.26,"location":2,"content":"forward propagation say you know return"},{"from":3009.26,"to":3012.59,"location":2,"content":"x plus y and you have to write the"},{"from":3012.59,"to":3014.63,"location":2,"content":"backward propagation saying the local"},{"from":3014.63,"to":3019.48,"location":2,"content":"gradients 1 and 1/2 the 2 inputs x and y"},{"from":3019.48,"to":3023.27,"location":2,"content":"but providing you or someone else has"},{"from":3023.27,"to":3025.48,"location":2,"content":"written out the forward and backward"},{"from":3025.48,"to":3028.91,"location":2,"content":"local step at this node then tensorflow"},{"from":3028.91,"to":3031.43,"location":2,"content":"or pi torch does all the rest of it for"},{"from":3031.43,"to":3033.14,"location":2,"content":"you and runs the back propagation"},{"from":3033.14,"to":3036.92,"location":2,"content":"algorithm and then you know effectively"},{"from":3036.92,"to":3039.92,"location":2,"content":"that sort of saves you having to have a"},{"from":3039.92,"to":3042.97,"location":2,"content":"big symbolic computation engine because"},{"from":3042.97,"to":3046.93,"location":2,"content":"somewhat the person coding the node"},{"from":3046.93,"to":3049.94,"location":2,"content":"computations is writing a bit of code as"},{"from":3049.94,"to":3051.65,"location":2,"content":"you might normally imagine doing it"},{"from":3051.65,"to":3054.26,"location":2,"content":"whether in you know C or Pascal of"},{"from":3054.26,"to":3058.31,"location":2,"content":"saying return ik x plus y and you know"},{"from":3058.31,"to":3061.4,"location":2,"content":"local gradient return 1 right and and"},{"from":3061.4,"to":3062.96,"location":2,"content":"you don't actually have to have a whole"},{"from":3062.96,"to":3066.92,"location":2,"content":"symbolic computation engine okay so 
that"},{"from":3066.92,"to":3068.57,"location":2,"content":"means the overall picture looks like"},{"from":3068.57,"to":3072.47,"location":2,"content":"this right so schematically we have a"},{"from":3072.47,"to":3076.13,"location":2,"content":"computation graph and to calculate the"},{"from":3076.13,"to":3077.13,"location":2,"content":"for"},{"from":3077.13,"to":3082.57,"location":2,"content":"computation we sort of put inputs into"},{"from":3082.57,"to":3084.43,"location":2,"content":"our computation graphed where there's"},{"from":3084.43,"to":3087.37,"location":2,"content":"sort of X and y variables and then we"},{"from":3087.37,"to":3090.88,"location":2,"content":"run through the nodes and topologically"},{"from":3090.88,"to":3094.54,"location":2,"content":"sorted order and for each node we"},{"from":3094.54,"to":3097.69,"location":2,"content":"calculate its forward and necessarily"},{"from":3097.69,"to":3099.46,"location":2,"content":"the things that depends on then have"},{"from":3099.46,"to":3101.41,"location":2,"content":"already been computed and we just do"},{"from":3101.41,"to":3104.05,"location":2,"content":"expression evaluation forward and then"},{"from":3104.05,"to":3107.8,"location":2,"content":"we return the final gate in the graph"},{"from":3107.8,"to":3110.17,"location":2,"content":"which is our loss function or objective"},{"from":3110.17,"to":3112.84,"location":2,"content":"function but then also we have the"},{"from":3112.84,"to":3115.48,"location":2,"content":"backward pass and for the backward pass"},{"from":3115.48,"to":3117.51,"location":2,"content":"we go in the nodes in Reverse"},{"from":3117.51,"to":3120.88,"location":2,"content":"topological only sorted order and for"},{"from":3120.88,"to":3123.61,"location":2,"content":"each of those nodes we've returned their"},{"from":3123.61,"to":3126.67,"location":2,"content":"backward value and for the top node we"},{"from":3126.67,"to":3129.01,"location":2,"content":"return backward value of one and that"},{"from":3129.01,"to":3131.95,"location":2,"content":"will then give us our gradients and so"},{"from":3131.95,"to":3137.26,"location":2,"content":"that means for any node any piece of"},{"from":3137.26,"to":3139.75,"location":2,"content":"computation that we perform we need to"},{"from":3139.75,"to":3143.5,"location":2,"content":"write a little bit of code that says"},{"from":3143.5,"to":3145.3,"location":2,"content":"what it's doing on the forward pass and"},{"from":3145.3,"to":3147.82,"location":2,"content":"what it's doing on the backward pass so"},{"from":3147.82,"to":3151.27,"location":2,"content":"on the forward pass this is our"},{"from":3151.27,"to":3153.19,"location":2,"content":"multiplication so we're just saying"},{"from":3153.19,"to":3156.61,"location":2,"content":"return x times y so that's pretty easy"},{"from":3156.61,"to":3158.74,"location":2,"content":"that's what you're used to doing but"},{"from":3158.74,"to":3161.71,"location":2,"content":"well we also need to do the backward"},{"from":3161.71,"to":3164.62,"location":2,"content":"pass as local gradients of return what"},{"from":3164.62,"to":3168.34,"location":2,"content":"is the partial of L with respect to Z"},{"from":3168.34,"to":3171.28,"location":2,"content":"and with respect to X and well to do"},{"from":3171.28,"to":3173.29,"location":2,"content":"that we have to do a little bit more"},{"from":3173.29,"to":3175.81,"location":2,"content":"work so we have to do a little bit more"},{"from":3175.81,"to":3178.75,"location":2,"content":"work 
first of all on the forward pass so"},{"from":3178.75,"to":3181.45,"location":2,"content":"in the forward pass we have to remember"},{"from":3181.45,"to":3184.63,"location":2,"content":"to sort of stuff away in some variables"},{"from":3184.63,"to":3186.85,"location":2,"content":"what values we computed in the forth"},{"from":3186.85,"to":3189.4,"location":2,"content":"with what values were given it to us in"},{"from":3189.4,"to":3191.14,"location":2,"content":"the forward pass or else we won't be"},{"from":3191.14,"to":3193.63,"location":2,"content":"able to calculate the backward pass so"},{"from":3193.63,"to":3198.07,"location":2,"content":"we store away the values of x and y and"},{"from":3198.07,"to":3200.26,"location":2,"content":"so then when we're doing the backward"},{"from":3200.26,"to":3203.89,"location":2,"content":"pass we are passed into us the upstream"},{"from":3203.89,"to":3206.32,"location":2,"content":"gradient the error signal and now we"},{"from":3206.32,"to":3210.19,"location":2,"content":"just do calculate up"},{"from":3210.19,"to":3212.47,"location":2,"content":"extreme gradient times local gradient"},{"from":3212.47,"to":3214.78,"location":2,"content":"upstream gradient times local gradient"},{"from":3214.78,"to":3219.55,"location":2,"content":"and we return backwards those downstream"},{"from":3219.55,"to":3223.3,"location":2,"content":"gradients and so providing we do that"},{"from":3223.3,"to":3226.63,"location":2,"content":"for all the nodes of our graph we then"},{"from":3226.63,"to":3229.51,"location":2,"content":"have something that the system can run"},{"from":3229.51,"to":3232.69,"location":2,"content":"for us as a deep learning system and so"},{"from":3232.69,"to":3236.05,"location":2,"content":"what that means in practice is that you"},{"from":3236.05,"to":3237.64,"location":2,"content":"know any of these deep learning"},{"from":3237.64,"to":3239.74,"location":2,"content":"frameworks come with a whole box of"},{"from":3239.74,"to":3242.74,"location":2,"content":"tools it says here is a fully connected"},{"from":3242.74,"to":3245.47,"location":2,"content":"forward layer here is a sigmoid unit"},{"from":3245.47,"to":3247.63,"location":2,"content":"here is other more complicated things"},{"from":3247.63,"to":3249.58,"location":2,"content":"we'll do later like convolutions and"},{"from":3249.58,"to":3251.74,"location":2,"content":"recurrent layers and to the extent that"},{"from":3251.74,"to":3254.32,"location":2,"content":"you're using one of those somebody else"},{"from":3254.32,"to":3256.3,"location":2,"content":"has done this work for you right that"},{"from":3256.3,"to":3261.31,"location":2,"content":"they've defined nodes or a layer of"},{"from":3261.31,"to":3263.74,"location":2,"content":"nodes that have forward and backward"},{"from":3263.74,"to":3266.32,"location":2,"content":"already written foot for them and to the"},{"from":3266.32,"to":3269.8,"location":2,"content":"extent that that's true that means that"},{"from":3269.8,"to":3271.99,"location":2,"content":"making neural nets is heaps of farmers"},{"from":3271.99,"to":3273.52,"location":2,"content":"just like Lego right you just stick"},{"from":3273.52,"to":3275.98,"location":2,"content":"these layers together and say got it on"},{"from":3275.98,"to":3277.48,"location":2,"content":"some data and train it you know it's so"},{"from":3277.48,"to":3279.52,"location":2,"content":"easy that my high school student is"},{"from":3279.52,"to":3281.59,"location":2,"content":"building these things right 
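As a concrete illustration of the forward/backward pair just described for a multiply node, here is a hedged sketch; the class shape and names are made up for illustration and are not any particular framework's API.

```python
class MultiplyGate:
    """Sketch of the per-node code for a multiply node z = x * y."""
    def forward(self, x, y):
        # Stash away the inputs: the backward pass needs them, because
        # the local gradients are d(xy)/dx = y and d(xy)/dy = x.
        self.x, self.y = x, y
        return x * y

    def backward(self, dz):
        # dz is the upstream gradient dL/dz, the error signal.
        dx = dz * self.y   # upstream * local: dL/dx = dL/dz * y
        dy = dz * self.x   # upstream * local: dL/dy = dL/dz * x
        return dx, dy

gate = MultiplyGate()
z = gate.forward(3.0, -4.0)    # forward value: -12.0
dx, dy = gate.backward(2.0)    # with dL/dz = 2: dx = -8.0, dy = 6.0
```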
{"from":3281.59,"to":3284.26,"location":2,"content":"You don't have to understand much, really. But, you"},{"from":3284.26,"to":3285.73,"location":2,"content":"know, to the extent that you actually"},{"from":3285.73,"to":3287.41,"location":2,"content":"want to do some original research and"},{"from":3287.41,"to":3289.12,"location":2,"content":"think, I've got this really cool idea of"},{"from":3289.12,"to":3291.04,"location":2,"content":"how to do things differently, I'm going"},{"from":3291.04,"to":3292.72,"location":2,"content":"to define my own kind of different"},{"from":3292.72,"to":3295.09,"location":2,"content":"computation, well, then you have to do"},{"from":3295.09,"to":3298.18,"location":2,"content":"this and define your own class, and as well"},{"from":3298.18,"to":3299.71,"location":2,"content":"as sort of saying how to compute the"},{"from":3299.71,"to":3302.08,"location":2,"content":"forward value, you have to pull out your"},{"from":3302.08,"to":3304.39,"location":2,"content":"copy of Wolfram Alpha and work out what"},{"from":3304.39,"to":3306.64,"location":2,"content":"the derivatives are and put that into"},{"from":3306.64,"to":3310.39,"location":2,"content":"the backward pass. Yeah, okay,"},{"from":3310.39,"to":3312.76,"location":2,"content":"so here's just one more little note on"},{"from":3312.76,"to":3317.23,"location":2,"content":"that. You know, in the early days of deep"},{"from":3317.23,"to":3321.34,"location":2,"content":"learning, say prior to 2014, what we"},{"from":3321.34,"to":3323.29,"location":2,"content":"always used to say to everybody very"},{"from":3323.29,"to":3325.87,"location":2,"content":"sternly is: you should check all your"},{"from":3325.87,"to":3328.18,"location":2,"content":"gradients by doing numeric gradient"},{"from":3328.18,"to":3330.88,"location":2,"content":"checks, it's really, really important. And"},{"from":3330.88,"to":3334.42,"location":2,"content":"so what that meant was, well, you know, if"},{"from":3334.42,"to":3336.22,"location":2,"content":"you want to know whether you've coded"},{"from":3336.22,"to":3339.31,"location":2,"content":"your backward pass right, an easy way to"},{"from":3339.31,"to":3342.94,"location":2,"content":"check whether you've coded it right is"},{"from":3342.94,"to":3344.05,"location":2,"content":"to do"},{"from":3344.05,"to":3347.53,"location":2,"content":"this numeric gradient, where you're sort"},{"from":3347.53,"to":3350.32,"location":2,"content":"of estimating the slope by wiggling"},{"from":3350.32,"to":3352.87,"location":2,"content":"the input a bit and"},{"from":3352.87,"to":3355.51,"location":2,"content":"seeing what effect it has. So I'm working"},{"from":3355.51,"to":3357.97,"location":2,"content":"out the value of the function, f of"},{"from":3357.97,"to":3360.88,"location":2,"content":"x plus h, for h very small, like 10 to the"},{"from":3360.88,"to":3364.33,"location":2,"content":"minus 4, and f of x minus h, and then"},{"from":3364.33,"to":3366.37,"location":2,"content":"dividing by 2h, and I'm saying, well, what"},{"from":3366.37,"to":3368.2,"location":2,"content":"is the slope at this point, and I'm"},{"from":3368.2,"to":3370.24,"location":2,"content":"getting a numerical estimate of the"},{"from":3370.24,"to":3373.99,"location":2,"content":"gradient with respect to my variable x"},{"from":3373.99,"to":3377.5,"location":2,"content":"here. Now, this is what you will have"},{"from":3377.5,"to":3380.05,"location":2,"content":"seen in high school when you did 
the"},{"from":3380.05,"to":3382.84,"location":2,"content":"sort of first estimates of gradients,"},{"from":3382.84,"to":3384.88,"location":2,"content":"where you worked out f of x plus"},{"from":3384.88,"to":3387.49,"location":2,"content":"h minus f of x, divided by h, and you're doing rise"},{"from":3387.49,"to":3389.98,"location":2,"content":"over run and got a point estimate of the"},{"from":3389.98,"to":3393.22,"location":2,"content":"gradient; exactly the same thing, except"},{"from":3393.22,"to":3396.25,"location":2,"content":"for the fact that in this case, rather than"},{"from":3396.25,"to":3398.77,"location":2,"content":"doing it one-sided like that, we're doing"},{"from":3398.77,"to":3401.17,"location":2,"content":"it two-sided. It turns out that if you"},{"from":3401.17,"to":3403.42,"location":2,"content":"actually want to do this, two-sided is"},{"from":3403.42,"to":3407.38,"location":2,"content":"asymptotically hugely better, and so"},{"from":3407.38,"to":3409.3,"location":2,"content":"you're always better off doing two-sided"},{"from":3409.3,"to":3411.67,"location":2,"content":"gradient checks rather than one-sided"},{"from":3411.67,"to":3414.73,"location":2,"content":"gradient checks. So,"},{"from":3414.73,"to":3416.47,"location":2,"content":"since it's hard to implement this wrong,"},{"from":3416.47,"to":3418.57,"location":2,"content":"this is a good way to check that your"},{"from":3418.57,"to":3420.61,"location":2,"content":"gradients are correct if you've defined"},{"from":3420.61,"to":3425.02,"location":2,"content":"them yourself. As a technique to use"},{"from":3425.02,"to":3428.29,"location":2,"content":"for checking everything, it's completely,"},{"from":3428.29,"to":3431.05,"location":2,"content":"completely hopeless, because if we're thinking of doing"},{"from":3431.05,"to":3433.75,"location":2,"content":"this over our deep learning model, for a"},{"from":3433.75,"to":3436.69,"location":2,"content":"fully connected layer what this means is"},{"from":3436.69,"to":3439.03,"location":2,"content":"that if you've got sort of a W"},{"from":3439.03,"to":3442.11,"location":2,"content":"matrix of n by m, and you want to"},{"from":3442.11,"to":3446.08,"location":2,"content":"calculate your partial derivatives to"},{"from":3446.08,"to":3448.3,"location":2,"content":"check if they're correct, it means that"},{"from":3448.3,"to":3450.58,"location":2,"content":"you have to do this for every element of"},{"from":3450.58,"to":3452.92,"location":2,"content":"the matrix. So you have to calculate the"},{"from":3452.92,"to":3456.19,"location":2,"content":"eventual loss, first jiggling w11, then"},{"from":3456.19,"to":3459.76,"location":2,"content":"jiggling w12, then jiggling w13, w14,"},{"from":3459.76,"to":3462.7,"location":2,"content":"etc. So in a complex network"},{"from":3462.7,"to":3464.47,"location":2,"content":"you'll end up literally doing millions"},{"from":3464.47,"to":3466.87,"location":2,"content":"of function evaluations to check the"},{"from":3466.87,"to":3470.59,"location":2,"content":"gradients at one point in time."},
of"},{"from":3484.59,"to":3486.81,"location":2,"content":"parameters in our model which is often"},{"from":3486.81,"to":3489.06,"location":2,"content":"huge for deep learning networks so this"},{"from":3489.06,"to":3490.65,"location":2,"content":"is something that you only want to have"},{"from":3490.65,"to":3493.5,"location":2,"content":"in side if statements that you could"},{"from":3493.5,"to":3495.45,"location":2,"content":"turn off so you could just so run it to"},{"from":3495.45,"to":3497.34,"location":2,"content":"check that your code isn't brick"},{"from":3497.34,"to":3501.99,"location":2,"content":"I am D buggy you know in honesty this is"},{"from":3501.99,"to":3504.03,"location":2,"content":"just much less needed now because you"},{"from":3504.03,"to":3505.65,"location":2,"content":"know by and large you can plug together"},{"from":3505.65,"to":3507.81,"location":2,"content":"your components and layers and PI torch"},{"from":3507.81,"to":3511.53,"location":2,"content":"and other people wrote the code right"},{"from":3511.53,"to":3514.17,"location":2,"content":"and it will work so you probably don't"},{"from":3514.17,"to":3515.82,"location":2,"content":"need to do this all the time but it is"},{"from":3515.82,"to":3517.95,"location":2,"content":"still a useful thing to look at to know"},{"from":3517.95,"to":3523.53,"location":2,"content":"about if things are going wrong yeah"},{"from":3523.53,"to":3525.48,"location":2,"content":"okay so if we've now mastered the core"},{"from":3525.48,"to":3527.31,"location":2,"content":"technology of neural nets we saw know"},{"from":3527.31,"to":3529.62,"location":2,"content":"basically everything we need to know"},{"from":3529.62,"to":3532.1,"location":2,"content":"about neural nets and I sort of just"},{"from":3532.1,"to":3535.56,"location":2,"content":"summarized it there just to sort of"},{"from":3535.56,"to":3542.07,"location":2,"content":"emphasize once more you know I think"},{"from":3542.07,"to":3545.55,"location":2,"content":"some people think why do we even need to"},{"from":3545.55,"to":3547.68,"location":2,"content":"learn all this stuff about gradients and"},{"from":3547.68,"to":3549.6,"location":2,"content":"there's a sense in which so don't really"},{"from":3549.6,"to":3551.01,"location":2,"content":"because these modern deep learning"},{"from":3551.01,"to":3553.32,"location":2,"content":"frameworks will compute all the"},{"from":3553.32,"to":3555.63,"location":2,"content":"gradients for you you know we make you"},{"from":3555.63,"to":3558.24,"location":2,"content":"suffer in homework 2 but in homework 3"},{"from":3558.24,"to":3560.85,"location":2,"content":"you can have your gradients and computed"},{"from":3560.85,"to":3563.4,"location":2,"content":"for you but you know I so you know it's"},{"from":3563.4,"to":3565.14,"location":2,"content":"sort of just like well why should you"},{"from":3565.14,"to":3568.05,"location":2,"content":"take a class on compilers right that"},{"from":3568.05,"to":3570.12,"location":2,"content":"there's actually something useful in"},{"from":3570.12,"to":3572.76,"location":2,"content":"understanding what goes on under the"},{"from":3572.76,"to":3575.25,"location":2,"content":"hood even though most of the time we're"},{"from":3575.25,"to":3577.32,"location":2,"content":"just perfectly happy to let the C"},{"from":3577.32,"to":3578.52,"location":2,"content":"compiler do its thing"},{"from":3578.52,"to":3581.58,"location":2,"content":"without being experts on x86 
assembler"},{"from":3581.58,"to":3584.55,"location":2,"content":"every day of the world week but you know"},{"from":3584.55,"to":3586.16,"location":2,"content":"there is more to it than that"},{"from":3586.16,"to":3588.21,"location":2,"content":"you know because even though back"},{"from":3588.21,"to":3590.31,"location":2,"content":"propagation is great once you're"},{"from":3590.31,"to":3592.53,"location":2,"content":"building complex models back propagation"},{"from":3592.53,"to":3595.56,"location":2,"content":"doesn't always work as you would expect"},{"from":3595.56,"to":3597.87,"location":2,"content":"it to perfectly it may be the wrong word"},{"from":3597.87,"to":3599.67,"location":2,"content":"because you know mathematically it's"},{"from":3599.67,"to":3602.16,"location":2,"content":"perfect but it might not be achieving"},{"from":3602.16,"to":3604.35,"location":2,"content":"what you're wanting it to and well if"},{"from":3604.35,"to":3606.3,"location":2,"content":"you want to sort of in debug and improve"},{"from":3606.3,"to":3608.16,"location":2,"content":"models it's kind of crucial to"},{"from":3608.16,"to":3610.35,"location":2,"content":"understand what's going on there's a"},{"from":3610.35,"to":3611.31,"location":2,"content":"nice medium"},{"from":3611.31,"to":3613.83,"location":2,"content":"by Andre Kapaa fee of yes you should"},{"from":3613.83,"to":3616.8,"location":2,"content":"understand backdrop but on the syllabus"},{"from":3616.8,"to":3621.29,"location":2,"content":"page that talks about this and indeed"},{"from":3621.29,"to":3622.95,"location":2,"content":"week after next"},{"from":3622.95,"to":3624.75,"location":2,"content":"Abby is actually going to lecture about"},{"from":3624.75,"to":3626.67,"location":2,"content":"recurrent neural networks and you know"},{"from":3626.67,"to":3629.25,"location":2,"content":"one of the places where you can easily"},{"from":3629.25,"to":3632.61,"location":2,"content":"fail and doing that propagation turns up"},{"from":3632.61,"to":3637.71,"location":2,"content":"there it's a good example ok does anyone"},{"from":3637.71,"to":3639.9,"location":2,"content":"have any questions about back"},{"from":3639.9,"to":3646.89,"location":2,"content":"propagation and computation graphs okay"},{"from":3646.89,"to":3651.69,"location":2,"content":"if not the remainder of the time is the"},{"from":3651.69,"to":3653.97,"location":2,"content":"grab bag of things that you really"},{"from":3653.97,"to":3656.16,"location":2,"content":"should know about if you're going to be"},{"from":3656.16,"to":3658.53,"location":2,"content":"doing deep learning and so yeah this is"},{"from":3658.53,"to":3661.64,"location":2,"content":"just itsy-bitsy and but let me say them"},{"from":3661.64,"to":3666.06,"location":2,"content":"so up until now when we've had loss"},{"from":3666.06,"to":3669,"location":2,"content":"functions and we've been maximizing the"},{"from":3669,"to":3671.28,"location":2,"content":"likelihood of our data and stuff like"},{"from":3671.28,"to":3673.86,"location":2,"content":"that we've sort of just had this part"},{"from":3673.86,"to":3676.8,"location":2,"content":"here which is the likelihood of our data"},{"from":3676.8,"to":3682.25,"location":2,"content":"and we've worked to maximize it however"},{"from":3682.25,"to":3687.36,"location":2,"content":"in practice that works badly usually and"},{"from":3687.36,"to":3689.52,"location":2,"content":"we need to do something else which is"},{"from":3689.52,"to":3692.61,"location":2,"content":"regularize 
our models and if you've done"},{"from":3692.61,"to":3695.01,"location":2,"content":"the machine learning class or something"},{"from":3695.01,"to":3696.27,"location":2,"content":"like that you will have seen"},{"from":3696.27,"to":3698.91,"location":2,"content":"regularization and there are various"},{"from":3698.91,"to":3701.93,"location":2,"content":"techniques to do regularization but"},{"from":3701.93,"to":3704.34,"location":2,"content":"compared to anything else regularization"},{"from":3704.34,"to":3707.79,"location":2,"content":"is even more important for deep learning"},{"from":3707.79,"to":3711.99,"location":2,"content":"models right so the general idea is if"},{"from":3711.99,"to":3713.7,"location":2,"content":"you have a lot of parameters in your"},{"from":3713.7,"to":3716.96,"location":2,"content":"model those parameters can just"},{"from":3716.96,"to":3719.58,"location":2,"content":"essentially memorize what's in the data"},{"from":3719.58,"to":3721.8,"location":2,"content":"that you trained it and so they're very"},{"from":3721.8,"to":3724.62,"location":2,"content":"good at predicting the answers the model"},{"from":3724.62,"to":3726.12,"location":2,"content":"becomes very good at predicting their"},{"from":3726.12,"to":3728.76,"location":2,"content":"answers to the data you trained it on"},{"from":3728.76,"to":3732.21,"location":2,"content":"but the model may become poor at working"},{"from":3732.21,"to":3734.49,"location":2,"content":"in the real world and different examples"},{"from":3734.49,"to":3738.14,"location":2,"content":"and somehow we want to stop that and"},{"from":3738.14,"to":3741,"location":2,"content":"this problem is especially bad for deep"},{"from":3741,"to":3743.22,"location":2,"content":"learning models because typically deep"},{"from":3743.22,"to":3744.66,"location":2,"content":"learning models have"},{"from":3744.66,"to":3746.91,"location":2,"content":"vast numbers of parameters so in the"},{"from":3746.91,"to":3748.92,"location":2,"content":"good old days when statisticians ruled"},{"from":3748.92,"to":3752.16,"location":2,"content":"the show they told people that it was"},{"from":3752.16,"to":3754.95,"location":2,"content":"completely ridiculous to have a number"},{"from":3754.95,"to":3757.08,"location":2,"content":"of parameters that approached your"},{"from":3757.08,"to":3758.91,"location":2,"content":"number of training examples you know you"},{"from":3758.91,"to":3760.53,"location":2,"content":"should never have more parameters in"},{"from":3760.53,"to":3762.57,"location":2,"content":"your model than one-tenth of the number"},{"from":3762.57,"to":3764.91,"location":2,"content":"of your training examples from what's"},{"from":3764.91,"to":3767.13,"location":2,"content":"the kind of rules of thumb you are told"},{"from":3767.13,"to":3769.83,"location":2,"content":"so that you had lots of examples with"},{"from":3769.83,"to":3772.53,"location":2,"content":"which Westar made every parameter that's"},{"from":3772.53,"to":3775.08,"location":2,"content":"just not true deep learning models it's"},{"from":3775.08,"to":3777.12,"location":2,"content":"just really common that we train deep"},{"from":3777.12,"to":3779.28,"location":2,"content":"learning models that have ten times as"},{"from":3779.28,"to":3781.41,"location":2,"content":"many parameters as there we have"},{"from":3781.41,"to":3784.65,"location":2,"content":"training examples but miraculously it"},{"from":3784.65,"to":3787.14,"location":2,"content":"works in fact it works brilliantly 
those"},{"from":3787.14,"to":3790.23,"location":2,"content":"highly over parameterised models and"},{"from":3790.23,"to":3792.69,"location":2,"content":"it's one of the big secret sources of my"},{"from":3792.69,"to":3795.15,"location":2,"content":"deep learning has been so brilliant but"},{"from":3795.15,"to":3797.58,"location":2,"content":"it only works if we regularize the model"},{"from":3797.58,"to":3800.52,"location":2,"content":"so if you train a model without"},{"from":3800.52,"to":3803.82,"location":2,"content":"sufficient regularization what you find"},{"from":3803.82,"to":3806.46,"location":2,"content":"is that your training it and working out"},{"from":3806.46,"to":3809.43,"location":2,"content":"your loss on the training data and the"},{"from":3809.43,"to":3811.02,"location":2,"content":"model keeps on getting better and better"},{"from":3811.02,"to":3812.72,"location":2,"content":"and better and better"},{"from":3812.72,"to":3816.6,"location":2,"content":"necessarily our algorithm has to improve"},{"from":3816.6,"to":3818.67,"location":2,"content":"loss on the training data so the worst"},{"from":3818.67,"to":3820.29,"location":2,"content":"thing that could happen is that the"},{"from":3820.29,"to":3822.93,"location":2,"content":"graph could become absolutely fat flat"},{"from":3822.93,"to":3825.9,"location":2,"content":"what you will find is with most models"},{"from":3825.9,"to":3828.27,"location":2,"content":"that we train they have so many"},{"from":3828.27,"to":3830.55,"location":2,"content":"parameters that this will just keep on"},{"from":3830.55,"to":3833.49,"location":2,"content":"going down until the loss is sort of"},{"from":3833.49,"to":3835.35,"location":2,"content":"approaching the numerical precision of"},{"from":3835.35,"to":3837.48,"location":2,"content":"zero if you leave a training for long"},{"from":3837.48,"to":3839.97,"location":2,"content":"enough it just learns the correct answer"},{"from":3839.97,"to":3842.07,"location":2,"content":"that every example because because"},{"from":3842.07,"to":3844.77,"location":2,"content":"effectively can memorize examples okay"},{"from":3844.77,"to":3847.65,"location":2,"content":"but if you then say let me test out this"},{"from":3847.65,"to":3850.05,"location":2,"content":"model on some different data what you"},{"from":3850.05,"to":3853.17,"location":2,"content":"find is this red curve that up until a"},{"from":3853.17,"to":3857.37,"location":2,"content":"certain point that you're also building"},{"from":3857.37,"to":3858.99,"location":2,"content":"a model that's better at predicting on"},{"from":3858.99,"to":3861.54,"location":2,"content":"different data but after some point this"},{"from":3861.54,"to":3863.94,"location":2,"content":"curve starts to curve up again and"},{"from":3863.94,"to":3865.23,"location":2,"content":"ignore that bit where it seems to curve"},{"from":3865.23,"to":3866.73,"location":2,"content":"down again that was a mistake in the"},{"from":3866.73,"to":3869.34,"location":2,"content":"drawing and so this has then referred to"},{"from":3869.34,"to":3873.23,"location":2,"content":"as overfitting that them from here on"},{"from":3873.23,"to":3876.03,"location":2,"content":"the training model was just learning to"},{"from":3876.03,"to":3878.52,"location":2,"content":"memorize whatever was in"},{"from":3878.52,"to":3880.35,"location":2,"content":"the training data but not in a way that"},{"from":3880.35,"to":3884.97,"location":2,"content":"lets it generalize to other examples 
and"},{"from":3884.97,"to":3887.25,"location":2,"content":"so this is not what we want we want to"},{"from":3887.25,"to":3889.68,"location":2,"content":"try and avoid overfitting as much as"},{"from":3889.68,"to":3891.36,"location":2,"content":"possible and there are various"},{"from":3891.36,"to":3893.37,"location":2,"content":"regularization techniques that we use"},{"from":3893.37,"to":3896.01,"location":2,"content":"for that and a simple starting one is"},{"from":3896.01,"to":3898.74,"location":2,"content":"this one here where we penalize the"},{"from":3898.74,"to":3902.13,"location":2,"content":"log-likelihood by saying you're going to"},{"from":3902.13,"to":3905.01,"location":2,"content":"be penalized to the extent that you move"},{"from":3905.01,"to":3908.13,"location":2,"content":"parameters away from zero so the default"},{"from":3908.13,"to":3910.38,"location":2,"content":"state of nature is all parameters are"},{"from":3910.38,"to":3913.14,"location":2,"content":"zero so they're ignored in computations"},{"from":3913.14,"to":3915.36,"location":2,"content":"you can have parameters that have big"},{"from":3915.36,"to":3918.27,"location":2,"content":"values but you'll be penalized a bit for"},{"from":3918.27,"to":3920.04,"location":2,"content":"and this is referred to as l2"},{"from":3920.04,"to":3922.59,"location":2,"content":"regularization and you know that's sort"},{"from":3922.59,"to":3924.18,"location":2,"content":"of a starting point of something"},{"from":3924.18,"to":3925.17,"location":2,"content":"sensible you could do with"},{"from":3925.17,"to":3926.22,"location":2,"content":"regularization"},{"from":3926.22,"to":3929.1,"location":2,"content":"but there's more to say later and we'll"},{"from":3929.1,"to":3931.53,"location":2,"content":"talk in the sort of lecture before we"},{"from":3931.53,"to":3933.81,"location":2,"content":"discuss final projects of other clever"},{"from":3933.81,"to":3936.24,"location":2,"content":"and regularization techniques and neural"},{"from":3936.24,"to":3940.52,"location":2,"content":"networks ok grab bag number two"},{"from":3940.52,"to":3944.07,"location":2,"content":"vectorization is the term that you hear"},{"from":3944.07,"to":3946.95,"location":2,"content":"but it's not only vectors this is also"},{"from":3946.95,"to":3950.43,"location":2,"content":"matrix ization and higher dimensional"},{"from":3950.43,"to":3953.19,"location":2,"content":"matrices what are called tensors in this"},{"from":3953.19,"to":3956.25,"location":2,"content":"field tensor ization getting deep"},{"from":3956.25,"to":3958.65,"location":2,"content":"learning systems to run fast and"},{"from":3958.65,"to":3962.84,"location":2,"content":"efficiently is only possible if we"},{"from":3962.84,"to":3966.63,"location":2,"content":"vectorize things and what does that mean"},{"from":3966.63,"to":3969.96,"location":2,"content":"what that means is you know the"},{"from":3969.96,"to":3971.73,"location":2,"content":"straightforward way to write a lot of"},{"from":3971.73,"to":3975.21,"location":2,"content":"code that you saw in your first CS class"},{"from":3975.21,"to":3979.95,"location":2,"content":"is you say for I in range in calculate"},{"from":3979.95,"to":3985.11,"location":2,"content":"random Randi 1 but when we want to be"},{"from":3985.11,"to":3991.94,"location":2,"content":"clever people that are doing things fast"},{"from":3991.94,"to":3995.64,"location":2,"content":"we say rather than work out this w 
dot"},{"from":3995.64,"to":3999.33,"location":2,"content":"one-word vector at a time and do it in a"},{"from":3999.33,"to":4002.45,"location":2,"content":"for loop we could instead put all of our"},{"from":4002.45,"to":4005.6,"location":2,"content":"word vectors into one matrix and then do"},{"from":4005.6,"to":4009.86,"location":2,"content":"simply one matrix matrix multiply of W"},{"from":4009.86,"to":4012.53,"location":2,"content":"by our word vector matrix"},{"from":4012.53,"to":4016.31,"location":2,"content":"and even if you run your code on your"},{"from":4016.31,"to":4020,"location":2,"content":"laptop on the CPU you will find out that"},{"from":4020,"to":4020.81,"location":2,"content":"if you do it"},{"from":4020.81,"to":4023.3,"location":2,"content":"the vectorized way things will become"},{"from":4023.3,"to":4025.97,"location":2,"content":"hugely faster so in this example it"},{"from":4025.97,"to":4027.73,"location":2,"content":"became over an order of magnitude faster"},{"from":4027.73,"to":4030.53,"location":2,"content":"when doing it with a vector vectorized"},{"from":4030.53,"to":4034.4,"location":2,"content":"rather than with a for loop and those"},{"from":4034.4,"to":4037.31,"location":2,"content":"gains are only compounded when we run"},{"from":4037.31,"to":4040.28,"location":2,"content":"code on a GPU that you will get no gains"},{"from":4040.28,"to":4043.01,"location":2,"content":"and speed at all on a GPU unless your"},{"from":4043.01,"to":4044.69,"location":2,"content":"code is vectorized but if it is"},{"from":4044.69,"to":4046.67,"location":2,"content":"vectorized then you can hope to have"},{"from":4046.67,"to":4048.53,"location":2,"content":"results of how oh yeah this runs 40"},{"from":4048.53,"to":4053.35,"location":2,"content":"times faster than it did on the CPU okay"},{"from":4053.35,"to":4056.45,"location":2,"content":"yeah so always try to use vectors and"},{"from":4056.45,"to":4060.26,"location":2,"content":"matrices not for loops of course it's"},{"from":4060.26,"to":4061.94,"location":2,"content":"useful when developing stuff to time"},{"from":4061.94,"to":4065.63,"location":2,"content":"your code and find out what's slow okay"},{"from":4065.63,"to":4070.76,"location":2,"content":"point three okay so we discussed this"},{"from":4070.76,"to":4075.22,"location":2,"content":"idea last time and the time before that"},{"from":4075.22,"to":4078.89,"location":2,"content":"after after having the sort of a fine"},{"from":4078.89,"to":4082.16,"location":2,"content":"layer where we took you know go from X"},{"from":4082.16,"to":4084.62,"location":2,"content":"to W X plus B that's referred to as an"},{"from":4084.62,"to":4086.53,"location":2,"content":"affine layer so we're doing this"},{"from":4086.53,"to":4089.45,"location":2,"content":"multiplying a vector by matrix matrix"},{"from":4089.45,"to":4093.92,"location":2,"content":"and adding biases we necessarily to have"},{"from":4093.92,"to":4096.85,"location":2,"content":"power in a deep network have to have"},{"from":4096.85,"to":4099.53,"location":2,"content":"some form of non-linearity"},{"from":4099.53,"to":4101.48,"location":2,"content":"and so I just wanted to go through a bit"},{"from":4101.48,"to":4104.3,"location":2,"content":"of background on non-linearity isn't"},{"from":4104.3,"to":4107.78,"location":2,"content":"what people use and what to use so if"},{"from":4107.78,"to":4109.91,"location":2,"content":"you're sort of starting from the idea of"},{"from":4109.91,"to":4112.63,"location":2,"content":"what 
{"from":4065.63,"to":4070.76,"location":2,"content":"Point three. Okay, so we discussed this"},{"from":4070.76,"to":4075.22,"location":2,"content":"idea last time and the time before that:"},{"from":4075.22,"to":4078.89,"location":2,"content":"after having the sort of affine"},{"from":4078.89,"to":4082.16,"location":2,"content":"layer where we go from x"},{"from":4082.16,"to":4084.62,"location":2,"content":"to W x plus b, that's referred to as an"},{"from":4084.62,"to":4086.53,"location":2,"content":"affine layer, so we're doing this"},{"from":4086.53,"to":4089.45,"location":2,"content":"multiplying a vector by a matrix"},{"from":4089.45,"to":4093.92,"location":2,"content":"and adding biases, we necessarily, to have"},{"from":4093.92,"to":4096.85,"location":2,"content":"power in a deep network, have to have"},{"from":4096.85,"to":4099.53,"location":2,"content":"some form of non-linearity,"},{"from":4099.53,"to":4101.48,"location":2,"content":"and so I just wanted to go through a bit"},{"from":4101.48,"to":4104.3,"location":2,"content":"of background on non-linearities and"},{"from":4104.3,"to":4107.78,"location":2,"content":"what people use and what to use. So if"},{"from":4107.78,"to":4109.91,"location":2,"content":"you're sort of starting from the idea of"},{"from":4109.91,"to":4112.63,"location":2,"content":"what 
we know as logistic regression,"},{"from":4112.63,"to":4115.04,"location":2,"content":"what's commonly referred to as the"},{"from":4115.04,"to":4117.71,"location":2,"content":"sigmoid curve, or maybe more precisely"},{"from":4117.71,"to":4121.67,"location":2,"content":"the logistic function, is this"},{"from":4121.67,"to":4124.15,"location":2,"content":"picture here. So it's something that"},{"from":4124.15,"to":4127.04,"location":2,"content":"squashes any real number, positive or"},{"from":4127.04,"to":4129.71,"location":2,"content":"negative, into the range zero to one;"},{"from":4129.71,"to":4133.34,"location":2,"content":"it gives you a probability output. The"},{"from":4133.34,"to":4137.3,"location":2,"content":"use of this logistic function was"},{"from":4137.3,"to":4139.91,"location":2,"content":"really, really common in early neural"},{"from":4139.91,"to":4142.13,"location":2,"content":"nets; if you go back to 80s, 90s neural"},{"from":4142.13,"to":4142.55,"location":2,"content":"nets,"},{"from":4142.55,"to":4146.03,"location":2,"content":"there were sigmoid functions absolutely"},{"from":4146.03,"to":4151.37,"location":2,"content":"everywhere. In more recent times, 90% of"},{"from":4151.37,"to":4153.83,"location":2,"content":"the time nobody uses these, and they've"},{"from":4153.83,"to":4155.21,"location":2,"content":"been found to sort of actually work"},{"from":4155.21,"to":4157.79,"location":2,"content":"quite poorly. The only place these are"},{"from":4157.79,"to":4162.11,"location":2,"content":"used is when you actually want a value"},{"from":4162.11,"to":4164.96,"location":2,"content":"between 0 and 1 as your output. So we'll"},{"from":4164.96,"to":4167.57,"location":2,"content":"talk later about how you have gating in"},{"from":4167.57,"to":4169.76,"location":2,"content":"networks, and gating is a place where"},{"from":4169.76,"to":4171.59,"location":2,"content":"you want to have a probability between"},{"from":4171.59,"to":4173.96,"location":2,"content":"two things, and then you will use one of"},{"from":4173.96,"to":4176.15,"location":2,"content":"these, but you use them absolutely"},{"from":4176.15,"to":4180.92,"location":2,"content":"nowhere else. Here is the tanh curve. So"},{"from":4180.92,"to":4184.13,"location":2,"content":"the formula for tanh looks like a scary"},{"from":4184.13,"to":4186.02,"location":2,"content":"thing with lots of exponentials in it,"},{"from":4186.02,"to":4189.02,"location":2,"content":"and it doesn't really look much like a"},{"from":4189.02,"to":4193.7,"location":2,"content":"logistic curve whatsoever, but if you dig"},{"from":4193.7,"to":4195.59,"location":2,"content":"up your math textbook you can convince"},{"from":4195.59,"to":4198.56,"location":2,"content":"yourself that a tanh curve is actually"},{"from":4198.56,"to":4200.78,"location":2,"content":"exactly the same as the logistic curve,"},{"from":4200.78,"to":4204.44,"location":2,"content":"apart from you multiply it by two, so it"},{"from":4204.44,"to":4206.6,"location":2,"content":"has a range of two rather than one, and"},{"from":4206.6,"to":4208.91,"location":2,"content":"you shift it down one. So this is sort of"},{"from":4208.91,"to":4211.13,"location":2,"content":"just a rescaled logistic, but it's now"},{"from":4211.13,"to":4214.16,"location":2,"content":"symmetric between -1 and 1, and the fact"},{"from":4214.16,"to":4216.14,"location":2,"content":"that it's symmetric in the output actually"},{"from":4216.14,"to":4218.66,"location":2,"content":"helps a lot for putting it into 
neural"},{"from":4218.66,"to":4223.04,"location":2,"content":"networks. So tanh's are still"},{"from":4223.04,"to":4225.44,"location":2,"content":"reasonably widely used in quite a"},{"from":4225.44,"to":4229.46,"location":2,"content":"number of places in neural networks, so"},{"from":4229.46,"to":4231.47,"location":2,"content":"tanh should be a friend of yours and you"},{"from":4231.47,"to":4234.14,"location":2,"content":"should know about it. But, you know, one"},{"from":4234.14,"to":4237.58,"location":2,"content":"of the bad things about using"},{"from":4237.58,"to":4239.75,"location":2,"content":"transcendental functions like the"},{"from":4239.75,"to":4242.6,"location":2,"content":"sigmoid or tanh is, you know, they involve"},{"from":4242.6,"to":4247.37,"location":2,"content":"these expensive math operations that slow"},{"from":4247.37,"to":4250.61,"location":2,"content":"you down; like, it's sort of a nuisance to"},{"from":4250.61,"to":4252.56,"location":2,"content":"be computing exponentials and"},{"from":4252.56,"to":4254.42,"location":2,"content":"tanh's, and computing those things is kind"},{"from":4254.42,"to":4257.78,"location":2,"content":"of slow. So people started playing around"},{"from":4257.78,"to":4260.42,"location":2,"content":"with ways to make things faster, and"},{"from":4260.42,"to":4262.85,"location":2,"content":"someone came up with this idea, like,"},{"from":4262.85,"to":4265.51,"location":2,"content":"maybe we could come up with a hard tanh,"},{"from":4265.51,"to":4268.22,"location":2,"content":"where it's just sort of flat out here,"},{"from":4268.22,"to":4270.86,"location":2,"content":"and then it has a linear slope, and then"},{"from":4270.86,"to":4272.69,"location":2,"content":"it's flat at the top; you know, it sort of"},{"from":4272.69,"to":4275.27,"location":2,"content":"looks like a tanh but we've just squared"},{"from":4275.27,"to":4278.27,"location":2,"content":"it off. And, well, this is really cheap to"},{"from":4278.27,"to":4279.81,"location":2,"content":"compute, right? You say:"},{"from":4279.81,"to":4285.78,"location":2,"content":"if it's less than -1, return -1; if it's greater than 1, return +1; or just"},{"from":4285.78,"to":4287.75,"location":2,"content":"return the number. No complex"},{"from":4287.75,"to":4290.82,"location":2,"content":"transcendentals. The funny thing is, it"},{"from":4290.82,"to":4292.17,"location":2,"content":"turns out that this actually works"},{"from":4292.17,"to":4295.32,"location":2,"content":"pretty well. You might be scared, and you"},{"from":4295.32,"to":4297.81,"location":2,"content":"might justifiably be scared, because if"},{"from":4297.81,"to":4300.84,"location":2,"content":"you start thinking about gradients: once"},{"from":4300.84,"to":4302.46,"location":2,"content":"you're over here, there's no gradient,"},{"from":4302.46,"to":4307.22,"location":2,"content":"right? It's completely flat, slope 0, so"},{"from":4307.22,"to":4309.99,"location":2,"content":"things go dead as soon as they're out at"},{"from":4309.99,"to":4311.94,"location":2,"content":"one of the ends, so it's sort of"},{"from":4311.94,"to":4313.59,"location":2,"content":"important to stay in this middle section,"},{"from":4313.59,"to":4315.99,"location":2,"content":"at least for a while. And then it's just"},{"from":4315.99,"to":4318.84,"location":2,"content":"got a slope of 1, right? It's a constant"},{"from":4318.84,"to":4321.36,"location":2,"content":"slope of 1, but this is enough of a"},{"from":4321.36,"to":4324.84,"location":2,"content":"non-linearity that actually it works well 
in"},{"from":4324.84,"to":4327.72,"location":2,"content":"neural networks and you can train neural"},{"from":4327.72,"to":4330.72,"location":2,"content":"networks. So that sent the whole field in"},{"from":4330.72,"to":4333.18,"location":2,"content":"the opposite direction, and people"},{"from":4333.18,"to":4337.05,"location":2,"content":"thought, oh, if that works, maybe we can"},{"from":4337.05,"to":4339.81,"location":2,"content":"make things even simpler, and that led to"},{"from":4339.81,"to":4342.3,"location":2,"content":"the now-famous, what's referred to"},{"from":4342.3,"to":4345.09,"location":2,"content":"everywhere as a ReLU; sorry,"},{"from":4345.09,"to":4347.04,"location":2,"content":"there's a mistake in my editing there,"},{"from":4347.04,"to":4349.65,"location":2,"content":"delete off 'hard tanh', that was in the"},{"from":4349.65,"to":4352.62,"location":2,"content":"slides by mistake. The ReLU unit, everyone"},{"from":4352.62,"to":4354.36,"location":2,"content":"calls it ReLU, which stands for"},{"from":4354.36,"to":4358.5,"location":2,"content":"rectified linear unit. So the ReLU"},{"from":4358.5,"to":4360.06,"location":2,"content":"is essentially the simplest"},{"from":4360.06,"to":4363.77,"location":2,"content":"non-linearity you can have: the ReLU"},{"from":4363.77,"to":4368.07,"location":2,"content":"is zero, slope zero, as soon as you're in"},{"from":4368.07,"to":4370.5,"location":2,"content":"the negative regime, and it's just a line,"},{"from":4370.5,"to":4372.72,"location":2,"content":"slope one, when you're in the positive"},{"from":4372.72,"to":4376.71,"location":2,"content":"regime. I mean, when I first saw this,"},{"from":4376.71,"to":4378.6,"location":2,"content":"it sort of blew my mind that it could"},{"from":4378.6,"to":4380.97,"location":2,"content":"possibly work, because, I"},{"from":4380.97,"to":4383.55,"location":2,"content":"guess, I was brought up on these sort of"},{"from":4383.55,"to":4386.37,"location":2,"content":"tanh's and sigmoids, and there are sort of"},{"from":4386.37,"to":4388.35,"location":2,"content":"these arguments about the slope, and you"},{"from":4388.35,"to":4391.62,"location":2,"content":"get these gradients and you can move"},{"from":4391.62,"to":4394.53,"location":2,"content":"around with the gradient; and how is it"},{"from":4394.53,"to":4396.48,"location":2,"content":"meant to work if half of this function"},{"from":4396.48,"to":4399,"location":2,"content":"just has output zero and no gradient, and"},{"from":4399,"to":4400.83,"location":2,"content":"the other half is just this straight"},{"from":4400.83,"to":4403.83,"location":2,"content":"line? And in particular, when you're in"},{"from":4403.83,"to":4406.41,"location":2,"content":"the positive regime, this is just an"},{"from":4406.41,"to":4409.8,"location":2,"content":"identity function, and, you know, I sort of"},{"from":4409.8,"to":4412.23,"location":2,"content":"argued before that if you just"},{"from":4412.23,"to":4413.13,"location":2,"content":"compose"},{"from":4413.13,"to":4415.89,"location":2,"content":"linear transforms, you don't get any"},{"from":4415.89,"to":4418.29,"location":2,"content":"power. But providing we're in the"},{"from":4418.29,"to":4421.05,"location":2,"content":"right-hand part of the regime, since this"},{"from":4421.05,"to":4422.82,"location":2,"content":"is an identity function, that's exactly"},{"from":4422.82,"to":4424.32,"location":2,"content":"what we're doing: we're just composing"},{"from":4424.32,"to":4427.2,"location":2,"content":"linear 
transforms. So you sort of believe"},{"from":4427.2,"to":4429.42,"location":2,"content":"it just couldn't possibly work, but it"},{"from":4429.42,"to":4431.06,"location":2,"content":"turns out that this works brilliantly,"},{"from":4431.06,"to":4435.39,"location":2,"content":"and this is now by far the default"},{"from":4435.39,"to":4438.3,"location":2,"content":"choice when people are building feed-forward"},{"from":4438.3,"to":4441.11,"location":2,"content":"deep networks: people use ReLU"},{"from":4441.11,"to":4444.54,"location":2,"content":"nonlinearities, and they are very fast,"},{"from":4444.54,"to":4448.05,"location":2,"content":"they train very quickly, and they perform"},{"from":4448.05,"to":4450.72,"location":2,"content":"very well. And so effectively, you know,"},{"from":4450.72,"to":4453.45,"location":2,"content":"it is simply just,"},{"from":4453.45,"to":4456.33,"location":2,"content":"depending on the inputs, each unit is"},{"from":4456.33,"to":4458.82,"location":2,"content":"just either dead or it's passing things"},{"from":4458.82,"to":4460.89,"location":2,"content":"on as an identity function. But there's"},{"from":4460.89,"to":4463.86,"location":2,"content":"enough of a non-linearity that"},{"from":4463.86,"to":4465.33,"location":2,"content":"you can do arbitrary function"},{"from":4465.33,"to":4467.49,"location":2,"content":"approximation still with a deep learning"},{"from":4467.49,"to":4469.83,"location":2,"content":"network. And people now make precisely"},{"from":4469.83,"to":4473,"location":2,"content":"the opposite argument, which is: because"},{"from":4473,"to":4479.13,"location":2,"content":"this unit just has a slope of 1 over"},{"from":4479.13,"to":4481.98,"location":2,"content":"its nonzero range, that means the"},{"from":4481.98,"to":4484.68,"location":2,"content":"gradient is passed back very efficiently"},{"from":4484.68,"to":4487.56,"location":2,"content":"to the inputs, and therefore the"},{"from":4487.56,"to":4490.74,"location":2,"content":"models train very efficiently. Whereas"},{"from":4490.74,"to":4493.11,"location":2,"content":"with these kinds of curves,"},{"from":4493.11,"to":4495.15,"location":2,"content":"when you're over here there's very"},{"from":4495.15,"to":4497.43,"location":2,"content":"little slope, so your models might train"},{"from":4497.43,"to":4502.02,"location":2,"content":"very slowly. OK, so, you know, for a"},{"from":4502.02,"to":4503.91,"location":2,"content":"feed-forward network, try this before you"},{"from":4503.91,"to":4506.4,"location":2,"content":"try anything else. But there's sort of"},{"from":4506.4,"to":4508.68,"location":2,"content":"been a sub-literature that says,"},{"from":4508.68,"to":4511.44,"location":2,"content":"well, maybe that's too simple and we"},{"from":4511.44,"to":4513.57,"location":2,"content":"could do a bit better. And so that led to"},{"from":4513.57,"to":4516.09,"location":2,"content":"the leaky ReLU, which said maybe we"},{"from":4516.09,"to":4518.1,"location":2,"content":"should put a tiny bit of slope over here"},{"from":4518.1,"to":4520.29,"location":2,"content":"so it's not completely dead; you can"},{"from":4520.29,"to":4524.01,"location":2,"content":"make it something like 1/100 as the"},{"from":4524.01,"to":4526.35,"location":2,"content":"slope in this part. And then people said,"},{"from":4526.35,"to":4529.11,"location":2,"content":"well, let's build off that, maybe we could"},{"from":4529.11,"to":4531.48,"location":2,"content":"actually put another parameter into 
our"},{"from":4531.48,"to":4533.28,"location":2,"content":"neural network and we could have a"},{"from":4533.28,"to":4536.37,"location":2,"content":"parametric ReLU, so there's some slope"},{"from":4536.37,"to":4539.25,"location":2,"content":"over here, but we're also going to back-"},{"from":4539.25,"to":4543.39,"location":2,"content":"propagate into our non-linearity, which"},{"from":4543.39,"to":4545.67,"location":2,"content":"has this extra alpha parameter for"},{"from":4545.67,"to":4546.8,"location":2,"content":"how much slope."},{"from":4546.8,"to":4550.22,"location":2,"content":"And so variously people have used these;"},{"from":4550.22,"to":4553.34,"location":2,"content":"you can find ten papers on"},{"from":4553.34,"to":4555.5,"location":2,"content":"arXiv where people say you can get"},{"from":4555.5,"to":4557.03,"location":2,"content":"better results from using one or other"},{"from":4557.03,"to":4559.73,"location":2,"content":"of these; you can also find papers where"},{"from":4559.73,"to":4561.53,"location":2,"content":"people say it made no difference for"},{"from":4561.53,"to":4564.23,"location":2,"content":"them versus just using a ReLU. So I"},{"from":4564.23,"to":4566.33,"location":2,"content":"think basically you can start off with a"},{"from":4566.33,"to":4570.37,"location":2,"content":"ReLU and work from there. Yeah."},
you"},{"from":4629.78,"to":4633.2,"location":2,"content":"want et cetera but in general the"},{"from":4633.2,"to":4637.04,"location":2,"content":"weights you want to initialize the small"},{"from":4637.04,"to":4641.6,"location":2,"content":"random values you'll find in height or"},{"from":4641.6,"to":4644.87,"location":2,"content":"other deep learning practice packages a"},{"from":4644.87,"to":4647.45,"location":2,"content":"common initialization that's used and"},{"from":4647.45,"to":4649.24,"location":2,"content":"often recommended as this Xavier"},{"from":4649.24,"to":4652.67,"location":2,"content":"initialization and so the trick of this"},{"from":4652.67,"to":4656.36,"location":2,"content":"is that for a lot of models in a lot of"},{"from":4656.36,"to":4658.94,"location":2,"content":"places think of some of these things"},{"from":4658.94,"to":4661.85,"location":2,"content":"like these ones and these you'd like the"},{"from":4661.85,"to":4664.31,"location":2,"content":"values in the network to sort of stay"},{"from":4664.31,"to":4668.6,"location":2,"content":"small in this sort of middle range here"},{"from":4668.6,"to":4671.12,"location":2,"content":"and well if you kind of have a matrix"},{"from":4671.12,"to":4675.31,"location":2,"content":"with big values in it and you multiply"},{"from":4675.31,"to":4678.05,"location":2,"content":"vector by this matrix you know things"},{"from":4678.05,"to":4679.7,"location":2,"content":"might get bigger and then if you put in"},{"from":4679.7,"to":4680.69,"location":2,"content":"through another layer"},{"from":4680.69,"to":4682.76,"location":2,"content":"get bigger again and then sort of"},{"from":4682.76,"to":4685.19,"location":2,"content":"everything I'll be too big and you'll"},{"from":4685.19,"to":4687.22,"location":2,"content":"have problems so really Xavier"},{"from":4687.22,"to":4689.75,"location":2,"content":"initializations seeking to avoid that by"},{"from":4689.75,"to":4693.59,"location":2,"content":"saying how many inputs are there to this"},{"from":4693.59,"to":4696.8,"location":2,"content":"node how many outputs are there we want"},{"from":4696.8,"to":4698.45,"location":2,"content":"to sort of tamp it down the"},{"from":4698.45,"to":4701,"location":2,"content":"initialization based on the inputs and"},{"from":4701,"to":4703.37,"location":2,"content":"the outputs because effectively we'll be"},{"from":4703.37,"to":4707.57,"location":2,"content":"using this number that many times it's a"},{"from":4707.57,"to":4710.65,"location":2,"content":"good thing to use you can use that"},{"from":4710.65,"to":4712.66,"location":2,"content":"optimizers"},{"from":4712.66,"to":4716.54,"location":2,"content":"up till now we saw just talked about"},{"from":4716.54,"to":4721.52,"location":2,"content":"plain SGD you know normally plain SGD"},{"from":4721.52,"to":4724.88,"location":2,"content":"actually works just fine but often if"},{"from":4724.88,"to":4727.34,"location":2,"content":"you want to use just plain SGD you have"},{"from":4727.34,"to":4729.68,"location":2,"content":"to spend time tuning the learning rate"},{"from":4729.68,"to":4732.73,"location":2,"content":"that alpha that we multiplied the"},{"from":4732.73,"to":4736.22,"location":2,"content":"gradient by for complex nets and"},{"from":4736.22,"to":4738.77,"location":2,"content":"situations or to avoid worry there's"},{"from":4738.77,"to":4740.78,"location":2,"content":"sort of now this big family and more"},{"from":4740.78,"to":4744.79,"location":2,"content":"sophisticated adaptive 
optimizers and so"},{"from":4744.79,"to":4746.93,"location":2,"content":"effectively they're scaling the"},{"from":4746.93,"to":4749,"location":2,"content":"parameter adjustment by accumulated"},{"from":4749,"to":4751.1,"location":2,"content":"gradients which have the effect that"},{"from":4751.1,"to":4754.13,"location":2,"content":"they learn per parameter learning rate"},{"from":4754.13,"to":4756.86,"location":2,"content":"so that they conceived which parameters"},{"from":4756.86,"to":4759.23,"location":2,"content":"would be useful to move more and which"},{"from":4759.23,"to":4761.18,"location":2,"content":"ones less depending on the sensitivity"},{"from":4761.18,"to":4763.49,"location":2,"content":"of those parameters so where things are"},{"from":4763.49,"to":4765.41,"location":2,"content":"flat you can be trying to move quickly"},{"from":4765.41,"to":4767.63,"location":2,"content":"where things are bouncing around a lot"},{"from":4767.63,"to":4768.95,"location":2,"content":"you can be trying to move just a little"},{"from":4768.95,"to":4771.11,"location":2,"content":"so as not to overshoot and so there's a"},{"from":4771.11,"to":4773.21,"location":2,"content":"whole family of these a degrade rmsprop"},{"from":4773.21,"to":4775.4,"location":2,"content":"Atom they're actually other ones there's"},{"from":4775.4,"to":4778.16,"location":2,"content":"a de Max and a lot of them I mean Adam"},{"from":4778.16,"to":4780.53,"location":2,"content":"is one fairly reliable one that many"},{"from":4780.53,"to":4783.89,"location":2,"content":"people use and that's not bad and then"},{"from":4783.89,"to":4786.14,"location":2,"content":"one more slide and I'm done yeah so"},{"from":4786.14,"to":4789.74,"location":2,"content":"learning rates so normally you have to"},{"from":4789.74,"to":4792.56,"location":2,"content":"choose a learning rate so one choice is"},{"from":4792.56,"to":4794.93,"location":2,"content":"just have a constant learning rate you"},{"from":4794.93,"to":4796.85,"location":2,"content":"pick a number maybe 10 to the minus 3"},{"from":4796.85,"to":4800.57,"location":2,"content":"and say that's my learning rate you want"},{"from":4800.57,"to":4802.34,"location":2,"content":"your learning rate to be order of"},{"from":4802.34,"to":4805.16,"location":2,"content":"magnitude right if your learning rate is"},{"from":4805.16,"to":4809.72,"location":2,"content":"too big your model might diverge or not"},{"from":4809.72,"to":4812.24,"location":2,"content":"converge because adjusts of leaps you"},{"from":4812.24,"to":4813.92,"location":2,"content":"around by huge cramp"},{"from":4813.92,"to":4816.76,"location":2,"content":"movements and you completely miss the"},{"from":4816.76,"to":4819.86,"location":2,"content":"good parts of your function space if"},{"from":4819.86,"to":4821.69,"location":2,"content":"your model if your learning rate is too"},{"from":4821.69,"to":4825.35,"location":2,"content":"small your model may not train by the"},{"from":4825.35,"to":4827.45,"location":2,"content":"assignment deadline and then you'll be"},{"from":4827.45,"to":4830.39,"location":2,"content":"unhappy so if you saw it you know"},{"from":4830.39,"to":4833.72,"location":2,"content":"commonly people sort of try powers of"},{"from":4833.72,"to":4835.82,"location":2,"content":"ten and sees how it looks right they"},{"from":4835.82,"to":4840.56,"location":2,"content":"might try you know 0.0 1.0 01.00 1 and"},{"from":4840.56,"to":4843.5,"location":2,"content":"see look at how the loss is 
declining"},{"from":4843.5,"to":4845.93,"location":2,"content":"and see what seems to work in general"},{"from":4845.93,"to":4847.76,"location":2,"content":"you want to use the fastest learning"},{"from":4847.76,"to":4849.68,"location":2,"content":"rate that isn't making things become"},{"from":4849.68,"to":4853.46,"location":2,"content":"unstable commonly you get better results"},{"from":4853.46,"to":4857.27,"location":2,"content":"by decreasing the learning rate as you"},{"from":4857.27,"to":4859.73,"location":2,"content":"train so sometimes people just do that"},{"from":4859.73,"to":4862.43,"location":2,"content":"by hand so we used the term epoch for a"},{"from":4862.43,"to":4864.41,"location":2,"content":"full pass through your training data and"},{"from":4864.41,"to":4866.99,"location":2,"content":"people might say half the learning rate"},{"from":4866.99,"to":4869.03,"location":2,"content":"after every three epochs as you train"},{"from":4869.03,"to":4871.55,"location":2,"content":"and that can work pretty well you can"},{"from":4871.55,"to":4874.82,"location":2,"content":"use formulas to get per epoch rate"},{"from":4874.82,"to":4877.79,"location":2,"content":"learning rates there are even fancier"},{"from":4877.79,"to":4880.07,"location":2,"content":"methods you can look up cyclic learning"},{"from":4880.07,"to":4882.65,"location":2,"content":"rates online if you want which sort of"},{"from":4882.65,"to":4884.21,"location":2,"content":"actually makes the learning rate"},{"from":4884.21,"to":4885.98,"location":2,"content":"sometimes bigger and then sometimes"},{"from":4885.98,"to":4887.93,"location":2,"content":"smaller and people have found that that"},{"from":4887.93,"to":4890.09,"location":2,"content":"can be useful for getting you out of bad"},{"from":4890.09,"to":4894.02,"location":2,"content":"regions and interesting ways the one"},{"from":4894.02,"to":4896.21,"location":2,"content":"other thing to know is if you're using"},{"from":4896.21,"to":4899.36,"location":2,"content":"one of the fancier optimizers they still"},{"from":4899.36,"to":4901.58,"location":2,"content":"ask you for a learning rate but that"},{"from":4901.58,"to":4904.7,"location":2,"content":"learning rate is the initial learning"},{"from":4904.7,"to":4907.22,"location":2,"content":"rate which typically the optimizer will"},{"from":4907.22,"to":4910.91,"location":2,"content":"shrink as you train so commonly if"},{"from":4910.91,"to":4913.91,"location":2,"content":"you're using something like atom you"},{"from":4913.91,"to":4916.4,"location":2,"content":"might be starting off by saying the"},{"from":4916.4,"to":4919.13,"location":2,"content":"learning raises zero point one sort of a"},{"from":4919.13,"to":4921.32,"location":2,"content":"bigger number and it'll be shrinking it"},{"from":4921.32,"to":4924.89,"location":2,"content":"later as the training goes along ok all"},{"from":4924.89,"to":4929.41,"location":2,"content":"done see you next week"}]} \ No newline at end of file diff --git a/bcc-en/5.bcc b/bcc-en/5.bcc new file mode 100644 index 0000000000000000000000000000000000000000..8c5e5ad8a4f0c2acb0d28b6d0dc866bb1d31e309 --- /dev/null +++ b/bcc-en/5.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":5.25,"to":9.31,"location":2,"content":"okay let's get started again okay so"},{"from":9.31,"to":14.5,"location":2,"content":"welcome back to week three of CS 224 in"},{"from":14.5,"to":17.47,"location":2,"content":"okay 
so we we've got a bit of a change"},{"from":17.47,"to":22.15,"location":2,"content":"of pace today after week two so this"},{"from":22.15,"to":25.27,"location":2,"content":"week in week 3 we're actually going to"},{"from":25.27,"to":29.08,"location":2,"content":"have some human language and so this"},{"from":29.08,"to":31.9,"location":2,"content":"lecture has no partial derivative signs"},{"from":31.9,"to":35.2,"location":2,"content":"in it and so we'll be moving away from"},{"from":35.2,"to":36.52,"location":2,"content":"sort of working out the so"},{"from":36.52,"to":40.57,"location":2,"content":"technicalities of doing neural networks"},{"from":40.57,"to":43.69,"location":2,"content":"and back propagation and the sort of"},{"from":43.69,"to":46.51,"location":2,"content":"math heavy week to and so then this week"},{"from":46.51,"to":49.27,"location":2,"content":"what we actually want well in today's"},{"from":49.27,"to":51.64,"location":2,"content":"lecture we won't look at well what kind"},{"from":51.64,"to":53.98,"location":2,"content":"of structures do human language"},{"from":53.98,"to":56.86,"location":2,"content":"sentences have and how we can build"},{"from":56.86,"to":60.73,"location":2,"content":"models that build that kind of structure"},{"from":60.73,"to":64.84,"location":2,"content":"for sentences that we see so first of"},{"from":64.84,"to":66.7,"location":2,"content":"all I'm going to sort of explain and"},{"from":66.7,"to":69.94,"location":2,"content":"motivate a bit about structure of human"},{"from":69.94,"to":72.07,"location":2,"content":"language sentences so that's kind of"},{"from":72.07,"to":75.16,"location":2,"content":"like linguistics in 20 minutes or"},{"from":75.16,"to":77.32,"location":2,"content":"something then going to particularly"},{"from":77.32,"to":80.17,"location":2,"content":"focus in on dependency grammars and then"},{"from":80.17,"to":81.82,"location":2,"content":"going to present a method for doing"},{"from":81.82,"to":84.25,"location":2,"content":"dependency structure dependency grammar"},{"from":84.25,"to":85.98,"location":2,"content":"parsing called transition based"},{"from":85.98,"to":88.57,"location":2,"content":"dependency parsing and then talk about"},{"from":88.57,"to":92.17,"location":2,"content":"how you can make neural dependency"},{"from":92.17,"to":97,"location":2,"content":"parsers so going on just near a couple"},{"from":97,"to":100.45,"location":2,"content":"of announcements so assignment 2 was do"},{"from":100.45,"to":103.24,"location":2,"content":"one minute ago so I hope everyone"},{"from":103.24,"to":106.66,"location":2,"content":"succeeded in getting assignment two out"},{"from":106.66,"to":109.3,"location":2,"content":"of the way if you still working on it do"},{"from":109.3,"to":111.46,"location":2,"content":"make sure to make use of the office"},{"from":111.46,"to":113.47,"location":2,"content":"hours and get help for that coming out"},{"from":113.47,"to":116.04,"location":2,"content":"just today is assignment three"},{"from":116.04,"to":120.4,"location":2,"content":"assignment three is basically about this"},{"from":120.4,"to":123.85,"location":2,"content":"lecture so in assignment 3 what you're"},{"from":123.85,"to":126.19,"location":2,"content":"doing is building a neural dependency"},{"from":126.19,"to":128.44,"location":2,"content":"parser and so we hope that you can put"},{"from":128.44,"to":130.3,"location":2,"content":"together what you learned about 
neural"},{"from":130.3,"to":132.64,"location":2,"content":"networks last week and the content of"},{"from":132.64,"to":135.22,"location":2,"content":"today and jump straight right in to"},{"from":135.22,"to":138.41,"location":2,"content":"building a neural dependency parser"},{"from":138.41,"to":139.82,"location":2,"content":"the other thing that happens in"},{"from":139.82,"to":142.91,"location":2,"content":"assignment 3 is that we start using a"},{"from":142.91,"to":145.58,"location":2,"content":"deep learning framework pi torch so for"},{"from":145.58,"to":148.76,"location":2,"content":"doing assignment 3 instruction 0 and"},{"from":148.76,"to":150.71,"location":2,"content":"this is in the PDF that the assignment"},{"from":150.71,"to":153.5,"location":2,"content":"is to install pi torch as a Python"},{"from":153.5,"to":156.92,"location":2,"content":"package and start using that so we've"},{"from":156.92,"to":160.37,"location":2,"content":"attempted to make assignment 3 sort of"},{"from":160.37,"to":163.94,"location":2,"content":"be a highly scaffolded tutorial where"},{"from":163.94,"to":165.65,"location":2,"content":"you can start to learn how to do things"},{"from":165.65,"to":168.83,"location":2,"content":"in pi torch by just writing a few lines"},{"from":168.83,"to":171.77,"location":2,"content":"of code at a time hopefully that works"},{"from":171.77,"to":174.32,"location":2,"content":"out for people if you have any issues"},{"from":174.32,"to":177.77,"location":2,"content":"with that well obviously you can send"},{"from":177.77,"to":180.11,"location":2,"content":"Piazza messages come to office hours I"},{"from":180.11,"to":181.58,"location":2,"content":"mean the one other thing you could think"},{"from":181.58,"to":182.87,"location":2,"content":"of doing is that there's sort of a"},{"from":182.87,"to":185.36,"location":2,"content":"one-hour introduction to PI torch on the"},{"from":185.36,"to":187.16,"location":2,"content":"PI torch site where you down where"},{"from":187.16,"to":189.53,"location":2,"content":"you've directed for installing PI torch"},{"from":189.53,"to":191.42,"location":2,"content":"and you can also look at that if that"},{"from":191.42,"to":196.01,"location":2,"content":"was maybe helpful now the final mentions"},{"from":196.01,"to":199.46,"location":2,"content":"yes so final projects you know we're"},{"from":199.46,"to":201.14,"location":2,"content":"going to sort of focus on those more in"},{"from":201.14,"to":203.51,"location":2,"content":"week 5 but if it's not bad to be"},{"from":203.51,"to":205.19,"location":2,"content":"thinking about things you could do if"},{"from":205.19,"to":207.26,"location":2,"content":"you want do a custom final project you"},{"from":207.26,"to":208.64,"location":2,"content":"know certainly encourage to come and"},{"from":208.64,"to":210.2,"location":2,"content":"talk to me or the TAS"},{"from":210.2,"to":212.48,"location":2,"content":"we have under the sort of office hours"},{"from":212.48,"to":215.3,"location":2,"content":"page on the website a listing of the"},{"from":215.3,"to":218.02,"location":2,"content":"expertise of some of the different TAS"},{"from":218.02,"to":221.42,"location":2,"content":"since I missed my office hours yesterday"},{"from":221.42,"to":223.79,"location":2,"content":"I'm going to have a shortened office"},{"from":223.79,"to":227.27,"location":2,"content":"hour tomorrow from 1/2 to 20 that's at"},{"from":227.27,"to":232.75,"location":2,"content":"the same time as the normal cs2 24 
in"},{"from":232.75,"to":235.31,"location":2,"content":"office hours so you can kind of come for"},{"from":235.31,"to":236.84,"location":2,"content":"any reason you want but it might be"},{"from":236.84,"to":238.22,"location":2,"content":"especially good to come to me if you"},{"from":238.22,"to":240.55,"location":2,"content":"want to talk about final projects"},{"from":240.55,"to":245.09,"location":2,"content":"okay so let's leap in and start talking"},{"from":245.09,"to":248.03,"location":2,"content":"about the structure of sentences and so"},{"from":248.03,"to":251.84,"location":2,"content":"I just sort of want to explain something"},{"from":251.84,"to":255.47,"location":2,"content":"about human language sentence structure"},{"from":255.47,"to":257.6,"location":2,"content":"and how people think about that"},{"from":257.6,"to":260.45,"location":2,"content":"structure and what kind of goals then"},{"from":260.45,"to":262.03,"location":2,"content":"people and natural language processing"},{"from":262.03,"to":265.22,"location":2,"content":"have of sort of building structure to"},{"from":265.22,"to":268.28,"location":2,"content":"understand the meaning of sentences all"},{"from":268.28,"to":270.08,"location":2,"content":"of the examples I'm going to give today"},{"from":270.08,"to":271.07,"location":2,"content":"in"},{"from":271.07,"to":273.86,"location":2,"content":"English because that's a language if"},{"from":273.86,"to":275.06,"location":2,"content":"you're all expected to have some"},{"from":275.06,"to":277.31,"location":2,"content":"competence in but this really isn't"},{"from":277.31,"to":279.2,"location":2,"content":"meant to be sort of facts about English"},{"from":279.2,"to":281.54,"location":2,"content":"this is meant to be sort of ideas of how"},{"from":281.54,"to":283.46,"location":2,"content":"you can think about the structure of"},{"from":283.46,"to":285.38,"location":2,"content":"human language sentences that are"},{"from":285.38,"to":288.65,"location":2,"content":"applied to all sorts of languages okay"},{"from":288.65,"to":292.52,"location":2,"content":"and so in general there are two"},{"from":292.52,"to":295.4,"location":2,"content":"different ways that linguists have"},{"from":295.4,"to":297.53,"location":2,"content":"thought about the structure of sentences"},{"from":297.53,"to":299.69,"location":2,"content":"there there's some relations to them one"},{"from":299.69,"to":302.27,"location":2,"content":"of them is called freeze structure or"},{"from":302.27,"to":304.55,"location":2,"content":"freeze structure grammar x' and if you"},{"from":304.55,"to":307.91,"location":2,"content":"vaguely remember from CS 103 if you did"},{"from":307.91,"to":310.52,"location":2,"content":"that when you spent a about a lecture on"},{"from":310.52,"to":313.61,"location":2,"content":"context-free grammars phrase structure"},{"from":313.61,"to":314.99,"location":2,"content":"grammar czar using the tools of"},{"from":314.99,"to":316.79,"location":2,"content":"context-free grammars to put structures"},{"from":316.79,"to":319.25,"location":2,"content":"over sentences and so I'm first of all"},{"from":319.25,"to":321.8,"location":2,"content":"going to just briefly introduce that so"},{"from":321.8,"to":323.45,"location":2,"content":"you've seen it but actually the main"},{"from":323.45,"to":325.76,"location":2,"content":"tool that we're going to use in this"},{"from":325.76,"to":328.64,"location":2,"content":"class and for assignment 3 is to 
put"},{"from":328.64,"to":331.64,"location":2,"content":"dependency structures over sentences so"},{"from":331.64,"to":334.07,"location":2,"content":"I'll then go about that so the idea of"},{"from":334.07,"to":337.01,"location":2,"content":"phrase structure is to say the sentences"},{"from":337.01,"to":339.35,"location":2,"content":"are built out of units that"},{"from":339.35,"to":341.72,"location":2,"content":"progressively nests so we start off with"},{"from":341.72,"to":345.41,"location":2,"content":"words that cap cuddly etc and then we're"},{"from":345.41,"to":347.03,"location":2,"content":"going to put them into bigger units that"},{"from":347.03,"to":349.52,"location":2,"content":"we call phrases like the cuddly cat by"},{"from":349.52,"to":351.53,"location":2,"content":"the door and then you can keep on"},{"from":351.53,"to":353.51,"location":2,"content":"combining those up into even bigger"},{"from":353.51,"to":357.22,"location":2,"content":"phrases like the cub the cat by the door"},{"from":357.22,"to":361.46,"location":2,"content":"whoops okay that's that so how does this"},{"from":361.46,"to":364.52,"location":2,"content":"work well so the idea of it and this is"},{"from":364.52,"to":367.01,"location":2,"content":"sort of the way a linguist thinks is to"},{"from":367.01,"to":370.07,"location":2,"content":"say well here's this language which you"},{"from":370.07,"to":372.95,"location":2,"content":"know might not be English it might be"},{"from":372.95,"to":374.75,"location":2,"content":"Oaxacan or some other language"},{"from":374.75,"to":377.36,"location":2,"content":"what kind of structure does it have and"},{"from":377.36,"to":380.69,"location":2,"content":"well we could look at lots of sentences"},{"from":380.69,"to":383.3,"location":2,"content":"of the language and so the linguist is"},{"from":383.3,"to":387.29,"location":2,"content":"going to think well I can see patterns"},{"from":387.29,"to":391.13,"location":2,"content":"like duck head our dog the dog a cat etc"},{"from":391.13,"to":393.02,"location":2,"content":"so it sort of seems like there's one"},{"from":393.02,"to":395.93,"location":2,"content":"word class here which linguists often"},{"from":395.93,"to":398.45,"location":2,"content":"refer to as determiners they're also"},{"from":398.45,"to":400.16,"location":2,"content":"referred to as articles sometimes in"},{"from":400.16,"to":402.29,"location":2,"content":"English and there's another word class"},{"from":402.29,"to":403.41,"location":2,"content":"here"},{"from":403.41,"to":407.04,"location":2,"content":"nouns and so what I to capture this"},{"from":407.04,"to":409.77,"location":2,"content":"pattern here it seems like we can make"},{"from":409.77,"to":413.13,"location":2,"content":"this unit that I see all over the place"},{"from":413.13,"to":417.5,"location":2,"content":"in language which is made of a"},{"from":417.5,"to":420.27,"location":2,"content":"determiner followed by a noun and then"},{"from":420.27,"to":423.03,"location":2,"content":"so I'd write a phrase structure grammar"},{"from":423.03,"to":426.9,"location":2,"content":"rule ie a context-free grammar role of I"},{"from":426.9,"to":428.85,"location":2,"content":"can have a noun phrase that goes to a"},{"from":428.85,"to":430.97,"location":2,"content":"determiner and a noun"},{"from":430.97,"to":434.31,"location":2,"content":"okay but you know that's not the only"},{"from":434.31,"to":439.82,"location":2,"content":"thing that I can see so I can also 
see"},{"from":439.82,"to":443.4,"location":2,"content":"other examples in my language of the"},{"from":443.4,"to":446.73,"location":2,"content":"large cat or a barking dog or the cuddly"},{"from":446.73,"to":449.55,"location":2,"content":"cat the cuddly dog hmm so that seems"},{"from":449.55,"to":452.43,"location":2,"content":"that I need to put a bit more stuff into"},{"from":452.43,"to":454.65,"location":2,"content":"my grammar so maybe I can say for my"},{"from":454.65,"to":457.5,"location":2,"content":"grammar that a noun phrase goes to a"},{"from":457.5,"to":459.72,"location":2,"content":"determiner and then optionally you can"},{"from":459.72,"to":461.88,"location":2,"content":"put in an adjective and then you can"},{"from":461.88,"to":464.19,"location":2,"content":"have a noun and then I poke around a"},{"from":464.19,"to":466.68,"location":2,"content":"little bit further and I can find"},{"from":466.68,"to":469.65,"location":2,"content":"examples like the cat in a crate or a"},{"from":469.65,"to":472.89,"location":2,"content":"barking dog by the door and I can see"},{"from":472.89,"to":475.47,"location":2,"content":"lots of sentences like this and so I"},{"from":475.47,"to":478.77,"location":2,"content":"want to put those into my grandma but at"},{"from":478.77,"to":480.69,"location":2,"content":"that point I noticed something special"},{"from":480.69,"to":483.92,"location":2,"content":"because look here some other things and"},{"from":483.92,"to":487.23,"location":2,"content":"these things look a lot like the things"},{"from":487.23,"to":489.18,"location":2,"content":"I started off with so it seems like"},{"from":489.18,"to":492.3,"location":2,"content":"which sort of having a phrase with the"},{"from":492.3,"to":495.69,"location":2,"content":"same expansion potential that's misted"},{"from":495.69,"to":498.18,"location":2,"content":"inside this bigger phrase because these"},{"from":498.18,"to":501.36,"location":2,"content":"ones can also be expanded right I could"},{"from":501.36,"to":503.04,"location":2,"content":"have something like the green door or"},{"from":503.04,"to":505.41,"location":2,"content":"something in here so I just want to"},{"from":505.41,"to":507.93,"location":2,"content":"capture that in some way so maybe I"},{"from":507.93,"to":511.62,"location":2,"content":"could say that a noun phrase goes to a"},{"from":511.62,"to":514.89,"location":2,"content":"determiner optionally an adjective and"},{"from":514.89,"to":517.77,"location":2,"content":"noun and then a something else which"},{"from":517.77,"to":519.84,"location":2,"content":"I'll call a prepositional phrase and"},{"from":519.84,"to":521.88,"location":2,"content":"then I'm going to write a second rule"},{"from":521.88,"to":524.49,"location":2,"content":"saying that a prepositional phrase goes"},{"from":524.49,"to":528.48,"location":2,"content":"to a preposition that's going to be"},{"from":528.48,"to":532.74,"location":2,"content":"these words here followed by a noun"},{"from":532.74,"to":535.8,"location":2,"content":"phrase and so then I'm reusing"},{"from":535.8,"to":539.58,"location":2,"content":"oops I'm reusing my noun phrase that I"},{"from":539.58,"to":542.04,"location":2,"content":"defined up here and so then I could"},{"from":542.04,"to":544.65,"location":2,"content":"immediately generate other stuff I can"},{"from":544.65,"to":550.59,"location":2,"content":"sort of say the cat by the large door or"},{"from":550.59,"to":553.98,"location":2,"content":"indeed I could say the cat by the 
large"},{"from":553.98,"to":558.09,"location":2,"content":"crate the cat by the large crate on the"},{"from":558.09,"to":560.43,"location":2,"content":"table or something like that because"},{"from":560.43,"to":562.08,"location":2,"content":"once I can have a prepositional phrase"},{"from":562.08,"to":564.6,"location":2,"content":"includes a noun phrase and a noun phrase"},{"from":564.6,"to":567,"location":2,"content":"includes a prepositional phrase I've"},{"from":567,"to":569.94,"location":2,"content":"already got something that I can kind of"},{"from":569.94,"to":572.07,"location":2,"content":"recursively go back and forth between"},{"from":572.07,"to":574.71,"location":2,"content":"noun phrases and I can make infinitely"},{"from":574.71,"to":579.96,"location":2,"content":"big sentence that's right yeah yeah so I"},{"from":579.96,"to":584.66,"location":2,"content":"could write something like yeah the cat"},{"from":584.66,"to":600.51,"location":2,"content":"by the large crate on the large table by"},{"from":600.51,"to":604.77,"location":2,"content":"the door right I can keep on going and"},{"from":604.77,"to":607.89,"location":2,"content":"make big sentences and I could say well"},{"from":607.89,"to":611.16,"location":2,"content":"I've got I don't have space to fit it on"},{"from":611.16,"to":613.56,"location":2,"content":"this slide but I've got an analysis of"},{"from":613.56,"to":615.81,"location":2,"content":"this according to my grammar where"},{"from":615.81,"to":618.3,"location":2,"content":"that's a noun phrase goes to determine R"},{"from":618.3,"to":621.21,"location":2,"content":"now prepositional phrase the"},{"from":621.21,"to":622.77,"location":2,"content":"prepositional phrase goes to a"},{"from":622.77,"to":625.74,"location":2,"content":"preposition and a noun phrase and this"},{"from":625.74,"to":627.66,"location":2,"content":"noun phrase goes to determine our"},{"from":627.66,"to":632.66,"location":2,"content":"adjective noun prepositional phrase and"},{"from":632.66,"to":635.37,"location":2,"content":"that goes to a preposition and another"},{"from":635.37,"to":638.1,"location":2,"content":"noun phrase and I keep on going and I"},{"from":638.1,"to":641.82,"location":2,"content":"can produce big sentences okay and you"},{"from":641.82,"to":644.91,"location":2,"content":"know that kind of then continues on"},{"from":644.91,"to":649.47,"location":2,"content":"because you know I can then start seeing"},{"from":649.47,"to":652.17,"location":2,"content":"more bits of grammar so I could say well"},{"from":652.17,"to":655.65,"location":2,"content":"I can now talk to the cat and so if I"},{"from":655.65,"to":658.77,"location":2,"content":"want to capture this talking to a cat"},{"from":658.77,"to":661.56,"location":2,"content":"here well that now means I've got a verb"},{"from":661.56,"to":665.1,"location":2,"content":"because words like talk and walk have"},{"from":665.1,"to":668.13,"location":2,"content":"herbs and then talk to the cat it seems"},{"from":668.13,"to":669.69,"location":2,"content":"like after that could become a"},{"from":669.69,"to":671.91,"location":2,"content":"prepositional phrase and so I could"},{"from":671.91,"to":673.71,"location":2,"content":"write another rule saying that a verb"},{"from":673.71,"to":677.22,"location":2,"content":"phrase goes to a verb followed by a"},{"from":677.22,"to":678.93,"location":2,"content":"prepositional phrase and then I can make"},{"from":678.93,"to":681.57,"location":2,"content":"more bigger sentences like that and 
I"},{"from":681.57,"to":683.67,"location":2,"content":"could look at more sentences of the"},{"from":683.67,"to":686.82,"location":2,"content":"language and start building up this is"},{"from":686.82,"to":688.98,"location":2,"content":"these context-free grammar rules to"},{"from":688.98,"to":691.85,"location":2,"content":"describe the structure of the language"},{"from":691.85,"to":694.26,"location":2,"content":"and that's part of what linguists do and"},{"from":694.26,"to":696.78,"location":2,"content":"different languages have different"},{"from":696.78,"to":702.47,"location":2,"content":"structures so for example like in this"},{"from":702.47,"to":705.18,"location":2,"content":"little grammar I've had and in general"},{"from":705.18,"to":708.96,"location":2,"content":"in English what you do what you find is"},{"from":708.96,"to":711.27,"location":2,"content":"that prepositional phrases follow the"},{"from":711.27,"to":713.43,"location":2,"content":"verb but if you go to a different"},{"from":713.43,"to":715.65,"location":2,"content":"language like Chinese what you find is"},{"from":715.65,"to":717.09,"location":2,"content":"the prepositional phrases come before"},{"from":717.09,"to":719.43,"location":2,"content":"the verb and so we could say okay there"},{"from":719.43,"to":722.58,"location":2,"content":"are different rules for Chinese and I"},{"from":722.58,"to":723.9,"location":2,"content":"could start writing a context-free"},{"from":723.9,"to":727.74,"location":2,"content":"grammar for them okay beauty so that's"},{"from":727.74,"to":730.11,"location":2,"content":"the idea of context-free grammars and"},{"from":730.11,"to":733.83,"location":2,"content":"actually you know this is the dominant"},{"from":733.83,"to":736.26,"location":2,"content":"approach to linguistic structure that"},{"from":736.26,"to":738.75,"location":2,"content":"you'll see if you go into a linguistics"},{"from":738.75,"to":740.4,"location":2,"content":"class in the Linguistics Department"},{"from":740.4,"to":742.38,"location":2,"content":"people make these kind of phrase"},{"from":742.38,"to":745.35,"location":2,"content":"structure grammar trees but just to be"},{"from":745.35,"to":747.33,"location":2,"content":"contrary no it's not actually just to be"},{"from":747.33,"to":749.76,"location":2,"content":"contrary it's because this alternative"},{"from":749.76,"to":751.62,"location":2,"content":"approach has been very dominant in"},{"from":751.62,"to":754.17,"location":2,"content":"computational linguistics what I'm going"},{"from":754.17,"to":758.73,"location":2,"content":"to show you instead is the viewpoint of"},{"from":758.73,"to":762.11,"location":2,"content":"dependency structure so the idea of"},{"from":762.11,"to":765.42,"location":2,"content":"dependency structure is rather than"},{"from":765.42,"to":767.52,"location":2,"content":"having these sort of phrasal categories"},{"from":767.52,"to":769.53,"location":2,"content":"like noun phrases and prepositional"},{"from":769.53,"to":772.2,"location":2,"content":"phrases and things like that we're going"},{"from":772.2,"to":776.7,"location":2,"content":"to directly represent the structure of"},{"from":776.7,"to":781.53,"location":2,"content":"sentences by saying how words arguments"},{"from":781.53,"to":784.17,"location":2,"content":"or modifiers of other words in a"},{"from":784.17,"to":786.3,"location":2,"content":"recursive faction which is sort of"},{"from":786.3,"to":787.56,"location":2,"content":"another way of saying how 
their"},{"from":787.56,"to":790.41,"location":2,"content":"dependence of other words so we have a"},{"from":790.41,"to":792.27,"location":2,"content":"sentence look in the large crate in the"},{"from":792.27,"to":795,"location":2,"content":"kitchen by the door and if we want to we"},{"from":795,"to":796.44,"location":2,"content":"can give these word"},{"from":796.44,"to":798.96,"location":2,"content":"words word classes so we can still say"},{"from":798.96,"to":800.88,"location":2,"content":"this is a verb and this is a preposition"},{"from":800.88,"to":803.37,"location":2,"content":"and this is a determiner and this is an"},{"from":803.37,"to":806.28,"location":2,"content":"adjective and this is a noun but to"},{"from":806.28,"to":808.44,"location":2,"content":"represent the structure what we're going"},{"from":808.44,"to":813.57,"location":2,"content":"to say is well look here is the the root"},{"from":813.57,"to":815.7,"location":2,"content":"of this whole sentence so that's where"},{"from":815.7,"to":819.3,"location":2,"content":"things start and then well where we're"},{"from":819.3,"to":822.9,"location":2,"content":"going to look is in the large crate so"},{"from":822.9,"to":829.79,"location":2,"content":"that is a dependent of look and well if"},{"from":829.79,"to":833.37,"location":2,"content":"within we have for the crate it's got"},{"from":833.37,"to":836.01,"location":2,"content":"some modifiers it's a large crate so"},{"from":836.01,"to":838.59,"location":2,"content":"that's a dependent of crate it's a large"},{"from":838.59,"to":842.28,"location":2,"content":"crate that's a dependent of crate and in"},{"from":842.28,"to":844.47,"location":2,"content":"the system of dependencies I'm going to"},{"from":844.47,"to":848.64,"location":2,"content":"show you we've got in as kind of a"},{"from":848.64,"to":851.19,"location":2,"content":"modifier of crate in the large crate I"},{"from":851.19,"to":854.16,"location":2,"content":"could come back to that well but this"},{"from":854.16,"to":856.62,"location":2,"content":"crate has its own modification because"},{"from":856.62,"to":859.17,"location":2,"content":"it's a crate in the kitchen so we have"},{"from":859.17,"to":863.49,"location":2,"content":"in the kitchen as a modifier of crate"},{"from":863.49,"to":866.4,"location":2,"content":"and it's a VAT kitchen in the kitchen"},{"from":866.4,"to":870.84,"location":2,"content":"these are dependence of crate and well"},{"from":870.84,"to":873.57,"location":2,"content":"then we have this next bit by the door"},{"from":873.57,"to":875.79,"location":2,"content":"and as I'll discuss in a minute well"},{"from":875.79,"to":879.54,"location":2,"content":"what is the by the door modifying it's"},{"from":879.54,"to":881.25,"location":2,"content":"still modifying the crate it's saying"},{"from":881.25,"to":883.89,"location":2,"content":"it's a crate by the door okay so that by"},{"from":883.89,"to":887.4,"location":2,"content":"the door is also a pin and upgrade and"},{"from":887.4,"to":890.63,"location":2,"content":"then we've got this structure of"},{"from":890.63,"to":894.63,"location":2,"content":"dependencies coming off of it okay and"},{"from":894.63,"to":897.54,"location":2,"content":"so that's then the structure you get may"},{"from":897.54,"to":899.55,"location":2,"content":"be drawn a little bit more neatly when I"},{"from":899.55,"to":902.49,"location":2,"content":"do it in advance like this and so we"},{"from":902.49,"to":905.25,"location":2,"content":"call these things a dependency 
structure"},{"from":905.25,"to":910.01,"location":2,"content":"and so crucially what we're doing here"},{"from":910.01,"to":915.03,"location":2,"content":"is that we're sorry I had two different"},{"from":915.03,"to":915.6,"location":2,"content":"examples"},{"from":915.6,"to":919.65,"location":2,"content":"whoops different examples what we're"},{"from":919.65,"to":923.34,"location":2,"content":"doing is saying what what words modify"},{"from":923.34,"to":927.9,"location":2,"content":"other words and so that allows us to"},{"from":927.9,"to":929.39,"location":2,"content":"sort of understand how the"},{"from":929.39,"to":931.37,"location":2,"content":"different parts of the sentence relate"},{"from":931.37,"to":934.7,"location":2,"content":"to each other and so overall you know"},{"from":934.7,"to":937.4,"location":2,"content":"the let me just so say you know you"},{"from":937.4,"to":939.32,"location":2,"content":"might wonder why do we need sentence"},{"from":939.32,"to":942.23,"location":2,"content":"structure you know the way language"},{"from":942.23,"to":944.18,"location":2,"content":"seems to work when you're talking to"},{"from":944.18,"to":946.49,"location":2,"content":"your friends is that you just blabber"},{"from":946.49,"to":948.62,"location":2,"content":"something and they understand what"},{"from":948.62,"to":951.83,"location":2,"content":"you're saying and what goes on beyond"},{"from":951.83,"to":954.65,"location":2,"content":"that is sort of not really accessible to"},{"from":954.65,"to":958.22,"location":2,"content":"consciousness but well to be able to"},{"from":958.22,"to":960.56,"location":2,"content":"have machines that interpret language"},{"from":960.56,"to":963.86,"location":2,"content":"correctly we sort of need to understand"},{"from":963.86,"to":965.96,"location":2,"content":"the structure of these sentences because"},{"from":965.96,"to":968.63,"location":2,"content":"unless we know what words are arguments"},{"from":968.63,"to":970.88,"location":2,"content":"and modifiers of other words we can't"},{"from":970.88,"to":973.25,"location":2,"content":"actually work out what sentences mean"},{"from":973.25,"to":975.32,"location":2,"content":"and I'll show some examples of that as"},{"from":975.32,"to":976.94,"location":2,"content":"to how things go wrong immediately"},{"from":976.94,"to":979.34,"location":2,"content":"because actually a lot of the time there"},{"from":979.34,"to":981.08,"location":2,"content":"are different possible interpretations"},{"from":981.08,"to":983.81,"location":2,"content":"you can have and so in general our goal"},{"from":983.81,"to":985.97,"location":2,"content":"is you know up until now we've sort of"},{"from":985.97,"to":987.89,"location":2,"content":"looked at the meaning of words right we"},{"from":987.89,"to":990.05,"location":2,"content":"did word vectors and we found out words"},{"from":990.05,"to":991.64,"location":2,"content":"there were similar meaning and things"},{"from":991.64,"to":994.25,"location":2,"content":"like that and you can get somewhere and"},{"from":994.25,"to":997.04,"location":2,"content":"human languages with just saying words I"},{"from":997.04,"to":1002.68,"location":2,"content":"mean you can say hi and friendly and"},{"from":1002.68,"to":1004.99,"location":2,"content":"things like that but you can't get very"},{"from":1004.99,"to":1007.3,"location":2,"content":"far with just words right the way human"},{"from":1007.3,"to":1010.51,"location":2,"content":"beings can express complex ideas 
and"},{"from":1010.51,"to":1012.13,"location":2,"content":"explain and teach things to each other"},{"from":1012.13,"to":1015.52,"location":2,"content":"is you can put together words to express"},{"from":1015.52,"to":1018.19,"location":2,"content":"more complex meanings and then you can"},{"from":1018.19,"to":1021.25,"location":2,"content":"do that over and over again recursively"},{"from":1021.25,"to":1023.41,"location":2,"content":"to build up more and more complex"},{"from":1023.41,"to":1025.57,"location":2,"content":"meanings so that by the time you're"},{"from":1025.57,"to":1027.52,"location":2,"content":"reading the morning newspaper you know"},{"from":1027.52,"to":1029.95,"location":2,"content":"most sentences a sort of 20-30 words"},{"from":1029.95,"to":1033.13,"location":2,"content":"long and they're saying some complex"},{"from":1033.13,"to":1035.17,"location":2,"content":"meaning like you know over night Senate"},{"from":1035.17,"to":1036.85,"location":2,"content":"Republicans resolved that they would not"},{"from":1036.85,"to":1038.86,"location":2,"content":"do blah blah blah and you understand"},{"from":1038.86,"to":1040.93,"location":2,"content":"that flawlessly by just sort of putting"},{"from":1040.93,"to":1043.15,"location":2,"content":"together those meanings of words and so"},{"from":1043.15,"to":1044.56,"location":2,"content":"we need to be able to know what is"},{"from":1044.56,"to":1046.81,"location":2,"content":"connected to what in order to be able to"},{"from":1046.81,"to":1050.02,"location":2,"content":"do that and one of the ways of seeing"},{"from":1050.02,"to":1053.8,"location":2,"content":"that's important is seeing what can go"},{"from":1053.8,"to":1057.16,"location":2,"content":"wrong okay so here is a newspaper"},{"from":1057.16,"to":1061.39,"location":2,"content":"article San Jose cop kills man with"},{"from":1061.39,"to":1062.57,"location":2,"content":"knife"},{"from":1062.57,"to":1065.21,"location":2,"content":"now this has two meanings and the two"},{"from":1065.21,"to":1069.17,"location":2,"content":"meanings depend on well what you decide"},{"from":1069.17,"to":1071.57,"location":2,"content":"depends on what from you what modifies"},{"from":1071.57,"to":1073.46,"location":2,"content":"what so what are the two meanings"},{"from":1073.46,"to":1079.79,"location":2,"content":"meaning one the cop stabs a guy right so"},{"from":1079.79,"to":1082.76,"location":2,"content":"many one is the cop stabs a guy so what"},{"from":1082.76,"to":1085.67,"location":2,"content":"we've got here is we've got the cops"},{"from":1085.67,"to":1088.82,"location":2,"content":"that are killing so this is what we'll"},{"from":1088.82,"to":1092.6,"location":2,"content":"say is the subject of kill is the cops"},{"from":1092.6,"to":1094.58,"location":2,"content":"and I'll just call them the San Jose"},{"from":1094.58,"to":1097.25,"location":2,"content":"cops here and well there's what they"},{"from":1097.25,"to":1102.17,"location":2,"content":"kill which show the the man is an object"},{"from":1102.17,"to":1107.09,"location":2,"content":"of killing and then well one person is"},{"from":1107.09,"to":1109.58,"location":2,"content":"the the cops using knife to kill the"},{"from":1109.58,"to":1114.67,"location":2,"content":"person and so that's then that this is"},{"from":1114.67,"to":1117.71,"location":2,"content":"modifier and here every complex we call"},{"from":1117.71,"to":1120.68,"location":2,"content":"it an instrumental modifier to say 
that"},{"from":1120.68,"to":1122.59,"location":2,"content":"the cops are killing people with a knife"},{"from":1122.59,"to":1126.2,"location":2,"content":"that's one possible analysis okay then"},{"from":1126.2,"to":1128.18,"location":2,"content":"there's a second meaningless sentence"},{"from":1128.18,"to":1130.34,"location":2,"content":"can have the second meaning the sentence"},{"from":1130.34,"to":1133.52,"location":2,"content":"can have a no okay the second meaning"},{"from":1133.52,"to":1135.98,"location":2,"content":"the sentence can have is it's the man"},{"from":1135.98,"to":1139.94,"location":2,"content":"has a knife so in that case what we want"},{"from":1139.94,"to":1142.07,"location":2,"content":"to say is well you know it's this word"},{"from":1142.07,"to":1149.12,"location":2,"content":"man and this man has noun modifier which"},{"from":1149.12,"to":1151.37,"location":2,"content":"is sort of saying something that the man"},{"from":1151.37,"to":1153.71,"location":2,"content":"possesses and then this dependency is"},{"from":1153.71,"to":1156.7,"location":2,"content":"the same and it's a man with a knife"},{"from":1156.7,"to":1160.07,"location":2,"content":"okay and so the interpretations of these"},{"from":1160.07,"to":1162.68,"location":2,"content":"sentences that you can get depend on"},{"from":1162.68,"to":1164.99,"location":2,"content":"putting different structures over these"},{"from":1164.99,"to":1168.77,"location":2,"content":"sentences in terms of who is what is"},{"from":1168.77,"to":1172.04,"location":2,"content":"modifying what here's another one that's"},{"from":1172.04,"to":1173.86,"location":2,"content":"just like that one"},{"from":1173.86,"to":1178.73,"location":2,"content":"scientists count whales from space okay"},{"from":1178.73,"to":1181.28,"location":2,"content":"so again this sentence has two possible"},{"from":1181.28,"to":1184.55,"location":2,"content":"structures right that we have the"},{"from":1184.55,"to":1186.77,"location":2,"content":"scientists are the subject that"},{"from":1186.77,"to":1190.63,"location":2,"content":"accounting and the whales are the object"},{"from":1190.63,"to":1194.81,"location":2,"content":"and well one possibility is that this is"},{"from":1194.81,"to":1197.41,"location":2,"content":"how they're doing the count"},{"from":1197.41,"to":1199.9,"location":2,"content":"so that they're counting the whales from"},{"from":1199.9,"to":1202.9,"location":2,"content":"space using something like satellite but"},{"from":1202.9,"to":1204.76,"location":2,"content":"the other possibility is that these"},{"from":1204.76,"to":1206.92,"location":2,"content":"parts are the same this is the subject"},{"from":1206.92,"to":1209.62,"location":2,"content":"and this is the object but these are"},{"from":1209.62,"to":1212.92,"location":2,"content":"whales from space which you know we"},{"from":1212.92,"to":1214.66,"location":2,"content":"could have analyzed as a noun phrase"},{"from":1214.66,"to":1219.6,"location":2,"content":"goes to a noun and a PP in our"},{"from":1219.6,"to":1222.28,"location":2,"content":"constituency grammar but it's dependency"},{"from":1222.28,"to":1224.2,"location":2,"content":"grammar we're saying oh this is now a"},{"from":1224.2,"to":1228.97,"location":2,"content":"modifier of the whales and that they are"},{"from":1228.97,"to":1231.88,"location":2,"content":"whales from space that are starting to"},{"from":1231.88,"to":1234.43,"location":2,"content":"turn up as in the bottom example 
right"},{"from":1234.43,"to":1236.86,"location":2,"content":"so obviously what you want is this one"},{"from":1236.86,"to":1239.91,"location":2,"content":"is correct and this one is here wrong"},{"from":1239.91,"to":1243.49,"location":2,"content":"and so this choice is referred to as a"},{"from":1243.49,"to":1245.38,"location":2,"content":"prepositional phrase attachment"},{"from":1245.38,"to":1247.63,"location":2,"content":"ambiguity and it's one of the most"},{"from":1247.63,"to":1250.48,"location":2,"content":"common and big you ities in the parsing"},{"from":1250.48,"to":1252.79,"location":2,"content":"of English right so here's our"},{"from":1252.79,"to":1255.61,"location":2,"content":"prepositional phrase from space and so"},{"from":1255.61,"to":1257.44,"location":2,"content":"in general when you have prepositional"},{"from":1257.44,"to":1260.73,"location":2,"content":"phrases and before it you have verbs and"},{"from":1260.73,"to":1263.8,"location":2,"content":"noun phrases or nouns that the"},{"from":1263.8,"to":1266.56,"location":2,"content":"prepositional phrase can modify either"},{"from":1266.56,"to":1269.2,"location":2,"content":"of the things that come beforehand right"},{"from":1269.2,"to":1272.17,"location":2,"content":"and so this is a crucial way in which"},{"from":1272.17,"to":1274.3,"location":2,"content":"human languages are different from"},{"from":1274.3,"to":1276.1,"location":2,"content":"programming languages right in"},{"from":1276.1,"to":1279.7,"location":2,"content":"programming languages we have hard rules"},{"from":1279.7,"to":1282.91,"location":2,"content":"as to how you meant to interpret things"},{"from":1282.91,"to":1285.37,"location":2,"content":"that dangle afterwards right so in"},{"from":1285.37,"to":1288.04,"location":2,"content":"programming languages you have an else"},{"from":1288.04,"to":1291.31,"location":2,"content":"is always construed with the closest if"},{"from":1291.31,"to":1293.83,"location":2,"content":"well if that's not what you want you"},{"from":1293.83,"to":1296.11,"location":2,"content":"have to use parentheses or indentation"},{"from":1296.11,"to":1297.61,"location":2,"content":"or something like that I guess it's"},{"from":1297.61,"to":1298.96,"location":2,"content":"different in Python because you have to"},{"from":1298.96,"to":1300.4,"location":2,"content":"use indentation but if we think of"},{"from":1300.4,"to":1302.44,"location":2,"content":"something like C or a similar language"},{"from":1302.44,"to":1306.25,"location":2,"content":"right if you haven't used braces to"},{"from":1306.25,"to":1308.89,"location":2,"content":"indicate it's just deterministically the"},{"from":1308.89,"to":1311.89,"location":2,"content":"else goes with the closest if but that's"},{"from":1311.89,"to":1314.53,"location":2,"content":"not how human languages are human"},{"from":1314.53,"to":1317.41,"location":2,"content":"languages are this prepositional phrase"},{"from":1317.41,"to":1320.44,"location":2,"content":"can go with anything proceeding and the"},{"from":1320.44,"to":1322.66,"location":2,"content":"hearer is assumed to be smart enough to"},{"from":1322.66,"to":1325.24,"location":2,"content":"work out the right one and you know"},{"from":1325.24,"to":1326.95,"location":2,"content":"that's actually put a large part of why"},{"from":1326.95,"to":1329.5,"location":2,"content":"human communication is so a"},{"from":1329.5,"to":1332.53,"location":2,"content":"efficient rave like we can do such 
a"},{"from":1332.53,"to":1334.21,"location":2,"content":"good job at communicating with each"},{"from":1334.21,"to":1336.82,"location":2,"content":"other because most of the time we don't"},{"from":1336.82,"to":1338.89,"location":2,"content":"have to see very much and there's this"},{"from":1338.89,"to":1341.62,"location":2,"content":"really smart person on the other end who"},{"from":1341.62,"to":1344.32,"location":2,"content":"can interpret the words that we say in"},{"from":1344.32,"to":1348.58,"location":2,"content":"the right way so that's where if you"},{"from":1348.58,"to":1350.38,"location":2,"content":"want to have artificial intelligence and"},{"from":1350.38,"to":1353.08,"location":2,"content":"smart computers we then start to need to"},{"from":1353.08,"to":1356.02,"location":2,"content":"build language understanding devices who"},{"from":1356.02,"to":1359.17,"location":2,"content":"can also work on that basis that they"},{"from":1359.17,"to":1362.62,"location":2,"content":"can just decide what would be the right"},{"from":1362.62,"to":1365.32,"location":2,"content":"thing for from space to modify and if we"},{"from":1365.32,"to":1366.79,"location":2,"content":"have that working really well we can"},{"from":1366.79,"to":1368.14,"location":2,"content":"then apply it back to programming"},{"from":1368.14,"to":1370.33,"location":2,"content":"languages and you could just not put in"},{"from":1370.33,"to":1371.89,"location":2,"content":"any braces and your programming"},{"from":1371.89,"to":1373.99,"location":2,"content":"languages and the compiler would work"},{"from":1373.99,"to":1376.75,"location":2,"content":"out what you meant okay so this is"},{"from":1376.75,"to":1378.76,"location":2,"content":"prepositional phrase a tetraman it's"},{"from":1378.76,"to":1381.7,"location":2,"content":"sort of seems near maybe not that hard"},{"from":1381.7,"to":1384.61,"location":2,"content":"there but you know it it gets worse I"},{"from":1384.61,"to":1387.16,"location":2,"content":"mean this isn't as fun an example but"},{"from":1387.16,"to":1390.4,"location":2,"content":"it's a real example of a sentence from"},{"from":1390.4,"to":1392.65,"location":2,"content":"The Wall Street Journal actually the"},{"from":1392.65,"to":1394.9,"location":2,"content":"board approved as acquisition by Royal"},{"from":1394.9,"to":1397.21,"location":2,"content":"Trust Co Limited of Toronto for twenty"},{"from":1397.21,"to":1399.7,"location":2,"content":"seven twenty seven dollars a share at"},{"from":1399.7,"to":1402.04,"location":2,"content":"its monthly meeting boring sentence but"},{"from":1402.04,"to":1404.44,"location":2,"content":"what is the structure of this sentence"},{"from":1404.44,"to":1406.75,"location":2,"content":"well you know we've got a verb here and"},{"from":1406.75,"to":1410.04,"location":2,"content":"we've got exactly the same subject and"},{"from":1410.04,"to":1415.15,"location":2,"content":"for this noun object coming after it but"},{"from":1415.15,"to":1417.04,"location":2,"content":"then what happens after that well here"},{"from":1417.04,"to":1418.84,"location":2,"content":"we've got a prepositional phrase here"},{"from":1418.84,"to":1420.49,"location":2,"content":"we've got a prepositional phrase"},{"from":1420.49,"to":1421.84,"location":2,"content":"we've just got to see it for"},{"from":1421.84,"to":1425.29,"location":2,"content":"prepositional phrases in a row and so"},{"from":1425.29,"to":1429.64,"location":2,"content":"well what we want to do is say for 
each"},{"from":1429.64,"to":1431.8,"location":2,"content":"of these prepositional phrases what they"},{"from":1431.8,"to":1434.44,"location":2,"content":"modify and starting off there are only"},{"from":1434.44,"to":1437.59,"location":2,"content":"two choices the verbum announce a de for"},{"from":1437.59,"to":1439.57,"location":2,"content":"but it's going to get more complicated"},{"from":1439.57,"to":1442.06,"location":2,"content":"as we go in because look there's another"},{"from":1442.06,"to":1444.36,"location":2,"content":"noun here and another noun here and"},{"from":1444.36,"to":1448.96,"location":2,"content":"another noun here and so once we start"},{"from":1448.96,"to":1450.31,"location":2,"content":"getting further in there'll be more"},{"from":1450.31,"to":1452.53,"location":2,"content":"possibilities okay so let's see if we"},{"from":1452.53,"to":1456.73,"location":2,"content":"can work it out so by Royal Trust Co"},{"from":1456.73,"to":1463.64,"location":2,"content":"Limited what's that modifying"},{"from":1463.64,"to":1466.08,"location":2,"content":"right it's the acquisition so it's not"},{"from":1466.08,"to":1468.18,"location":2,"content":"the board approved by Royal Trust Co"},{"from":1468.18,"to":1470.88,"location":2,"content":"limited it's an acquisition by Royal"},{"from":1470.88,"to":1473.94,"location":2,"content":"Trust Co Limited okay so this one is a"},{"from":1473.94,"to":1477.72,"location":2,"content":"dependent of the acquisition okay"},{"from":1477.72,"to":1480.45,"location":2,"content":"now we want to do of Toronto and we have"},{"from":1480.45,"to":1482.79,"location":2,"content":"three choices it could be this this or"},{"from":1482.79,"to":1483.72,"location":2,"content":"this"},{"from":1483.72,"to":1491.61,"location":2,"content":"okay so of Toronto is modifying its"},{"from":1491.61,"to":1500.73,"location":2,"content":"acquisition of Toronto is there another"},{"from":1500.73,"to":1505.1,"location":2,"content":"guess but what of Toronto is modifying"},{"from":1505.1,"to":1509.4,"location":2,"content":"its Royal Trust Co Limited of Toronto so"},{"from":1509.4,"to":1513.18,"location":2,"content":"this of Toronto is a dependent of Royal"},{"from":1513.18,"to":1515.61,"location":2,"content":"Trust Co Limited and Royal Trust Co"},{"from":1515.61,"to":1517.86,"location":2,"content":"Limited right that's this again sort of"},{"from":1517.86,"to":1519.66,"location":2,"content":"this noun phrase so it can also have"},{"from":1519.66,"to":1522.3,"location":2,"content":"modifiers by prepositional phrase okay"},{"from":1522.3,"to":1529.05,"location":2,"content":"for $27 a share is modifying acquisition"},{"from":1529.05,"to":1532.2,"location":2,"content":"right so now we leap right back"},{"from":1532.2,"to":1535.2,"location":2,"content":"oops I'm drawing this wrong now we leap"},{"from":1535.2,"to":1538.95,"location":2,"content":"right back and it's now the acquisition"},{"from":1538.95,"to":1541.32,"location":2,"content":"that's being modified and then finally"},{"from":1541.32,"to":1543.92,"location":2,"content":"we have at its monthly meeting is"},{"from":1543.92,"to":1550.65,"location":2,"content":"modifying right it's a prude yeah it's"},{"from":1550.65,"to":1553.68,"location":2,"content":"approved at its monthly meeting okay"},{"from":1553.68,"to":1558.3,"location":2,"content":"oops I drew that one oops I drew that"},{"from":1558.3,"to":1559.59,"location":2,"content":"one the wrong way round with the 
arrow"},{"from":1559.59,"to":1562.26,"location":2,"content":"sorry it's ribbing down this way I'm"},{"from":1562.26,"to":1568.17,"location":2,"content":"getting my arrows wrong whoops okay so"},{"from":1568.17,"to":1570.81,"location":2,"content":"that we've got this pattern of how"},{"from":1570.81,"to":1576.99,"location":2,"content":"things are modifying and so actually you"},{"from":1576.99,"to":1578.7,"location":2,"content":"know once you start having a lot of"},{"from":1578.7,"to":1581.61,"location":2,"content":"things that have choices like this you"},{"from":1581.61,"to":1584.34,"location":2,"content":"start having if I want to put an"},{"from":1584.34,"to":1587.28,"location":2,"content":"analysis on to this sentence of to work"},{"from":1587.28,"to":1590.61,"location":2,"content":"out the the right structure I have to"},{"from":1590.61,"to":1593.64,"location":2,"content":"potentially consider an exponential"},{"from":1593.64,"to":1594.54,"location":2,"content":"number"},{"from":1594.54,"to":1596.79,"location":2,"content":"possible structures because I've got"},{"from":1596.79,"to":1598.58,"location":2,"content":"this situation where for the first"},{"from":1598.58,"to":1601.11,"location":2,"content":"prepositional phrase there were two"},{"from":1601.11,"to":1603.84,"location":2,"content":"places that could have modified but the"},{"from":1603.84,"to":1605.79,"location":2,"content":"second prepositional phrase there are"},{"from":1605.79,"to":1607.47,"location":2,"content":"three places that could have modified"},{"from":1607.47,"to":1610.29,"location":2,"content":"for the fourth one there are five places"},{"from":1610.29,"to":1611.97,"location":2,"content":"that could have multiplied that just"},{"from":1611.97,"to":1614.43,"location":2,"content":"sounds like a factorial it's not quite"},{"from":1614.43,"to":1617.1,"location":2,"content":"as bad as a factorial because normally"},{"from":1617.1,"to":1619.68,"location":2,"content":"once you've left back that kind of"},{"from":1619.68,"to":1622.38,"location":2,"content":"closes off the ones in the middle and so"},{"from":1622.38,"to":1624.84,"location":2,"content":"further prepositional phrases have to be"},{"from":1624.84,"to":1627.03,"location":2,"content":"at least as far back in terms of what"},{"from":1627.03,"to":1629.43,"location":2,"content":"they modify and so if you get into this"},{"from":1629.43,"to":1631.8,"location":2,"content":"sort of combinatorics stuff the number"},{"from":1631.8,"to":1633.33,"location":2,"content":"of analyses you get when you get"},{"from":1633.33,"to":1635.58,"location":2,"content":"multiple prepositional phrases is this"},{"from":1635.58,"to":1638.28,"location":2,"content":"sequence called the Catalan numbers but"},{"from":1638.28,"to":1640.86,"location":2,"content":"that's still an exponential series and"},{"from":1640.86,"to":1643.35,"location":2,"content":"it's sort of one that turns up in a lot"},{"from":1643.35,"to":1645.15,"location":2,"content":"of places when they're a tree-like"},{"from":1645.15,"to":1648.9,"location":2,"content":"context so if any of you are doing or"},{"from":1648.9,"to":1651.86,"location":2,"content":"have done CS 228 where you see"},{"from":1651.86,"to":1655.25,"location":2,"content":"triangular triangulation of"},{"from":1655.25,"to":1657.69,"location":2,"content":"probabilistic graphical models and you"},{"from":1657.69,"to":1659.79,"location":2,"content":"ask how many triangulations there are"},{"from":1659.79,"to":1662.01,"location":2,"content":"that's sort of 
{"from":1667.17,"to":1669.84,"location":2,"content":"okay but so the point is we end up"},{"from":1669.84,"to":1673.08,"location":2,"content":"with a lot of ambiguities okay so that's"},{"from":1673.08,"to":1675.54,"location":2,"content":"prepositional phrase attachments a lot"},{"from":1675.54,"to":1677.73,"location":2,"content":"of those going on they're far from the"},{"from":1677.73,"to":1680.01,"location":2,"content":"only kind of ambiguities I want to tell"},{"from":1680.01,"to":1682.22,"location":2,"content":"you about a few others"},{"from":1682.22,"to":1685.29,"location":2,"content":"okay shuttle veteran and longtime NASA"},{"from":1685.29,"to":1687.81,"location":2,"content":"executive Fred Gregory appointed to"},{"from":1687.81,"to":1692.91,"location":2,"content":"board why is this sentence ambiguous what's"},{"from":1692.91,"to":1701.44,"location":2,"content":"one of the different readings of this sentence"},{"from":1701.44,"to":1705.47,"location":2,"content":"okay right answer so yeah there are two"},{"from":1705.47,"to":1708.14,"location":2,"content":"possibilities right that is either that"},{"from":1708.14,"to":1711.23,"location":2,"content":"there's somebody who's a shuttle veteran"},{"from":1711.23,"to":1712.13,"location":2,"content":"and a longtime"},{"from":1712.13,"to":1715.04,"location":2,"content":"NASA executive and their name is Fred"},{"from":1715.04,"to":1716.93,"location":2,"content":"Gregory and that they've been appointed"},{"from":1716.93,"to":1722.6,"location":2,"content":"to the board or the other possibility is"},{"from":1722.6,"to":1724.22,"location":2,"content":"that there's a shuttle veteran and"},{"from":1724.22,"to":1726.5,"location":2,"content":"there's a longtime NASA executive"},{"from":1726.5,"to":1729.47,"location":2,"content":"Fred Gregory and both of them have been"},{"from":1729.47,"to":1732.14,"location":2,"content":"appointed to the board and so again we"},{"from":1732.14,"to":1735.08,"location":2,"content":"can start to indicate the structure of"},{"from":1735.08,"to":1737.51,"location":2,"content":"that using our dependencies so we can"},{"from":1737.51,"to":1743.05,"location":2,"content":"either say okay"},{"from":1743.05,"to":1747.08,"location":2,"content":"there's Fred Gregory and then this"},{"from":1747.08,"to":1753.8,"location":2,"content":"person is a shuttle veteran and longtime"},{"from":1753.8,"to":1754.91,"location":2,"content":"um"},{"from":1754.91,"to":1759.05,"location":2,"content":"and longtime NASA executive or we can"},{"from":1759.05,"to":1762.74,"location":2,"content":"say well we're doing appointment of a"},{"from":1762.74,"to":1766.97,"location":2,"content":"shuttle veteran and the longtime NASA"},{"from":1766.97,"to":1769.28,"location":2,"content":"executive Fred Gregory and so we can"},{"from":1769.28,"to":1772.04,"location":2,"content":"represent by dependencies these two"},{"from":1772.04,"to":1777.55,"location":2,"content":"different structures okay that's one"},{"from":1777.55,"to":1780.86,"location":2,"content":"that one's not very funny again so"},{"from":1780.86,"to":1782.57,"location":2,"content":"here's a funnier example that"},{"from":1782.57,"to":1784.55,"location":2,"content":"illustrates the same ambiguity"},
ambiguity"},{"from":1784.55,"to":1786.02,"location":2,"content":"effectively"},{"from":1786.02,"to":1788.92,"location":2,"content":"so here's president's first physical"},{"from":1788.92,"to":1793.85,"location":2,"content":"doctor no heart cognitive issues so"},{"from":1793.85,"to":1797.47,"location":2,"content":"there isn't actually an explicit"},{"from":1797.47,"to":1800.9,"location":2,"content":"coordination word here but effectively"},{"from":1800.9,"to":1804.86,"location":2,"content":"in in natural language or certainly"},{"from":1804.86,"to":1808.61,"location":2,"content":"English you can use kind of just comma"},{"from":1808.61,"to":1811.15,"location":2,"content":"of sort of list intonation to"},{"from":1811.15,"to":1814.55,"location":2,"content":"effectively act as if it was an and or"},{"from":1814.55,"to":1819.17,"location":2,"content":"an or right so here we have again two"},{"from":1819.17,"to":1823.13,"location":2,"content":"possibilities that either we have issues"},{"from":1823.13,"to":1825.71,"location":2,"content":"and the dependent"},{"from":1825.71,"to":1828.29,"location":2,"content":"the dependencies of the dependencies of"},{"from":1828.29,"to":1831.35,"location":2,"content":"issues is that there are no issues so"},{"from":1831.35,"to":1834.56,"location":2,"content":"that's actually a determine are no"},{"from":1834.56,"to":1837.92,"location":2,"content":"issues and then it's sort of like no"},{"from":1837.92,"to":1841.31,"location":2,"content":"heart or cognitive issues so half is"},{"from":1841.31,"to":1843.44,"location":2,"content":"another dependent it's sort of a noun"},{"from":1843.44,"to":1846.92,"location":2,"content":"compound heart issues and so we refer to"},{"from":1846.92,"to":1850.01,"location":2,"content":"as an independancy and then it's heart"},{"from":1850.01,"to":1856.37,"location":2,"content":"or cognitive so that hard or cognitive"},{"from":1856.37,"to":1860.9,"location":2,"content":"is a conjoined phrase inside of this no"},{"from":1860.9,"to":1863.87,"location":2,"content":"heart or cognitive issues but there's"},{"from":1863.87,"to":1867.56,"location":2,"content":"another possibility which is that the"},{"from":1867.56,"to":1870.11,"location":2,"content":"coordination is at the top level that we"},{"from":1870.11,"to":1874.9,"location":2,"content":"have no heart and cognitive issues and"},{"from":1874.9,"to":1878.42,"location":2,"content":"at that point we've have the cognitive"},{"from":1878.42,"to":1881.93,"location":2,"content":"as an adjective modifier the issues and"},{"from":1881.93,"to":1884.9,"location":2,"content":"the know heart the determiner is just a"},{"from":1884.9,"to":1887.87,"location":2,"content":"modifier of heart and then these are"},{"from":1887.87,"to":1891.41,"location":2,"content":"being conjoined together so hard hazard"},{"from":1891.41,"to":1897.34,"location":2,"content":"as a coordinated dependency of issues"},{"from":1897.34,"to":1902.69,"location":2,"content":"okay that's one one I've got more funny"},{"from":1902.69,"to":1903.2,"location":2,"content":"ones"},{"from":1903.2,"to":1909.2,"location":2,"content":"if student gets hmm okay so what the"},{"from":1909.2,"to":1912.44,"location":2,"content":"person who wrote this intended to have"},{"from":1912.44,"to":1915.41,"location":2,"content":"is that there we here we've got an"},{"from":1915.41,"to":1917.72,"location":2,"content":"adjective modifying ambiguity so the"},{"from":1917.72,"to":1922.1,"location":2,"content":"intended reading was that first is 
an"},{"from":1922.1,"to":1925.91,"location":2,"content":"adjective or modifier of first hand and"},{"from":1925.91,"to":1928.57,"location":2,"content":"its first hand experience"},{"from":1928.57,"to":1932.51,"location":2,"content":"so the first hand is a modifier of"},{"from":1932.51,"to":1936.2,"location":2,"content":"experience and the job is also a"},{"from":1936.2,"to":1938.99,"location":2,"content":"modifier of experience and then we have"},{"from":1938.99,"to":1945.23,"location":2,"content":"the same kind of subject object reading"},{"from":1945.23,"to":1950,"location":2,"content":"on that one but unfortunately this"},{"from":1950,"to":1954.77,"location":2,"content":"sentence has a different reading where"},{"from":1954.77,"to":1956.48,"location":2,"content":"you change the modification"},{"from":1956.48,"to":1958.25,"location":2,"content":"relationships"},{"from":1958.25,"to":1960.95,"location":2,"content":"and you have it's the first experience"},{"from":1960.95,"to":1969.11,"location":2,"content":"and it goes like this okay one more"},{"from":1969.11,"to":1973.52,"location":2,"content":"example mutilated body washes up on Rio"},{"from":1973.52,"to":1975.92,"location":2,"content":"beach to be used for Olympics beach"},{"from":1975.92,"to":1982.46,"location":2,"content":"volleyball what water what are the two"},{"from":1982.46,"to":1984.62,"location":2,"content":"and bigger one of the two readings that"},{"from":1984.62,"to":1992.48,"location":2,"content":"you can get to this one okay so we've"},{"from":1992.48,"to":1994.91,"location":2,"content":"got this big phrase that I won't and try"},{"from":1994.91,"to":1998,"location":2,"content":"and put a structure over to be used for"},{"from":1998,"to":2001.9,"location":2,"content":"Olympic beach volleyball and then you"},{"from":2001.9,"to":2003.37,"location":2,"content":"know this is sort of like a"},{"from":2003.37,"to":2005.02,"location":2,"content":"prepositional phrase attachment"},{"from":2005.02,"to":2007.87,"location":2,"content":"ambiguity but it's this time instead of"},{"from":2007.87,"to":2009.94,"location":2,"content":"it's a prepositional phrase that's being"},{"from":2009.94,"to":2012.25,"location":2,"content":"attached we've now got this big verb"},{"from":2012.25,"to":2014.47,"location":2,"content":"phrase we call it right so that when"},{"from":2014.47,"to":2016.36,"location":2,"content":"you've sort of got most of a sentence"},{"from":2016.36,"to":2018.07,"location":2,"content":"but without any subject to it that's"},{"from":2018.07,"to":2020.29,"location":2,"content":"sort of a verb phrase to be used for"},{"from":2020.29,"to":2022.54,"location":2,"content":"Olympic beach volleyball which might be"},{"from":2022.54,"to":2024.97,"location":2,"content":"then finit of form sometimes it's in"},{"from":2024.97,"to":2027.31,"location":2,"content":"participial form like being used for"},{"from":2027.31,"to":2030.46,"location":2,"content":"beach volleyball and really those kind"},{"from":2030.46,"to":2033.3,"location":2,"content":"of verb phrases are sort of just like"},{"from":2033.3,"to":2035.53,"location":2,"content":"prepositional phrases whenever they"},{"from":2035.53,"to":2037.12,"location":2,"content":"appear towards the right end of"},{"from":2037.12,"to":2039.88,"location":2,"content":"sentences they can modify various things"},{"from":2039.88,"to":2044.92,"location":2,"content":"like verbs or nouns so here we have two"},{"from":2044.92,"to":2047.14,"location":2,"content":"possibilities so this to be used 
for"},{"from":2047.14,"to":2049.84,"location":2,"content":"Olympics beach volleyball what the right"},{"from":2049.84,"to":2052.45,"location":2,"content":"answer is meant to be is that that is a"},{"from":2052.45,"to":2055.41,"location":2,"content":"dependent of the Rio beach so it's a"},{"from":2055.41,"to":2059.26,"location":2,"content":"modifier of the Rio beach but the funny"},{"from":2059.26,"to":2063.85,"location":2,"content":"reading is that instead of that we can"},{"from":2063.85,"to":2066.21,"location":2,"content":"have here's another noun phrase muta"},{"from":2066.21,"to":2069.55,"location":2,"content":"mutilated body and it's the mutilated"},{"from":2069.55,"to":2073.87,"location":2,"content":"body that's going to be used and so then"},{"from":2073.87,"to":2078.3,"location":2,"content":"this would be a noun phrase modifier"},{"from":2078.3,"to":2082.98,"location":2,"content":"love that okay so knowing the right"},{"from":2082.98,"to":2086.44,"location":2,"content":"structure of sentences is important to"},{"from":2086.44,"to":2088.09,"location":2,"content":"understand the interpretations you're"},{"from":2088.09,"to":2090.07,"location":2,"content":"meant to get and the interpretations"},{"from":2090.07,"to":2091.63,"location":2,"content":"you're not meant to get"},{"from":2091.63,"to":2095.66,"location":2,"content":"okay but it's it's sort of okay you know"},{"from":2095.66,"to":2097.58,"location":2,"content":"us using funny examples for the obvious"},{"from":2097.58,"to":2099.47,"location":2,"content":"reason but you know this is sort of"},{"from":2099.47,"to":2101.81,"location":2,"content":"essential to all the things that we'd"},{"from":2101.81,"to":2104.6,"location":2,"content":"like to get out of language most of the"},{"from":2104.6,"to":2106.76,"location":2,"content":"time so you know this is back to the"},{"from":2106.76,"to":2108.77,"location":2,"content":"kind of boring stuff that we often work"},{"from":2108.77,"to":2110.93,"location":2,"content":"with are reading through biomedical"},{"from":2110.93,"to":2113.6,"location":2,"content":"research articles and trying to extract"},{"from":2113.6,"to":2116.15,"location":2,"content":"facts about protein-protein interactions"},{"from":2116.15,"to":2118.43,"location":2,"content":"from them or something like that so you"},{"from":2118.43,"to":2121.01,"location":2,"content":"know this is the results demonstrated"},{"from":2121.01,"to":2124.04,"location":2,"content":"that Chi C interacts rhythmically with"},{"from":2124.04,"to":2131.03,"location":2,"content":"sass a cut Chi a and Chi B and well"},{"from":2131.03,"to":2136.85,"location":2,"content":"whoops turn those notifications off so"},{"from":2136.85,"to":2138.65,"location":2,"content":"if we want to get out sort of"},{"from":2138.65,"to":2141.56,"location":2,"content":"protein-protein interaction facts you"},{"from":2141.56,"to":2143.87,"location":2,"content":"know well we have this Chi see there's"},{"from":2143.87,"to":2146.03,"location":2,"content":"interacting with these other proteins"},{"from":2146.03,"to":2149.12,"location":2,"content":"over there and well the way we can do"},{"from":2149.12,"to":2151.73,"location":2,"content":"that is looking at patterns and our"},{"from":2151.73,"to":2155.21,"location":2,"content":"dependency analysis and so that we can"},{"from":2155.21,"to":2158.51,"location":2,"content":"sort of see this repeated pattern where"},{"from":2158.51,"to":2164.68,"location":2,"content":"you have the noun subject here 
interacts"},{"from":2164.68,"to":2169.28,"location":2,"content":"with and now modifier and then it's"},{"from":2169.28,"to":2170.99,"location":2,"content":"going to be these things that are"},{"from":2170.99,"to":2173.24,"location":2,"content":"beneath that of the sass a and it's"},{"from":2173.24,"to":2176.45,"location":2,"content":"conjoined things Calle and Chi B are the"},{"from":2176.45,"to":2178.43,"location":2,"content":"things that interacts with so we can"},{"from":2178.43,"to":2181.52,"location":2,"content":"kind of think of these two things as"},{"from":2181.52,"to":2185.51,"location":2,"content":"essentially patterns oops I actually"},{"from":2185.51,"to":2188.06,"location":2,"content":"miss editor this sorry this should also"},{"from":2188.06,"to":2193.46,"location":2,"content":"be involved with whoops we can kind of"},{"from":2193.46,"to":2195.2,"location":2,"content":"think of these two things as sort of"},{"from":2195.2,"to":2197.57,"location":2,"content":"patterns and dependencies that we could"},{"from":2197.57,"to":2201.44,"location":2,"content":"look for to find examples of just"},{"from":2201.44,"to":2203.84,"location":2,"content":"protein protein interactions that appear"},{"from":2203.84,"to":2209.69,"location":2,"content":"in biomedical texts okay so that's the"},{"from":2209.69,"to":2211.88,"location":2,"content":"general idea of what we want to do and"},{"from":2211.88,"to":2213.92,"location":2,"content":"so the tool we want to do it with is"},{"from":2213.92,"to":2216.35,"location":2,"content":"these dependency grammars and so I've"},{"from":2216.35,"to":2218.45,"location":2,"content":"sort of shown you some dependency"},{"from":2218.45,"to":2221.15,"location":2,"content":"grammars I just want to sort of motivate"},{"from":2221.15,"to":2224.09,"location":2,"content":"dependency grammar a bit more formally"},{"from":2224.09,"to":2225.33,"location":2,"content":"and"},{"from":2225.33,"to":2229.2,"location":2,"content":"right so dependency grammar postulates"},{"from":2229.2,"to":2232.08,"location":2,"content":"of what a syntactic structure is is that"},{"from":2232.08,"to":2235.2,"location":2,"content":"you have relations between lexical items"},{"from":2235.2,"to":2237.03,"location":2,"content":"that are sort of binary asymmetric"},{"from":2237.03,"to":2239.34,"location":2,"content":"relations which we draw as arrows"},{"from":2239.34,"to":2241.26,"location":2,"content":"because they're binary and asymmetric"},{"from":2241.26,"to":2244.23,"location":2,"content":"and we call dependencies and there's"},{"from":2244.23,"to":2246.63,"location":2,"content":"sort of two ways common ways of writing"},{"from":2246.63,"to":2249.54,"location":2,"content":"them and I've sort of shown both now one"},{"from":2249.54,"to":2251.31,"location":2,"content":"way is you sort of put the words in a"},{"from":2251.31,"to":2254.73,"location":2,"content":"line and that makes it easier to see the"},{"from":2254.73,"to":2256.41,"location":2,"content":"whole sentence and you draw those sort"},{"from":2256.41,"to":2258.69,"location":2,"content":"of loopy arrows above them and the other"},{"from":2258.69,"to":2260.91,"location":2,"content":"way is you sort of more represented as a"},{"from":2260.91,"to":2263.13,"location":2,"content":"tree where you put the head of the whole"},{"from":2263.13,"to":2265.8,"location":2,"content":"sentence at the top submitted and then"},{"from":2265.8,"to":2269.19,"location":2,"content":"you say the dependents of submitted 
our"},{"from":2269.19,"to":2271.41,"location":2,"content":"bills were and brown back and then you"},{"from":2271.41,"to":2275.24,"location":2,"content":"say the dependents of each of those so"},{"from":2275.24,"to":2278.4,"location":2,"content":"it was bills on ports and immigration so"},{"from":2278.4,"to":2280.8,"location":2,"content":"they're dependents of bills and were"},{"from":2280.8,"to":2282.36,"location":2,"content":"submitted whereas at the pin of"},{"from":2282.36,"to":2284.1,"location":2,"content":"submitted and you're giving this kind of"},{"from":2284.1,"to":2289.23,"location":2,"content":"tree structure okay so in addition to"},{"from":2289.23,"to":2292.95,"location":2,"content":"the arrows commonly what we do is we put"},{"from":2292.95,"to":2296.34,"location":2,"content":"and type on each arrow which says what"},{"from":2296.34,"to":2298.23,"location":2,"content":"grammatical relations holding them"},{"from":2298.23,"to":2300.69,"location":2,"content":"between them so is this the subject of"},{"from":2300.69,"to":2302.76,"location":2,"content":"the sentence is that the object of the"},{"from":2302.76,"to":2306.87,"location":2,"content":"verb is it a conjunct and things like"},{"from":2306.87,"to":2309.45,"location":2,"content":"that we have a system of dependency"},{"from":2309.45,"to":2313.23,"location":2,"content":"labels so for the assignment what we're"},{"from":2313.23,"to":2316.05,"location":2,"content":"going to do is use Universal"},{"from":2316.05,"to":2318.09,"location":2,"content":"dependencies which I'll show you more a"},{"from":2318.09,"to":2320.4,"location":2,"content":"little bit more of in a minute and if"},{"from":2320.4,"to":2322.77,"location":2,"content":"you think man this stuff is fascinating"},{"from":2322.77,"to":2324.36,"location":2,"content":"I want to learn all about these linguist"},{"from":2324.36,"to":2326.31,"location":2,"content":"structures there's a universal"},{"from":2326.31,"to":2328.89,"location":2,"content":"dependency side that you go can go off"},{"from":2328.89,"to":2331.32,"location":2,"content":"and look at and learn all about them but"},{"from":2331.32,"to":2333.48,"location":2,"content":"if you don't think that's fascinating"},{"from":2333.48,"to":2335.82,"location":2,"content":"for what we're doing for this class"},{"from":2335.82,"to":2338.01,"location":2,"content":"we're never going to make use of these"},{"from":2338.01,"to":2341.22,"location":2,"content":"labels all we're doing is making use of"},{"from":2341.22,"to":2344.64,"location":2,"content":"the arrows and for the arrows you should"},{"from":2344.64,"to":2346.05,"location":2,"content":"be able to interpret things like"},{"from":2346.05,"to":2348.54,"location":2,"content":"prepositional phrases as to what they're"},{"from":2348.54,"to":2351,"location":2,"content":"modifying just in terms of where the"},{"from":2351,"to":2352.98,"location":2,"content":"prepositional phrases are connected and"},{"from":2352.98,"to":2356.94,"location":2,"content":"whether that's right or wrong okay yes"},{"from":2356.94,"to":2358.56,"location":2,"content":"so formally when we"},{"from":2358.56,"to":2360.93,"location":2,"content":"this kind of dependency grammar we've"},{"from":2360.93,"to":2363.72,"location":2,"content":"sort of drawing these arrows and we sort"},{"from":2363.72,"to":2366.12,"location":2,"content":"of refer to the thing at this end as the"},{"from":2366.12,"to":2369.03,"location":2,"content":"head of a dependency and the thing 
at"},{"from":2369.03,"to":2371.88,"location":2,"content":"this end as the dependent of the"},{"from":2371.88,"to":2375,"location":2,"content":"dependency and as in these examples our"},{"from":2375,"to":2377.97,"location":2,"content":"normal expectation and what our parses"},{"from":2377.97,"to":2380.19,"location":2,"content":"are going to do is the dependencies form"},{"from":2380.19,"to":2383.22,"location":2,"content":"a tree so it's a connected acyclic"},{"from":2383.22,"to":2386.49,"location":2,"content":"single rooted graph at the end of the"},{"from":2386.49,"to":2387.5,"location":2,"content":"day"},{"from":2387.5,"to":2390.81,"location":2,"content":"okay so dependency grammar has an"},{"from":2390.81,"to":2393.99,"location":2,"content":"enormous lee long history so basically"},{"from":2393.99,"to":2396.72,"location":2,"content":"the famous first linguist that human"},{"from":2396.72,"to":2399.93,"location":2,"content":"beings know about is parney who wrote in"},{"from":2399.93,"to":2401.73,"location":2,"content":"the 5th century before the Common Era"},{"from":2401.73,"to":2404.43,"location":2,"content":"and tried to describe the structure of"},{"from":2404.43,"to":2407.43,"location":2,"content":"sanskrit and a lot of what parney did"},{"from":2407.43,"to":2409.71,"location":2,"content":"was working out things about all of the"},{"from":2409.71,"to":2411.66,"location":2,"content":"morphology of Sanskrit that I'm not"},{"from":2411.66,"to":2414.6,"location":2,"content":"going to touch at the moment but beyond"},{"from":2414.6,"to":2416.43,"location":2,"content":"that he started trying to describe this"},{"from":2416.43,"to":2419.36,"location":2,"content":"the structure of Sanskrit sentences and"},{"from":2419.36,"to":2422.19,"location":2,"content":"the notation was sort of different but"},{"from":2422.19,"to":2424.8,"location":2,"content":"essentially the mechanism he used for"},{"from":2424.8,"to":2427.26,"location":2,"content":"describing the structure of Sanskrit was"},{"from":2427.26,"to":2429.21,"location":2,"content":"dependencies of sort of working out"},{"from":2429.21,"to":2432.42,"location":2,"content":"these what our arguments and modifiers"},{"from":2432.42,"to":2434.49,"location":2,"content":"of what relationships like we've been"},{"from":2434.49,"to":2437.58,"location":2,"content":"looking at and indeed if you look at"},{"from":2437.58,"to":2440.3,"location":2,"content":"kind of the history of humankind"},{"from":2440.3,"to":2443.82,"location":2,"content":"most of attempts to understand the"},{"from":2443.82,"to":2445.53,"location":2,"content":"structure of human languages are"},{"from":2445.53,"to":2450.39,"location":2,"content":"essentially dependency grammars so sort"},{"from":2450.39,"to":2452.1,"location":2,"content":"of in the later parts of the first"},{"from":2452.1,"to":2453.96,"location":2,"content":"millennium there was a ton of work by"},{"from":2453.96,"to":2456.6,"location":2,"content":"Arabic grammarians and essentially what"},{"from":2456.6,"to":2459.27,"location":2,"content":"they used is also kind of basically a"},{"from":2459.27,"to":2462.6,"location":2,"content":"dependency grammar so compared to that"},{"from":2462.6,"to":2464.94,"location":2,"content":"you know the idea of context-free"},{"from":2464.94,"to":2467.01,"location":2,"content":"grammars and phrase structure grammar z'"},{"from":2467.01,"to":2470.46,"location":2,"content":"is incredibly incredibly new I mean you"},{"from":2470.46,"to":2472.65,"location":2,"content":"can basically 
{"from":2472.65,"to":2476.01,"location":2,"content":"this guy Wells in 1947 who first"},{"from":2476.01,"to":2477.98,"location":2,"content":"proposed this idea of having these"},{"from":2477.98,"to":2480.09,"location":2,"content":"constituents and phrase structure"},{"from":2480.09,"to":2482.01,"location":2,"content":"grammars and where it then became"},{"from":2482.01,"to":2484.02,"location":2,"content":"really famous is through the work of"},{"from":2484.02,"to":2487.41,"location":2,"content":"Chomsky who love him or hate him is by"},{"from":2487.41,"to":2491.31,"location":2,"content":"far the most famous linguist and also"},{"from":2491.31,"to":2493.08,"location":2,"content":"variously contributed to computer"},{"from":2493.08,"to":2495.12,"location":2,"content":"science who's heard of the Chomsky"},{"from":2495.12,"to":2497.25,"location":2,"content":"hierarchy do people remember that from CS103"},{"from":2497.25,"to":2500.31,"location":2,"content":"yeah okay the Chomsky hierarchy the"},{"from":2500.31,"to":2503.13,"location":2,"content":"Chomsky hierarchy was not invented to"},{"from":2503.13,"to":2505.53,"location":2,"content":"torture beginning computer science"},{"from":2505.53,"to":2508.44,"location":2,"content":"students the Chomsky hierarchy was"},{"from":2508.44,"to":2511.14,"location":2,"content":"invented because Chomsky wanted to make"},{"from":2511.14,"to":2514.05,"location":2,"content":"arguments as to what the complexity of"},{"from":2514.05,"to":2518.91,"location":2,"content":"human languages was okay yeah so in"},{"from":2518.91,"to":2520.5,"location":2,"content":"modern work"},{"from":2520.5,"to":2524.4,"location":2,"content":"there's this guy Lucien Tesnière and he"},{"from":2524.4,"to":2527.01,"location":2,"content":"sort of formalized the kind of version"},{"from":2527.01,"to":2528.66,"location":2,"content":"of dependency grammar that I've been"},{"from":2528.66,"to":2532.26,"location":2,"content":"showing you and so we sort of often talk"},{"from":2532.26,"to":2535.08,"location":2,"content":"about his work and you know it's"},{"from":2535.08,"to":2536.88,"location":2,"content":"long term been influential in"},{"from":2536.88,"to":2538.56,"location":2,"content":"computational linguistics some of the"},{"from":2538.56,"to":2540.51,"location":2,"content":"earliest parsing work in US"},{"from":2540.51,"to":2542.73,"location":2,"content":"computational linguistics was dependency"},{"from":2542.73,"to":2544.82,"location":2,"content":"grammars but I won't go on about that"},{"from":2544.82,"to":2550.88,"location":2,"content":"more now okay just one or two little things"},{"from":2550.88,"to":2554.22,"location":2,"content":"to note I mean if you somehow start"},{"from":2554.22,"to":2555.99,"location":2,"content":"looking at other papers where there are"},{"from":2555.99,"to":2558.36,"location":2,"content":"dependency grammars people aren't"},{"from":2558.36,"to":2561,"location":2,"content":"consistent on which way to have the"},{"from":2561,"to":2563.73,"location":2,"content":"arrows point there are sort of two ways"},{"from":2563.73,"to":2566.7,"location":2,"content":"of thinking about this that you can"},{"from":2566.7,"to":2570.09,"location":2,"content":"either think okay I'm going to start at"},{"from":2570.09,"to":2573.69,"location":2,"content":"the head and point to the dependent or"},{"from":2573.69,"to":2575.34,"location":2,"content":"you can say I'm going to start at the"},{"from":2575.34,"to":2577.77,"location":2,"content":"dependent and say what its head is and"},
{"from":2577.77,"to":2580.14,"location":2,"content":"you find both of them the way we're"},{"from":2580.14,"to":2582.6,"location":2,"content":"going to do it in this class is to do it"},{"from":2582.6,"to":2585,"location":2,"content":"the way Tesnière did it which was you"},{"from":2585,"to":2586.95,"location":2,"content":"start at the head and point to the"},{"from":2586.95,"to":2589.95,"location":2,"content":"dependent well sorry"},{"from":2589.95,"to":2593.07,"location":2,"content":"I'm drawing that wrong whoops because"},{"from":2593.07,"to":2595.02,"location":2,"content":"discussion of the outstanding issues so"},{"from":2595.02,"to":2599.07,"location":2,"content":"really the dependent of discussion is issues"},{"from":2599.07,"to":2601.74,"location":2,"content":"okay but we go from heads to dependents"},{"from":2601.74,"to":2605.4,"location":2,"content":"and usually it's convenient in"},{"from":2605.4,"to":2607.35,"location":2,"content":"addition to the sentence to sort of have"},{"from":2607.35,"to":2610.53,"location":2,"content":"a fake root node that points to the head"},{"from":2610.53,"to":2612.36,"location":2,"content":"of the whole sentence and so we use that"},{"from":2612.36,"to":2620.46,"location":2,"content":"as well okay so to build dependency"},{"from":2620.46,"to":2624.11,"location":2,"content":"parsers or indeed to build any kind of"},{"from":2624.11,"to":2628.23,"location":2,"content":"human language structure finders"},{"from":2628.23,"to":2630.51,"location":2,"content":"including kind of constituency grammar"},{"from":2630.51,"to":2635.01,"location":2,"content":"parsers the central tool in recent work"},{"from":2635.01,"to":2638.54,"location":2,"content":"where recent work kind of means the last"},{"from":2638.54,"to":2642.12,"location":2,"content":"25 years has been this idea of tree"},{"from":2642.12,"to":2646.23,"location":2,"content":"banks and the idea of tree banks is to"},{"from":2646.23,"to":2649.89,"location":2,"content":"say we are going to get human beings to"},{"from":2649.89,"to":2653.76,"location":2,"content":"sit around and put grammatical structure"},{"from":2653.76,"to":2656.37,"location":2,"content":"over sentences and so here are some"},{"from":2656.37,"to":2658.26,"location":2,"content":"examples I'm showing you from Universal"},{"from":2658.26,"to":2661.26,"location":2,"content":"Dependencies where here are some English"},{"from":2661.26,"to":2663.54,"location":2,"content":"sentences I think Miramar was the famous"},{"from":2663.54,"to":2666.48,"location":2,"content":"goat trainer or something and some human"},{"from":2666.48,"to":2669.24,"location":2,"content":"being has sat and put a dependency"},{"from":2669.24,"to":2671.07,"location":2,"content":"structure over this sentence and all the"},{"from":2671.07,"to":2674.13,"location":2,"content":"rest and with the name Universal"},{"from":2674.13,"to":2676.31,"location":2,"content":"Dependencies this is just an aside"},{"from":2676.31,"to":2678.3,"location":2,"content":"Universal Dependencies is actually a"},{"from":2678.3,"to":2680.31,"location":2,"content":"project I've been strongly involved with"},{"from":2680.31,"to":2682.65,"location":2,"content":"but precisely what the goal of Universal"},{"from":2682.65,"to":2685.41,"location":2,"content":"Dependencies was is to say what we'd"},{"from":2685.41,"to":2689.25,"location":2,"content":"like to do is have a uniform parallel"},{"from":2689.25,"to":2692.46,"location":2,"content":"system of dependency description which"},
which"},{"from":2692.46,"to":2694.98,"location":2,"content":"could be used for any human language so"},{"from":2694.98,"to":2697.56,"location":2,"content":"if you go to the universal dependencies"},{"from":2697.56,"to":2700.59,"location":2,"content":"website it's not only about English you"},{"from":2700.59,"to":2703.47,"location":2,"content":"can find Universal dependency analyses"},{"from":2703.47,"to":2707.34,"location":2,"content":"of no French or German or Finnish or"},{"from":2707.34,"to":2710.7,"location":2,"content":"Kasich or Indonesia and lots of"},{"from":2710.7,"to":2713.34,"location":2,"content":"languages of course there are even more"},{"from":2713.34,"to":2715.29,"location":2,"content":"languages which there aren't Universal"},{"from":2715.29,"to":2717.36,"location":2,"content":"dependencies analyses of so if you have"},{"from":2717.36,"to":2720.06,"location":2,"content":"a big calling to say I'm going to build"},{"from":2720.06,"to":2723.3,"location":2,"content":"a Swahili Universal dependencies tree"},{"from":2723.3,"to":2727.08,"location":2,"content":"bank you can get in touch there anyway"},{"from":2727.08,"to":2729.3,"location":2,"content":"so this is the idea of tree banks and"},{"from":2729.3,"to":2734.4,"location":2,"content":"you know historically tree banks wasn't"},{"from":2734.4,"to":2735.81,"location":2,"content":"something that people thought of"},{"from":2735.81,"to":2738.6,"location":2,"content":"immediately this is such an idea that"},{"from":2738.6,"to":2740.73,"location":2,"content":"took quite a long time to develop right"},{"from":2740.73,"to":2743.67,"location":2,"content":"that people started thinking about"},{"from":2743.67,"to":2746.01,"location":2,"content":"grammars of languages even in modern"},{"from":2746.01,"to":2749.46,"location":2,"content":"times in the 50s and people started"},{"from":2749.46,"to":2750.95,"location":2,"content":"building"},{"from":2750.95,"to":2756.05,"location":2,"content":"four languages in the 1919 60s and so"},{"from":2756.05,"to":2759.68,"location":2,"content":"there's decades of work in the 60s 70s"},{"from":2759.68,"to":2763.37,"location":2,"content":"80s and no one had tree banks the way"},{"from":2763.37,"to":2766.4,"location":2,"content":"people did this work is that they wrote"},{"from":2766.4,"to":2768.44,"location":2,"content":"grammars that they either wrote grammars"},{"from":2768.44,"to":2770.57,"location":2,"content":"like the one I did for constituency of"},{"from":2770.57,"to":2773.21,"location":2,"content":"noun phrase goes determiner optional"},{"from":2773.21,"to":2777.17,"location":2,"content":"adjective noun noun goes to goat or the"},{"from":2777.17,"to":2779.84,"location":2,"content":"equivalent kind of grammars in a"},{"from":2779.84,"to":2782.78,"location":2,"content":"dependency format and they hand-built"},{"from":2782.78,"to":2786.86,"location":2,"content":"these grammars and then train had"},{"from":2786.86,"to":2789.2,"location":2,"content":"parsers that could parse these sentences"},{"from":2789.2,"to":2793.49,"location":2,"content":"and going into things having a human"},{"from":2793.49,"to":2794.12,"location":2,"content":"being"},{"from":2794.12,"to":2797.39,"location":2,"content":"writer grammar feels more efficient"},{"from":2797.39,"to":2800.27,"location":2,"content":"because if you write a rule like noun"},{"from":2800.27,"to":2802.4,"location":2,"content":"phrase goes to terminal optional"},{"from":2802.4,"to":2805.01,"location":2,"content":"adjective noun I mean that 
that"},{"from":2805.01,"to":2807.29,"location":2,"content":"describes a huge number of phrases"},{"from":2807.29,"to":2809.23,"location":2,"content":"watching an infinite number of phrases"},{"from":2809.23,"to":2811.64,"location":2,"content":"so that you know this is the structure"},{"from":2811.64,"to":2813.65,"location":2,"content":"of you know the cat the dog a cat our"},{"from":2813.65,"to":2816.08,"location":2,"content":"dog a large dog all those things we saw"},{"from":2816.08,"to":2817.52,"location":2,"content":"at the beginning so it's really"},{"from":2817.52,"to":2819.35,"location":2,"content":"efficient you're capturing lots of stuff"},{"from":2819.35,"to":2823.82,"location":2,"content":"with one rule but it sort of turned out"},{"from":2823.82,"to":2826.58,"location":2,"content":"that in practice that wasn't such a good"},{"from":2826.58,"to":2829.19,"location":2,"content":"idea and it turned out to be much better"},{"from":2829.19,"to":2831.89,"location":2,"content":"to have these kind of tree banks"},{"from":2831.89,"to":2834.41,"location":2,"content":"supporting structures over sentences and"},{"from":2834.41,"to":2836.75,"location":2,"content":"it's sort of a bit more subtle as to why"},{"from":2836.75,"to":2838.88,"location":2,"content":"that is because it sounds like pretty"},{"from":2838.88,"to":2841.55,"location":2,"content":"menial work building tree banks and in"},{"from":2841.55,"to":2845.36,"location":2,"content":"some sense it is but you know it turns"},{"from":2845.36,"to":2848.54,"location":2,"content":"out to be much more useful I mean so one"},{"from":2848.54,"to":2852.14,"location":2,"content":"huge benefit is that tree banks are very"},{"from":2852.14,"to":2854.81,"location":2,"content":"reusable that effectively what there was"},{"from":2854.81,"to":2858.02,"location":2,"content":"in 60s 70s and 80s was that every"},{"from":2858.02,"to":2860.78,"location":2,"content":"different NOP person who said about"},{"from":2860.78,"to":2863.12,"location":2,"content":"building a parser invented their own"},{"from":2863.12,"to":2865.43,"location":2,"content":"notation for grammar rules which got"},{"from":2865.43,"to":2867.62,"location":2,"content":"more and more complex and it was only"},{"from":2867.62,"to":2869.78,"location":2,"content":"used by their parser and nobody else's"},{"from":2869.78,"to":2872.12,"location":2,"content":"parser and so there was no sharing and"},{"from":2872.12,"to":2873.86,"location":2,"content":"reuse of the work that was done by human"},{"from":2873.86,"to":2875.84,"location":2,"content":"beings well once you have a tree bank"},{"from":2875.84,"to":2878.39,"location":2,"content":"it's reusable for all sorts of purposes"},{"from":2878.39,"to":2881.15,"location":2,"content":"that lots of people build parsers from"},{"from":2881.15,"to":2883.13,"location":2,"content":"it but also other people use it as well"},{"from":2883.13,"to":2884.07,"location":2,"content":"like linguist"},{"from":2884.07,"to":2886.2,"location":2,"content":"now often use tree banks to find"},{"from":2886.2,"to":2889.26,"location":2,"content":"examples of different constructions but"},{"from":2889.26,"to":2891.57,"location":2,"content":"beyond that this sort of just became"},{"from":2891.57,"to":2893.76,"location":2,"content":"necessary once we wanted to do machine"},{"from":2893.76,"to":2896.46,"location":2,"content":"learning so if we want to do machine"},{"from":2896.46,"to":2898.77,"location":2,"content":"learning we want to have data that 
we"},{"from":2898.77,"to":2901.23,"location":2,"content":"can build models on and in particular a"},{"from":2901.23,"to":2903.75,"location":2,"content":"lot of what our machine learning models"},{"from":2903.75,"to":2906.03,"location":2,"content":"exploit is how common are different"},{"from":2906.03,"to":2907.8,"location":2,"content":"structures and so we want to know about"},{"from":2907.8,"to":2909.51,"location":2,"content":"the Communists and the frequency of"},{"from":2909.51,"to":2912.33,"location":2,"content":"things but then tree banks gave us"},{"from":2912.33,"to":2915,"location":2,"content":"another big thing which is well lots of"},{"from":2915,"to":2918.24,"location":2,"content":"sentences are ambiguous and what we want"},{"from":2918.24,"to":2921.87,"location":2,"content":"to do is build models that find the"},{"from":2921.87,"to":2924.93,"location":2,"content":"right structure for sentences and if all"},{"from":2924.93,"to":2927.78,"location":2,"content":"you do is have a grammar you have no way"},{"from":2927.78,"to":2930.15,"location":2,"content":"of telling what is the right structure"},{"from":2930.15,"to":2932.67,"location":2,"content":"for ambiguous sentences or you can do is"},{"from":2932.67,"to":2935.25,"location":2,"content":"say hey that sentence with four"},{"from":2935.25,"to":2937.83,"location":2,"content":"prepositional phrases after it that I"},{"from":2937.83,"to":2939.66,"location":2,"content":"showed you earlier it has 14 different"},{"from":2939.66,"to":2942.72,"location":2,"content":"parcels let me show you all of them but"},{"from":2942.72,"to":2947.55,"location":2,"content":"once you have tree Bank examples you can"},{"from":2947.55,"to":2950.16,"location":2,"content":"say this is the right structure for this"},{"from":2950.16,"to":2953.22,"location":2,"content":"sentence in context and so you should be"},{"from":2953.22,"to":2955.29,"location":2,"content":"building a machine learning model which"},{"from":2955.29,"to":2957.72,"location":2,"content":"will recover that structure and if you"},{"from":2957.72,"to":2962.16,"location":2,"content":"don't that you're wrong okay so that's"},{"from":2962.16,"to":2965.73,"location":2,"content":"tree banks so how are we going to do"},{"from":2965.73,"to":2969.39,"location":2,"content":"build dependency parsers well somehow we"},{"from":2969.39,"to":2972.24,"location":2,"content":"want models that can kind of capture"},{"from":2972.24,"to":2975.39,"location":2,"content":"what's the right path and just think"},{"from":2975.39,"to":2977.64,"location":2,"content":"about abstractly you know there's sort"},{"from":2977.64,"to":2979.08,"location":2,"content":"of different things that we can pay"},{"from":2979.08,"to":2981.93,"location":2,"content":"attention to so one thing that we can"},{"from":2981.93,"to":2984.66,"location":2,"content":"pay attention to is the sort of actual"},{"from":2984.66,"to":2987.63,"location":2,"content":"words right discussion of issues that's"},{"from":2987.63,"to":2991.23,"location":2,"content":"a reasonable thing so it's reasonable to"},{"from":2991.23,"to":2994.52,"location":2,"content":"have issues as a dependent of discussion"},{"from":2994.52,"to":2998.04,"location":2,"content":"where you know discussion of outstanding"},{"from":2998.04,"to":2999.84,"location":2,"content":"that sounds weird so you probably don't"},{"from":2999.84,"to":3002.93,"location":2,"content":"want that dependency there's a question"},{"from":3002.93,"to":3005.3,"location":2,"content":"of how far apart words are 
most"},{"from":3005.3,"to":3007.34,"location":2,"content":"dependencies a fairly short distance"},{"from":3007.34,"to":3009.35,"location":2,"content":"they're not all of them are and there's"},{"from":3009.35,"to":3011.66,"location":2,"content":"a question of what's in between if"},{"from":3011.66,"to":3013.46,"location":2,"content":"there's a semicolon in between there"},{"from":3013.46,"to":3016,"location":2,"content":"probably isn't a dependency across herb"},{"from":3016,"to":3018.1,"location":2,"content":"and the other issue is sort of how many"},{"from":3018.1,"to":3020.77,"location":2,"content":"arguments two things take so here we"},{"from":3020.77,"to":3023.08,"location":2,"content":"have was completed if you see the word"},{"from":3023.08,"to":3025.75,"location":2,"content":"was completed you sort of expect that"},{"from":3025.75,"to":3027.67,"location":2,"content":"there'll be a subject before it if"},{"from":3027.67,"to":3029.71,"location":2,"content":"something was completed and it'd be"},{"from":3029.71,"to":3032.59,"location":2,"content":"wrong if there wasn't so you expecting"},{"from":3032.59,"to":3035.02,"location":2,"content":"an argument on that side but on the"},{"from":3035.02,"to":3036.82,"location":2,"content":"other side hand it won't have an object"},{"from":3036.82,"to":3040,"location":2,"content":"after it you won't say the discussion"},{"from":3040,"to":3042.79,"location":2,"content":"was completed the goat that's not a good"},{"from":3042.79,"to":3045.64,"location":2,"content":"sentence right so you won't have a an"},{"from":3045.64,"to":3047.71,"location":2,"content":"object after it so there's sort of"},{"from":3047.71,"to":3049.45,"location":2,"content":"information of that sort and we want to"},{"from":3049.45,"to":3051.61,"location":2,"content":"have our dependency parsers be able to"},{"from":3051.61,"to":3057.85,"location":2,"content":"make use of that structure okay so"},{"from":3057.85,"to":3061.03,"location":2,"content":"effectively what we do when we build a"},{"from":3061.03,"to":3064.03,"location":2,"content":"dependency parser is we're going to say"},{"from":3064.03,"to":3067.48,"location":2,"content":"for each word is is going to be the"},{"from":3067.48,"to":3071.08,"location":2,"content":"dependent of some other word or the root"},{"from":3071.08,"to":3074.77,"location":2,"content":"so this give here is actually the head"},{"from":3074.77,"to":3076.69,"location":2,"content":"of the sentence so it's the dependent of"},{"from":3076.69,"to":3080.28,"location":2,"content":"root the torque is a dependent of gear"},{"from":3080.28,"to":3084.94,"location":2,"content":"is a dependent of torque and so for each"},{"from":3084.94,"to":3088.63,"location":2,"content":"word we want to choose what is a"},{"from":3088.63,"to":3091.45,"location":2,"content":"dependent of and we want to do it in"},{"from":3091.45,"to":3093.76,"location":2,"content":"such a way that the dependencies form a"},{"from":3093.76,"to":3096.58,"location":2,"content":"tree so that means it would be a bad"},{"from":3096.58,"to":3100.54,"location":2,"content":"idea if we made a cycle so if we sort of"},{"from":3100.54,"to":3106.44,"location":2,"content":"said bootstrapping was a dependent of"},{"from":3106.44,"to":3111.1,"location":2,"content":"torque but then we had things sort of"},{"from":3111.1,"to":3114.1,"location":2,"content":"move around so this goes to here but"},{"from":3114.1,"to":3116.05,"location":2,"content":"then talks a dependent of that as all 
in"},{"from":3116.05,"to":3118.18,"location":2,"content":"a cycle that's bad news we don't want an"},{"from":3118.18,"to":3120.94,"location":2,"content":"cycles we want a tree and there's one"},{"from":3120.94,"to":3125.86,"location":2,"content":"final issue which is we don't want"},{"from":3125.86,"to":3128.68,"location":2,"content":"things that is to whether we want to"},{"from":3128.68,"to":3132.07,"location":2,"content":"allow dependencies to cross or not and"},{"from":3132.07,"to":3134.89,"location":2,"content":"this is an example of this so most of"},{"from":3134.89,"to":3138.19,"location":2,"content":"the time dependencies don't cross each"},{"from":3138.19,"to":3142.87,"location":2,"content":"other but sometimes they do and this"},{"from":3142.87,"to":3146.32,"location":2,"content":"example here is actually an instance for"},{"from":3146.32,"to":3149.42,"location":2,"content":"that so I'll give a talk tomorrow"},{"from":3149.42,"to":3153.65,"location":2,"content":"on bootstrapping so we're giving a talk"},{"from":3153.65,"to":3157.07,"location":2,"content":"that's the object and when it's being"},{"from":3157.07,"to":3160.49,"location":2,"content":"given is tomorrow but this talk has a"},{"from":3160.49,"to":3163.61,"location":2,"content":"modifier that's on bootstrapping so we"},{"from":3163.61,"to":3166.58,"location":2,"content":"actually have another dependency here"},{"from":3166.58,"to":3171.23,"location":2,"content":"that crosses that dependency and that's"},{"from":3171.23,"to":3173.15,"location":2,"content":"sort of rare that doesn't happen a ton"},{"from":3173.15,"to":3175.46,"location":2,"content":"in English but it happens sometimes in"},{"from":3175.46,"to":3177.89,"location":2,"content":"some structures like that and so this is"},{"from":3177.89,"to":3181.16,"location":2,"content":"the question of whether what we say is"},{"from":3181.16,"to":3183.83,"location":2,"content":"that as the part of a sentence is"},{"from":3183.83,"to":3185.48,"location":2,"content":"projective if there are no crossing"},{"from":3185.48,"to":3188,"location":2,"content":"dependencies and there's non projective"},{"from":3188,"to":3189.95,"location":2,"content":"if there are crossing dependencies and"},{"from":3189.95,"to":3192.11,"location":2,"content":"most of the time English is projective"},{"from":3192.11,"to":3194,"location":2,"content":"and it's parses of sentences but"},{"from":3194,"to":3196.04,"location":2,"content":"occasionally not and when it's not as"},{"from":3196.04,"to":3197.84,"location":2,"content":"when you kind of have these constituents"},{"from":3197.84,"to":3199.97,"location":2,"content":"that are delayed to the end of the"},{"from":3199.97,"to":3201.71,"location":2,"content":"sentence right you could have said I'll"},{"from":3201.71,"to":3203.6,"location":2,"content":"give a talk on bootstrapping tomorrow"},{"from":3203.6,"to":3206.15,"location":2,"content":"and then it'd be have a projective pars"},{"from":3206.15,"to":3208.34,"location":2,"content":"but if you want to you can kind of delay"},{"from":3208.34,"to":3210.56,"location":2,"content":"that extra modifier and say I'll give a"},{"from":3210.56,"to":3212.66,"location":2,"content":"talk tomorrow on bootstrapping and then"},{"from":3212.66,"to":3218.87,"location":2,"content":"the parse becomes non projective okay so"},{"from":3218.87,"to":3223.21,"location":2,"content":"that's that there are various ways of"},{"from":3223.21,"to":3227.09,"location":2,"content":"doing dependency parsing but 
basically"},{"from":3227.09,"to":3228.71,"location":2,"content":"what I'm going to tell you about today"},{"from":3228.71,"to":3231.73,"location":2,"content":"is this one called transition based or"},{"from":3231.73,"to":3234.14,"location":2,"content":"deterministic dependency parsing and"},{"from":3234.14,"to":3236.57,"location":2,"content":"this is the one that's just been"},{"from":3236.57,"to":3239.75,"location":2,"content":"enormous ly influential in practical"},{"from":3239.75,"to":3242.12,"location":2,"content":"deployments of parsing so when Google"},{"from":3242.12,"to":3244.61,"location":2,"content":"goes off and parses every web page what"},{"from":3244.61,"to":3246.71,"location":2,"content":"they're using is a transition based"},{"from":3246.71,"to":3249.98,"location":2,"content":"parser and so this was a notion of"},{"from":3249.98,"to":3253.61,"location":2,"content":"parsing that was mainly popularized by"},{"from":3253.61,"to":3256.52,"location":2,"content":"this guy Wacom Neve ray is a Swedish"},{"from":3256.52,"to":3260.69,"location":2,"content":"computational linguist and what you do"},{"from":3260.69,"to":3263.84,"location":2,"content":"is it's sort of inspired by shift reduce"},{"from":3263.84,"to":3267.53,"location":2,"content":"parsing so probably in you know see s103"},{"from":3267.53,"to":3270.14,"location":2,"content":"your compilers class or something you"},{"from":3270.14,"to":3272.15,"location":2,"content":"saw a little bit of shift reduce parsing"},{"from":3272.15,"to":3274.76,"location":2,"content":"and this is sort of like a shift reduce"},{"from":3274.76,"to":3278.66,"location":2,"content":"parser apart from when we reduce we"},{"from":3278.66,"to":3281.42,"location":2,"content":"build dependencies instead of"},{"from":3281.42,"to":3282.86,"location":2,"content":"constituents"},{"from":3282.86,"to":3285.11,"location":2,"content":"and this has a lot of very technical"},{"from":3285.11,"to":3287.21,"location":2,"content":"description that doesn't help you at all"},{"from":3287.21,"to":3288.83,"location":2,"content":"to look at in terms of understanding"},{"from":3288.83,"to":3293.54,"location":2,"content":"what shift reduce parser does and here's"},{"from":3293.54,"to":3296.45,"location":2,"content":"a formal description of a transition"},{"from":3296.45,"to":3297.95,"location":2,"content":"based shift reduce parsing which also"},{"from":3297.95,"to":3300.92,"location":2,"content":"doesn't help you at all so instead we"},{"from":3300.92,"to":3303.35,"location":2,"content":"kind of look at this example because"},{"from":3303.35,"to":3305.87,"location":2,"content":"that will hopefully help you so what I"},{"from":3305.87,"to":3309.17,"location":2,"content":"want to do is parse the sentence I ate"},{"from":3309.17,"to":3311.93,"location":2,"content":"fish and you know formally what I have"},{"from":3311.93,"to":3315.23,"location":2,"content":"is I have a way I start there are three"},{"from":3315.23,"to":3317.96,"location":2,"content":"actions I can take and I have a finished"},{"from":3317.96,"to":3322.25,"location":2,"content":"condition for formal pars pars and so"},{"from":3322.25,"to":3325.88,"location":2,"content":"here's what I do so I have a stack which"},{"from":3325.88,"to":3329.99,"location":2,"content":"is on this side and I have a buffer so"},{"from":3329.99,"to":3332.81,"location":2,"content":"the stack is what I have built and the"},{"from":3332.81,"to":3334.52,"location":2,"content":"buffer is all the words in the 
sentence"},{"from":3334.52,"to":3337.37,"location":2,"content":"I haven't dealt with yet so I start the"},{"from":3337.37,"to":3339.38,"location":2,"content":"pars and that's this sort of instruction"},{"from":3339.38,"to":3342.41,"location":2,"content":"here by putting route my route for my"},{"from":3342.41,"to":3344.87,"location":2,"content":"whole sentence onto my stack and my"},{"from":3344.87,"to":3347.42,"location":2,"content":"buffer is the whole sentence and I"},{"from":3347.42,"to":3350.09,"location":2,"content":"haven't found any dependencies yet okay"},{"from":3350.09,"to":3353.24,"location":2,"content":"and so then the actions I can take is to"},{"from":3353.24,"to":3357.56,"location":2,"content":"shift things onto the stack or to do the"},{"from":3357.56,"to":3360.14,"location":2,"content":"equivalent of a reduce where I build"},{"from":3360.14,"to":3364.67,"location":2,"content":"dependencies so starting off I can't"},{"from":3364.67,"to":3366.59,"location":2,"content":"build a dependency because I only have"},{"from":3366.59,"to":3368.78,"location":2,"content":"route on the stack so the only thing I"},{"from":3368.78,"to":3371.21,"location":2,"content":"can do is shift so I can shift I on to"},{"from":3371.21,"to":3374.63,"location":2,"content":"the stack now I could at this point say"},{"from":3374.63,"to":3376.49,"location":2,"content":"let's build a dependency I as a"},{"from":3376.49,"to":3378.59,"location":2,"content":"dependent of route but that would be the"},{"from":3378.59,"to":3381.5,"location":2,"content":"wrong analysis because really the head"},{"from":3381.5,"to":3383.81,"location":2,"content":"of this sentence is 8 so I'm a clever"},{"from":3383.81,"to":3386.99,"location":2,"content":"boy and I shift again and now I have"},{"from":3386.99,"to":3392.39,"location":2,"content":"route I 8 on the stack ok and so at this"},{"from":3392.39,"to":3395.63,"location":2,"content":"point I'm in a position where hey what"},{"from":3395.63,"to":3398.72,"location":2,"content":"I'm gonna do is reductions that build"},{"from":3398.72,"to":3401.57,"location":2,"content":"structure because look I have I ate here"},{"from":3401.57,"to":3405.95,"location":2,"content":"and I want to be able to say that I is"},{"from":3405.95,"to":3410.12,"location":2,"content":"the subject of dependency of 8 and I"},{"from":3410.12,"to":3414.2,"location":2,"content":"will do that by or by doing a reduction"},{"from":3414.2,"to":3416.57,"location":2,"content":"and so what I'm going to do"},{"from":3416.57,"to":3419.84,"location":2,"content":"is the left arc reduction which says"},{"from":3419.84,"to":3423.14,"location":2,"content":"look I'm gonna treat the second from top"},{"from":3423.14,"to":3425.99,"location":2,"content":"thing on the stack as a dependent of the"},{"from":3425.99,"to":3429.08,"location":2,"content":"thing that's on top of the stack and so"},{"from":3429.08,"to":3433.19,"location":2,"content":"I do that and so when I do that I create"},{"from":3433.19,"to":3435.23,"location":2,"content":"the second from the head thing as a"},{"from":3435.23,"to":3438.26,"location":2,"content":"subject dependent of eight and I leave"},{"from":3438.26,"to":3442.25,"location":2,"content":"the head on the stack eight but I sort"},{"from":3442.25,"to":3443.99,"location":2,"content":"of add this dependencies or the"},{"from":3443.99,"to":3448.07,"location":2,"content":"dependencies I've built okay so I do"},{"from":3448.07,"to":3451.55,"location":2,"content":"that now I could immediately 
reduce"},{"from":3451.55,"to":3453.98,"location":2,"content":"again and say eight is a dependent of"},{"from":3453.98,"to":3456.98,"location":2,"content":"root but my sentence is actually I ate"},{"from":3456.98,"to":3460.76,"location":2,"content":"fish so what I want to do is say oh"},{"from":3460.76,"to":3463.1,"location":2,"content":"there's still fish on the buffer so what"},{"from":3463.1,"to":3465.77,"location":2,"content":"I should first do is shift again have"},{"from":3465.77,"to":3468.68,"location":2,"content":"root eight fish in my sentence and then"},{"from":3468.68,"to":3470.69,"location":2,"content":"I'll be able to say look I want to now"},{"from":3470.69,"to":3474.77,"location":2,"content":"build the thing on the top of the stack"},{"from":3474.77,"to":3477.68,"location":2,"content":"as a write dependent of the thing that's"},{"from":3477.68,"to":3479.9,"location":2,"content":"second from top of the stack and so"},{"from":3479.9,"to":3482.06,"location":2,"content":"that's referred to as a right arc move"},{"from":3482.06,"to":3486.05,"location":2,"content":"and so I say right arc and sorry do a"},{"from":3486.05,"to":3488.69,"location":2,"content":"reduction where I've generated a new"},{"from":3488.69,"to":3491.66,"location":2,"content":"dependency and I take the two things"},{"from":3491.66,"to":3493.9,"location":2,"content":"that are on top of the stack and say"},{"from":3493.9,"to":3497.69,"location":2,"content":"fish is a dependent of eight and so"},{"from":3497.69,"to":3500.9,"location":2,"content":"therefore I just keep the head I always"},{"from":3500.9,"to":3503.72,"location":2,"content":"just keep the head on the stack and and"},{"from":3503.72,"to":3506.69,"location":2,"content":"I generate this new arc and so at this"},{"from":3506.69,"to":3509.54,"location":2,"content":"point I'm in the same position I want to"},{"from":3509.54,"to":3512.54,"location":2,"content":"say that this eight is a write dependent"},{"from":3512.54,"to":3515.27,"location":2,"content":"of my root and so I'm again going to do"},{"from":3515.27,"to":3520.07,"location":2,"content":"right arc and make this extra dependency"},{"from":3520.07,"to":3520.67,"location":2,"content":"here"},{"from":3520.67,"to":3523.19,"location":2,"content":"okay so then my finished condition for"},{"from":3523.19,"to":3525.02,"location":2,"content":"having successfully passed the sentence"},{"from":3525.02,"to":3528.08,"location":2,"content":"is my buffer is empty and I just have"},{"from":3528.08,"to":3530.63,"location":2,"content":"root left on my stack because that's"},{"from":3530.63,"to":3532.94,"location":2,"content":"what I sort of said back here that was"},{"from":3532.94,"to":3535.43,"location":2,"content":"buffer is empty is my finished condition"},{"from":3535.43,"to":3540.11,"location":2,"content":"okay so I've passed the sentence so that"},{"from":3540.11,"to":3542.45,"location":2,"content":"worked well but you know I actually had"},{"from":3542.45,"to":3545.93,"location":2,"content":"different choices of when to park when"},{"from":3545.93,"to":3548.51,"location":2,"content":"to shift and when to reduce and I just"},{"from":3548.51,"to":3550.23,"location":2,"content":"miraculously made"},{"from":3550.23,"to":3552.9,"location":2,"content":"the right choice at each point and well"},{"from":3552.9,"to":3555.15,"location":2,"content":"one thing you could do at this point is"},{"from":3555.15,"to":3557.82,"location":2,"content":"say well you could have explored 
every"},{"from":3557.82,"to":3562.74,"location":2,"content":"choice and seen what happened and gone"},{"from":3562.74,"to":3566.34,"location":2,"content":"different parses and I could have but if"},{"from":3566.34,"to":3568.83,"location":2,"content":"that's what I'd done I would have"},{"from":3568.83,"to":3573.48,"location":2,"content":"explored this exponential size tree of"},{"from":3573.48,"to":3576.24,"location":2,"content":"different possible parses and if that"},{"from":3576.24,"to":3578.1,"location":2,"content":"was what I was doing I wouldn't be able"},{"from":3578.1,"to":3580.35,"location":2,"content":"to pass efficiently and indeed that's"},{"from":3580.35,"to":3583.11,"location":2,"content":"not what people did in the 60s 70s and"},{"from":3583.11,"to":3587.73,"location":2,"content":"80s clever people in the 60s said how"},{"from":3587.73,"to":3589.86,"location":2,"content":"rather than doing a crummy search here"},{"from":3589.86,"to":3592.41,"location":2,"content":"we can come up with clever dynamic"},{"from":3592.41,"to":3594.6,"location":2,"content":"programming algorithms and you can"},{"from":3594.6,"to":3596.64,"location":2,"content":"relatively efficiently explore the space"},{"from":3596.64,"to":3600.3,"location":2,"content":"of all possible parses and that was sort"},{"from":3600.3,"to":3601.89,"location":2,"content":"of the mainstay of parsing in those"},{"from":3601.89,"to":3606.51,"location":2,"content":"decades but when y came nefra came along"},{"from":3606.51,"to":3610.65,"location":2,"content":"he said yeah that's true but hey I've"},{"from":3610.65,"to":3614.69,"location":2,"content":"got a clever idea because now it's the"},{"from":3614.69,"to":3618.99,"location":2,"content":"2000s and I know machine learning so"},{"from":3618.99,"to":3622.65,"location":2,"content":"what I could do instead is say I'm at a"},{"from":3622.65,"to":3626.04,"location":2,"content":"particular position in the pars and I'm"},{"from":3626.04,"to":3627.75,"location":2,"content":"gonna build a machine learning"},{"from":3627.75,"to":3629.99,"location":2,"content":"classifier and that machine learning"},{"from":3629.99,"to":3632.76,"location":2,"content":"classifier is going to tell me the next"},{"from":3632.76,"to":3634.92,"location":2,"content":"thing to do it's going to tell me"},{"from":3634.92,"to":3638.94,"location":2,"content":"whether the shift will left our core"},{"from":3638.94,"to":3641.67,"location":2,"content":"right arc so if we're only just sort of"},{"from":3641.67,"to":3643.56,"location":2,"content":"talking about how to build the arrows"},{"from":3643.56,"to":3645.78,"location":2,"content":"they're just three actions shift left"},{"from":3645.78,"to":3648.66,"location":2,"content":"our core right arc if we're also wanting"},{"from":3648.66,"to":3651.03,"location":2,"content":"to put labels on the dependencies and we"},{"from":3651.03,"to":3653.52,"location":2,"content":"have our different labels there are then"},{"from":3653.52,"to":3656.01,"location":2,"content":"sort of to our +1 actions because she"},{"from":3656.01,"to":3659.07,"location":2,"content":"sort of left arc subject or left ugh"},{"from":3659.07,"to":3660.96,"location":2,"content":"object or something like that but anyway"},{"from":3660.96,"to":3663.42,"location":2,"content":"there's a set of actions and so you're"},{"from":3663.42,"to":3665.4,"location":2,"content":"going to build a classifier with machine"},{"from":3665.4,"to":3667.65,"location":2,"content":"learning somehow which will predict 
the"},{"from":3667.65,"to":3670.92,"location":2,"content":"right action and Wykeham me very showed"},{"from":3670.92,"to":3674.28,"location":2,"content":"the sort of slightly surprising fact"},{"from":3674.28,"to":3677.61,"location":2,"content":"that actually you could predict the"},{"from":3677.61,"to":3681.02,"location":2,"content":"correct action to take with high"},{"from":3681.02,"to":3683.55,"location":2,"content":"accuracy so"},{"from":3683.55,"to":3687.78,"location":2,"content":"in the simplest version of this there's"},{"from":3687.78,"to":3689.97,"location":2,"content":"absolutely no search you just run a"},{"from":3689.97,"to":3692.13,"location":2,"content":"classifier at each step and it says what"},{"from":3692.13,"to":3693.51,"location":2,"content":"you should do next is shift and you"},{"from":3693.51,"to":3695.07,"location":2,"content":"shift and then it says what you should"},{"from":3695.07,"to":3696.84,"location":2,"content":"do is left ah can you left arc and you"},{"from":3696.84,"to":3699.81,"location":2,"content":"run that through and he proved wrong he"},{"from":3699.81,"to":3702.06,"location":2,"content":"showed empirically that even doing that"},{"from":3702.06,"to":3704.61,"location":2,"content":"you could pass sentences with high"},{"from":3704.61,"to":3706.56,"location":2,"content":"accuracy now if you want to do some"},{"from":3706.56,"to":3708.12,"location":2,"content":"searching around you can do a bit better"},{"from":3708.12,"to":3711.42,"location":2,"content":"but it's not necessary and we're not"},{"from":3711.42,"to":3715.26,"location":2,"content":"going to do it for our assignment but so"},{"from":3715.26,"to":3717.6,"location":2,"content":"if you're doing this just sort of run"},{"from":3717.6,"to":3720.18,"location":2,"content":"classifier predict action run classifier"},{"from":3720.18,"to":3722.16,"location":2,"content":"but dict action we then get this"},{"from":3722.16,"to":3724.53,"location":2,"content":"wonderful result which you're meant to"},{"from":3724.53,"to":3726.87,"location":2,"content":"explain a bit honest home the assignment"},{"from":3726.87,"to":3729.93,"location":2,"content":"3 is that what we've built is a linear"},{"from":3729.93,"to":3732.87,"location":2,"content":"time parser right because we're going to"},{"from":3732.87,"to":3735.3,"location":2,"content":"be sort of as we chug through a sentence"},{"from":3735.3,"to":3737.16,"location":2,"content":"we're only doing a linear amount of work"},{"from":3737.16,"to":3739.95,"location":2,"content":"for each word and that was sort of an"},{"from":3739.95,"to":3741.51,"location":2,"content":"enormous breakthrough because although"},{"from":3741.51,"to":3743.58,"location":2,"content":"people in the sixties have come up with"},{"from":3743.58,"to":3745.47,"location":2,"content":"these dynamic programming algorithms"},{"from":3745.47,"to":3747.78,"location":2,"content":"dynamic programming algorithms for"},{"from":3747.78,"to":3751.59,"location":2,"content":"sentences were always cubic or worse and"},{"from":3751.59,"to":3753.3,"location":2,"content":"that's not very good if you don't pass"},{"from":3753.3,"to":3755.16,"location":2,"content":"the whole web whereas if you have"},{"from":3755.16,"to":3757.68,"location":2,"content":"something that's linear time that's"},{"from":3757.68,"to":3761.79,"location":2,"content":"really getting you places ok so this is"},{"from":3761.79,"to":3764.34,"location":2,"content":"the conventional way in which this 
was"},{"from":3764.34,"to":3768.3,"location":2,"content":"done was you know we have a stack we"},{"from":3768.3,"to":3770.49,"location":2,"content":"might have already built some structure"},{"from":3770.49,"to":3772.59,"location":2,"content":"working out something's dependent of"},{"from":3772.59,"to":3774.87,"location":2,"content":"something we have a buffer of words that"},{"from":3774.87,"to":3776.22,"location":2,"content":"we don't deal with and we want to"},{"from":3776.22,"to":3778.17,"location":2,"content":"predict the next action so the"},{"from":3778.17,"to":3780.57,"location":2,"content":"conventional way to do this is to say"},{"from":3780.57,"to":3783.21,"location":2,"content":"well we want to have features and well"},{"from":3783.21,"to":3786.06,"location":2,"content":"the kind of features you wanted was so"},{"from":3786.06,"to":3788.04,"location":2,"content":"usually some kind of conjunction and"},{"from":3788.04,"to":3790.8,"location":2,"content":"multiple things so that if the top word"},{"from":3790.8,"to":3797.13,"location":2,"content":"of the stack is good and something else"},{"from":3797.13,"to":3800.46,"location":2,"content":"is true right that the second toss stop"},{"from":3800.46,"to":3803.61,"location":2,"content":"word of the stack is has and it's part"},{"from":3803.61,"to":3805.98,"location":2,"content":"of speeches verb then maybe that's an"},{"from":3805.98,"to":3807.93,"location":2,"content":"indicator of do some action so here it"},{"from":3807.93,"to":3810.27,"location":2,"content":"had these very complex binary indicator"},{"from":3810.27,"to":3813.81,"location":2,"content":"features and you'd build and you"},{"from":3813.81,"to":3816.36,"location":2,"content":"literally have millions of these binary"},{"from":3816.36,"to":3817.22,"location":2,"content":"indicator fee"},{"from":3817.22,"to":3819.92,"location":2,"content":"and you'd feed them into some big"},{"from":3819.92,"to":3822.29,"location":2,"content":"logistic regression or support vector"},{"from":3822.29,"to":3824.15,"location":2,"content":"machine or something like that"},{"from":3824.15,"to":3827,"location":2,"content":"and you would build parsers and these"},{"from":3827,"to":3830.3,"location":2,"content":"parses work pretty well but you sort of"},{"from":3830.3,"to":3833.09,"location":2,"content":"had these sort of very complex hand"},{"from":3833.09,"to":3837.05,"location":2,"content":"engineered binary features so in the"},{"from":3837.05,"to":3838.46,"location":2,"content":"last bit of lecture I want to show you"},{"from":3838.46,"to":3841.22,"location":2,"content":"what people have done in the neural"},{"from":3841.22,"to":3843.77,"location":2,"content":"dependency parsing world but before I do"},{"from":3843.77,"to":3847.73,"location":2,"content":"that let me just explain how you how you"},{"from":3847.73,"to":3851.57,"location":2,"content":"evaluate dependency parsers and that's"},{"from":3851.57,"to":3854.09,"location":2,"content":"actually very simple right so what you"},{"from":3854.09,"to":3857.78,"location":2,"content":"do is well you assume because the human"},{"from":3857.78,"to":3859.87,"location":2,"content":"wrote it down that there's a correct"},{"from":3859.87,"to":3862.7,"location":2,"content":"dependency parse for a sentence she saw"},{"from":3862.7,"to":3865.19,"location":2,"content":"the video lecture like this and so these"},{"from":3865.19,"to":3868.01,"location":2,"content":"are the correct arcs and to evaluate 
our"},{"from":3868.01,"to":3870.74,"location":2,"content":"dependency parser we're simply going to"},{"from":3870.74,"to":3875.06,"location":2,"content":"say which arcs are correct so there are"},{"from":3875.06,"to":3879.53,"location":2,"content":"the gold arcs so there's a gold arc from"},{"from":3879.53,"to":3883.88,"location":2,"content":"two to one she saw subject and there's a"},{"from":3883.88,"to":3887.69,"location":2,"content":"gold arc from 0 to 2 the route of the"},{"from":3887.69,"to":3891.29,"location":2,"content":"sentence these the gold arcs if we"},{"from":3891.29,"to":3892.94,"location":2,"content":"generate a parse and we're going to"},{"from":3892.94,"to":3895.76,"location":2,"content":"propose some arcs as to what is the head"},{"from":3895.76,"to":3898.49,"location":2,"content":"of each word and we're simply going to"},{"from":3898.49,"to":3900.35,"location":2,"content":"count up how many of them are correct"},{"from":3900.35,"to":3903.08,"location":2,"content":"treating each arc individually and there"},{"from":3903.08,"to":3905.09,"location":2,"content":"are two ways we can do that we can"},{"from":3905.09,"to":3909.26,"location":2,"content":"either as we're going to do ignore the"},{"from":3909.26,"to":3913.46,"location":2,"content":"labels and that's then referred to as"},{"from":3913.46,"to":3916.37,"location":2,"content":"the unlabeled attachments score so here"},{"from":3916.37,"to":3919.25,"location":2,"content":"in my example my dependency parser got"},{"from":3919.25,"to":3922.67,"location":2,"content":"most of the arcs right but it got this"},{"from":3922.67,"to":3925.67,"location":2,"content":"one wrong so I say my unlabeled"},{"from":3925.67,"to":3928.4,"location":2,"content":"attachment score is 80% or you can also"},{"from":3928.4,"to":3931.31,"location":2,"content":"look at the labels and then if my parser"},{"from":3931.31,"to":3933.35,"location":2,"content":"wasn't very good at getting the labels"},{"from":3933.35,"to":3935.96,"location":2,"content":"right so I'm only getting 40% and so we"},{"from":3935.96,"to":3938.36,"location":2,"content":"can just count up the number of"},{"from":3938.36,"to":3940.49,"location":2,"content":"dependencies and how many we get correct"},{"from":3940.49,"to":3943.43,"location":2,"content":"and that's our accuracy and in the"},{"from":3943.43,"to":3944.95,"location":2,"content":"assignment you're meant to build a"},{"from":3944.95,"to":3948.11,"location":2,"content":"dependency parser with a certain"},{"from":3948.11,"to":3948.95,"location":2,"content":"accuracy"},{"from":3948.95,"to":3951.09,"location":2,"content":"I forget the number now we're saying"},{"from":3951.09,"to":3953.52,"location":2,"content":"some number 80s something or something"},{"from":3953.52,"to":3957.75,"location":2,"content":"as you're meant to get to okay maybe"},{"from":3957.75,"to":3960.78,"location":2,"content":"I'll skip that okay so now I want to"},{"from":3960.78,"to":3963.57,"location":2,"content":"sort of explain to you just a bit about"},{"from":3963.57,"to":3966.84,"location":2,"content":"neural dependency parsers and why they"},{"from":3966.84,"to":3968.7,"location":2,"content":"are motivated so I mentioned to you"},{"from":3968.7,"to":3972.27,"location":2,"content":"already that the conventional model had"},{"from":3972.27,"to":3976.47,"location":2,"content":"these sort of indicator features of on"},{"from":3976.47,"to":3978.21,"location":2,"content":"the top of the stack is the word 
good"},{"from":3978.21,"to":3979.8,"location":2,"content":"and the second thing on the stack is the"},{"from":3979.8,"to":3984.78,"location":2,"content":"verb has or on the top of the stack is"},{"from":3984.78,"to":3987.51,"location":2,"content":"some other word and the second top is of"},{"from":3987.51,"to":3989.67,"location":2,"content":"some part of speech and that part of"},{"from":3989.67,"to":3991.11,"location":2,"content":"speech has already been joined at the"},{"from":3991.11,"to":3992.49,"location":2,"content":"dependency of another power of speech"},{"from":3992.49,"to":3995.52,"location":2,"content":"people hand engineer these features and"},{"from":3995.52,"to":3997.11,"location":2,"content":"the problems with that was these"},{"from":3997.11,"to":3999.36,"location":2,"content":"features were very sparse each of these"},{"from":3999.36,"to":4003.38,"location":2,"content":"features matches very few things they"},{"from":4003.38,"to":4005.84,"location":2,"content":"matched some configurations but not"},{"from":4005.84,"to":4007.43,"location":2,"content":"others so the features tend to be"},{"from":4007.43,"to":4011.6,"location":2,"content":"incomplete and there are a lot of them"},{"from":4011.6,"to":4014.18,"location":2,"content":"there are commonly millions of features"},{"from":4014.18,"to":4016.04,"location":2,"content":"and so it turned out that actually"},{"from":4016.04,"to":4018.71,"location":2,"content":"computing these features was just"},{"from":4018.71,"to":4020.63,"location":2,"content":"expensive so that you had some"},{"from":4020.63,"to":4023.3,"location":2,"content":"configuration on your second the buffer"},{"from":4023.3,"to":4025.22,"location":2,"content":"and then you wanted to know which of"},{"from":4025.22,"to":4028.85,"location":2,"content":"these features were active for that"},{"from":4028.85,"to":4031.25,"location":2,"content":"stack and buffer configuration and so"},{"from":4031.25,"to":4033.35,"location":2,"content":"you had to compute features from it and"},{"from":4033.35,"to":4035.12,"location":2,"content":"it turned out that conventional"},{"from":4035.12,"to":4037.46,"location":2,"content":"dependency parsers spent most of their"},{"from":4037.46,"to":4040.46,"location":2,"content":"time computing features that went into"},{"from":4040.46,"to":4042.71,"location":2,"content":"the machine learning model rather than"},{"from":4042.71,"to":4045.41,"location":2,"content":"doing the sort of shifting and reducing"},{"from":4045.41,"to":4049.61,"location":2,"content":"of just a pure parser operation and so"},{"from":4049.61,"to":4052.19,"location":2,"content":"that seemed like it left open the"},{"from":4052.19,"to":4054.83,"location":2,"content":"possibility that well what if we could"},{"from":4054.83,"to":4057.47,"location":2,"content":"get rid of all of this stuff and we can"},{"from":4057.47,"to":4061.01,"location":2,"content":"run a neural network directly on the"},{"from":4061.01,"to":4063.47,"location":2,"content":"stack and buffer configuration then"},{"from":4063.47,"to":4066.23,"location":2,"content":"maybe that would allow us to build a"},{"from":4066.23,"to":4069.62,"location":2,"content":"dependency parser which was faster and"},{"from":4069.62,"to":4072.86,"location":2,"content":"suffered less from issues of sparseness"},{"from":4072.86,"to":4075.52,"location":2,"content":"and a conventional dependency parser and"},{"from":4075.52,"to":4078.53,"location":2,"content":"so that was a project that don t 
churn"},{"from":4078.53,"to":4083.72,"location":2,"content":"and me tried to do in 2014 mr build and"},{"from":4083.72,"to":4084.24,"location":2,"content":"new"},{"from":4084.24,"to":4087.72,"location":2,"content":"all dependency parser and you know"},{"from":4087.72,"to":4090.33,"location":2,"content":"effectively what we found is that that's"},{"from":4090.33,"to":4093.57,"location":2,"content":"exactly what you could do so here are"},{"from":4093.57,"to":4096.51,"location":2,"content":"sort of a few steps here so these are"},{"from":4096.51,"to":4101.49,"location":2,"content":"these same UAS and la s so mod Parsa was"},{"from":4101.49,"to":4105.81,"location":2,"content":"working near Esparza that I sort of were"},{"from":4105.81,"to":4109.74,"location":2,"content":"so showing before and if got a UAS on"},{"from":4109.74,"to":4114,"location":2,"content":"this data of eighty nine point eight but"},{"from":4114,"to":4116.34,"location":2,"content":"everybody loved it and the reason they"},{"from":4116.34,"to":4118.56,"location":2,"content":"loved it is it could pass at four"},{"from":4118.56,"to":4121.16,"location":2,"content":"hundred sixty nine sentences a second"},{"from":4121.16,"to":4123.81,"location":2,"content":"there had been other people that worked"},{"from":4123.81,"to":4128.67,"location":2,"content":"out different more complex ways of doing"},{"from":4128.67,"to":4131.01,"location":2,"content":"parsing with so-called graph based"},{"from":4131.01,"to":4132.96,"location":2,"content":"dependency parsers so this is another"},{"from":4132.96,"to":4136.29,"location":2,"content":"famous dependency parser from the 90s so"},{"from":4136.29,"to":4139.35,"location":2,"content":"it was actually you know a bit more"},{"from":4139.35,"to":4141.72,"location":2,"content":"accurate but it was a bit more accurate"},{"from":4141.72,"to":4144.24,"location":2,"content":"at the at the cost of being two orders"},{"from":4144.24,"to":4146.46,"location":2,"content":"of magnitude slower and you know people"},{"from":4146.46,"to":4148.38,"location":2,"content":"have worked on top of that so here's an"},{"from":4148.38,"to":4151.29,"location":2,"content":"an even more complex graph based parser"},{"from":4151.29,"to":4154.53,"location":2,"content":"from the 2000s and well you know it's a"},{"from":4154.53,"to":4156.87,"location":2,"content":"little bit more accurate again but it's"},{"from":4156.87,"to":4160.26,"location":2,"content":"gone even slower okay so what we were"},{"from":4160.26,"to":4164.22,"location":2,"content":"able to show is that using the idea of"},{"from":4164.22,"to":4167.01,"location":2,"content":"instead using a neural network to make"},{"from":4167.01,"to":4171,"location":2,"content":"the decisions of Wykeham neveress style"},{"from":4171,"to":4173.85,"location":2,"content":"shift reduce parser we could produce"},{"from":4173.85,"to":4177.87,"location":2,"content":"something that was almost as accurate as"},{"from":4177.87,"to":4180.48,"location":2,"content":"the very best parsers available at that"},{"from":4180.48,"to":4183.27,"location":2,"content":"time I mean strictly we won over here"},{"from":4183.27,"to":4186.84,"location":2,"content":"and we were a fraction behind on UAS but"},{"from":4186.84,"to":4190.11,"location":2,"content":"you know it was not only just as fast as"},{"from":4190.11,"to":4193.53,"location":2,"content":"never Esparza it was actually faster"},{"from":4193.53,"to":4195.84,"location":2,"content":"than ever Esparza because we didn't 
have"},{"from":4195.84,"to":4198.24,"location":2,"content":"to spend as much time on feature"},{"from":4198.24,"to":4200.64,"location":2,"content":"computation and that's actually almost a"},{"from":4200.64,"to":4203.37,"location":2,"content":"surprising result right it's not that we"},{"from":4203.37,"to":4205.35,"location":2,"content":"didn't have to do anything we had to do"},{"from":4205.35,"to":4207.27,"location":2,"content":"matrix multiplies in our neural network"},{"from":4207.27,"to":4210.54,"location":2,"content":"but it turned out you could do the"},{"from":4210.54,"to":4213.39,"location":2,"content":"matrix multiplies more quickly than the"},{"from":4213.39,"to":4216.03,"location":2,"content":"feature computation that he was doing"},{"from":4216.03,"to":4217.5,"location":2,"content":"even though at the end of the day it was"},{"from":4217.5,"to":4218.01,"location":2,"content":"sort of"},{"from":4218.01,"to":4220.11,"location":2,"content":"up weights that went into a support"},{"from":4220.11,"to":4222.63,"location":2,"content":"vector machine so that was kind of cool"},{"from":4222.63,"to":4224.85,"location":2,"content":"and so the secret was we're going to"},{"from":4224.85,"to":4227.01,"location":2,"content":"make use of distributed representations"},{"from":4227.01,"to":4231.78,"location":2,"content":"like we've already seen for words so for"},{"from":4231.78,"to":4234.48,"location":2,"content":"each word we're going to represent it as"},{"from":4234.48,"to":4237.45,"location":2,"content":"a word embedding like we've already seen"},{"from":4237.45,"to":4241.74,"location":2,"content":"and in particular we are going to make"},{"from":4241.74,"to":4245.28,"location":2,"content":"use of word vectors and use them as they"},{"from":4245.28,"to":4247.38,"location":2,"content":"represent the starting representations"},{"from":4247.38,"to":4250.29,"location":2,"content":"of words in our parser but well if we're"},{"from":4250.29,"to":4251.7,"location":2,"content":"interested in distributed"},{"from":4251.7,"to":4254.28,"location":2,"content":"representations it seemed to us like"},{"from":4254.28,"to":4256.38,"location":2,"content":"maybe you should only have distributed"},{"from":4256.38,"to":4259.95,"location":2,"content":"representations of words maybe it'd also"},{"from":4259.95,"to":4261.21,"location":2,"content":"be good to have distributed"},{"from":4261.21,"to":4263.52,"location":2,"content":"representations of other things so we"},{"from":4263.52,"to":4265.98,"location":2,"content":"had parts of speech like you know nouns"},{"from":4265.98,"to":4268.38,"location":2,"content":"and verbs and adjectives and so on well"},{"from":4268.38,"to":4270.6,"location":2,"content":"some of those parts of speech have more"},{"from":4270.6,"to":4273.54,"location":2,"content":"to do with each other than others I mean"},{"from":4273.54,"to":4278.37,"location":2,"content":"in particular most NLP work uses"},{"from":4278.37,"to":4280.74,"location":2,"content":"fine-grain parts of speech so you don't"},{"from":4280.74,"to":4282.75,"location":2,"content":"only have a part of speech like noun or"},{"from":4282.75,"to":4285.21,"location":2,"content":"verb you have parts of speech like"},{"from":4285.21,"to":4288.51,"location":2,"content":"singular noun versus plural noun and you"},{"from":4288.51,"to":4291.42,"location":2,"content":"have different parts of speech for you"},{"from":4291.42,"to":4295.2,"location":2,"content":"know work works working kind of 
the"},{"from":4295.2,"to":4297.45,"location":2,"content":"different forms of verbs are given"},{"from":4297.45,"to":4301.38,"location":2,"content":"different parts of speech as well so"},{"from":4301.38,"to":4303,"location":2,"content":"there are sort of sets of parts of"},{"from":4303,"to":4305.58,"location":2,"content":"speech labels that kind of clusters so"},{"from":4305.58,"to":4306.87,"location":2,"content":"maybe we could have distributed"},{"from":4306.87,"to":4309.36,"location":2,"content":"representations apart speech then"},{"from":4309.36,"to":4312.51,"location":2,"content":"represent their similarity why not well"},{"from":4312.51,"to":4314.16,"location":2,"content":"if we're going to do that why not just"},{"from":4314.16,"to":4316.56,"location":2,"content":"keep on going and say the dependency"},{"from":4316.56,"to":4319.44,"location":2,"content":"labels they also have a distributed"},{"from":4319.44,"to":4322.41,"location":2,"content":"representation and so we built a"},{"from":4322.41,"to":4325.38,"location":2,"content":"representation that did that so the idea"},{"from":4325.38,"to":4330.99,"location":2,"content":"is that we have in our stack the sort of"},{"from":4330.99,"to":4333.78,"location":2,"content":"the top positions of the stack the first"},{"from":4333.78,"to":4336.3,"location":2,"content":"positions of the buffer and for each of"},{"from":4336.3,"to":4338.97,"location":2,"content":"those positions we have a word and a"},{"from":4338.97,"to":4341.07,"location":2,"content":"part of speech and if we've already"},{"from":4341.07,"to":4344.31,"location":2,"content":"built structure as here we kind of know"},{"from":4344.31,"to":4346.8,"location":2,"content":"about dependency that's already been"},{"from":4346.8,"to":4349.17,"location":2,"content":"built and so we've got a triple for each"},{"from":4349.17,"to":4351.33,"location":2,"content":"position and we're going to"},{"from":4351.33,"to":4354.83,"location":2,"content":"convert all of those into a distributed"},{"from":4354.83,"to":4358.29,"location":2,"content":"representation which we are learning and"},{"from":4358.29,"to":4360.47,"location":2,"content":"we're going to use those distributed"},{"from":4360.47,"to":4366.24,"location":2,"content":"representations to build our parser okay"},{"from":4366.24,"to":4369.23,"location":2,"content":"now for so you know starting from"},{"from":4369.23,"to":4372.18,"location":2,"content":"starting from the next lecture forward"},{"from":4372.18,"to":4375.96,"location":2,"content":"we're going to sort of start using more"},{"from":4375.96,"to":4379.32,"location":2,"content":"complex forms of neural models but for"},{"from":4379.32,"to":4383.25,"location":2,"content":"this model we did it in a sort of a very"},{"from":4383.25,"to":4386.73,"location":2,"content":"simple straightforward way we said well"},{"from":4386.73,"to":4390.63,"location":2,"content":"we could just use exactly the same model"},{"from":4390.63,"to":4393.45,"location":2,"content":"exactly the same parse of structure that"},{"from":4393.45,"to":4396.63,"location":2,"content":"never used right doing those shifts and"},{"from":4396.63,"to":4400.05,"location":2,"content":"left arcs and right arcs the only part"},{"from":4400.05,"to":4401.28,"location":2,"content":"we're going to turn into a neural"},{"from":4401.28,"to":4403.53,"location":2,"content":"network is we're going to have the"},{"from":4403.53,"to":4405.62,"location":2,"content":"decision of what to do 
next"},{"from":4405.62,"to":4408.08,"location":2,"content":"being controlled by our neural network"},{"from":4408.08,"to":4412.32,"location":2,"content":"so our neural network is just a very"},{"from":4412.32,"to":4415.23,"location":2,"content":"simple classifier of the kind that we"},{"from":4415.23,"to":4418.05,"location":2,"content":"were talking about last week so based on"},{"from":4418.05,"to":4422.1,"location":2,"content":"the configuration we create an input"},{"from":4422.1,"to":4425.52,"location":2,"content":"layer which means we sort of taking the"},{"from":4425.52,"to":4428.4,"location":2,"content":"stuff in these boxes and turn and"},{"from":4428.4,"to":4430.71,"location":2,"content":"looking up a vector representation for"},{"from":4430.71,"to":4433.26,"location":2,"content":"each one and concatenating them together"},{"from":4433.26,"to":4436.98,"location":2,"content":"to produce a input representation that's"},{"from":4436.98,"to":4438.84,"location":2,"content":"sort of similar to when we were making"},{"from":4438.84,"to":4441.39,"location":2,"content":"those would window classifiers and we"},{"from":4441.39,"to":4443.22,"location":2,"content":"can concatenate it a bunch of stuff"},{"from":4443.22,"to":4445.68,"location":2,"content":"together so that gives us an our input"},{"from":4445.68,"to":4449.25,"location":2,"content":"layer so from there we put things"},{"from":4449.25,"to":4451.53,"location":2,"content":"through a hidden layer just like last"},{"from":4451.53,"to":4455.28,"location":2,"content":"week we do W X plus B and then put it"},{"from":4455.28,"to":4457.14,"location":2,"content":"through a rel your non-linearity to a"},{"from":4457.14,"to":4459.42,"location":2,"content":"hidden layer and then on top of that"},{"from":4459.42,"to":4462.33,"location":2,"content":"we're simply gonna stick a soft max"},{"from":4462.33,"to":4464.76,"location":2,"content":"output layer so we're multiplying by"},{"from":4464.76,"to":4468.42,"location":2,"content":"another matrix adding another bias term"},{"from":4468.42,"to":4470.73,"location":2,"content":"and then that goes into the soft max"},{"from":4470.73,"to":4472.71,"location":2,"content":"which is going to give a probability"},{"from":4472.71,"to":4475.92,"location":2,"content":"over our actions as to whether shift"},{"from":4475.92,"to":4478.14,"location":2,"content":"left our core right arc or the"},{"from":4478.14,"to":4480.39,"location":2,"content":"corresponding one with labels and then"},{"from":4480.39,"to":4482.16,"location":2,"content":"we're going to use the same kind of"},{"from":4482.16,"to":4484.89,"location":2,"content":"cross entropy loss to say how good"},{"from":4484.89,"to":4487.68,"location":2,"content":"job did we do at guessing the action"},{"from":4487.68,"to":4489.84,"location":2,"content":"that we should have taken according to"},{"from":4489.84,"to":4491.91,"location":2,"content":"the tree Bank powers of the sentence and"},{"from":4491.91,"to":4496.35,"location":2,"content":"so each step of the shift reduce parser"},{"from":4496.35,"to":4498.45,"location":2,"content":"we're making a decision of what to do"},{"from":4498.45,"to":4500.31,"location":2,"content":"next and we're doing it by this"},{"from":4500.31,"to":4503.1,"location":2,"content":"classifier and we're getting a loss to"},{"from":4503.1,"to":4504.3,"location":2,"content":"the extent that we don't give"},{"from":4504.3,"to":4507.81,"location":2,"content":"probability one to the right action 
and"},{"from":4507.81,"to":4510.78,"location":2,"content":"so that's what we did using the tree"},{"from":4510.78,"to":4516.18,"location":2,"content":"Bank we trained up our parser and it was"},{"from":4516.18,"to":4519.72,"location":2,"content":"then able to predict the sentences and"},{"from":4519.72,"to":4524.73,"location":2,"content":"the cool thing the cool thing was that"},{"from":4524.73,"to":4528.12,"location":2,"content":"this had all the good things of never"},{"from":4528.12,"to":4531.87,"location":2,"content":"Esparza but you know by having it use"},{"from":4531.87,"to":4534.18,"location":2,"content":"these dense representations it meant"},{"from":4534.18,"to":4536.1,"location":2,"content":"that we could get greater accuracy and"},{"from":4536.1,"to":4539.46,"location":2,"content":"speed than near Esparza at the same time"},{"from":4539.46,"to":4543.03,"location":2,"content":"so here are sort of some results on that"},{"from":4543.03,"to":4545.4,"location":2,"content":"I mean I already showed you some earlier"},{"from":4545.4,"to":4547.95,"location":2,"content":"results right so this was showing the"},{"from":4547.95,"to":4551.01,"location":2,"content":"fact that you know we're outperforming"},{"from":4551.01,"to":4553.61,"location":2,"content":"these earlier parsers basically but"},{"from":4553.61,"to":4557.49,"location":2,"content":"subsequent to us doing this work people"},{"from":4557.49,"to":4561,"location":2,"content":"at Google these papers here"},{"from":4561,"to":4564.48,"location":2,"content":"by Weiss and and/or they said well this"},{"from":4564.48,"to":4567.3,"location":2,"content":"is pretty cool maybe we can get the"},{"from":4567.3,"to":4569.88,"location":2,"content":"numbers even better if we make our"},{"from":4569.88,"to":4574.23,"location":2,"content":"neural network bigger and deeper and we"},{"from":4574.23,"to":4576.45,"location":2,"content":"spend a lot more time tuning our hyper"},{"from":4576.45,"to":4580.26,"location":2,"content":"parameters sad but true all of these"},{"from":4580.26,"to":4582.15,"location":2,"content":"things help when you're building neural"},{"from":4582.15,"to":4583.74,"location":2,"content":"networks and when you're doing your"},{"from":4583.74,"to":4586.47,"location":2,"content":"final project sometimes the answers and"},{"from":4586.47,"to":4588.66,"location":2,"content":"making the results better is to make it"},{"from":4588.66,"to":4591.33,"location":2,"content":"bigger deeper and spend more time tuning"},{"from":4591.33,"to":4594.24,"location":2,"content":"the hyper parameters they put in beam"},{"from":4594.24,"to":4596.7,"location":2,"content":"search as I sort of mentioned beam"},{"from":4596.7,"to":4599.07,"location":2,"content":"search can really help so in beam search"},{"from":4599.07,"to":4602.88,"location":2,"content":"you know rather than just saying let's"},{"from":4602.88,"to":4605.01,"location":2,"content":"work out what's the best next action do"},{"from":4605.01,"to":4607.05,"location":2,"content":"that one and repeat over you allow"},{"from":4607.05,"to":4609.06,"location":2,"content":"yourself to do a little bit of search"},{"from":4609.06,"to":4610.86,"location":2,"content":"you sort of say let's consider two"},{"from":4610.86,"to":4615.94,"location":2,"content":"actions and explore what happens"},{"from":4615.94,"to":4618.64,"location":2,"content":"humans always agreed on how to build"},{"from":4618.64,"to":4620.91,"location":2,"content":"these trees and if they don't 
work"},{"from":4620.91,"to":4624.99,"location":2,"content":"accurately or agreement of humans so"},{"from":4624.99,"to":4627.79,"location":2,"content":"that's a good question which I haven't"},{"from":4627.79,"to":4630.88,"location":2,"content":"addressed humans don't always agree"},{"from":4630.88,"to":4633.13,"location":2,"content":"there are sort of two reasons they can't"},{"from":4633.13,"to":4635.83,"location":2,"content":"agree fundamentally one is that the"},{"from":4635.83,"to":4638.68,"location":2,"content":"humans sort of mess up right because"},{"from":4638.68,"to":4640.66,"location":2,"content":"human work is doing this aren't perfect"},{"from":4640.66,"to":4643,"location":2,"content":"and the other one is they genuinely"},{"from":4643,"to":4644.2,"location":2,"content":"think that there should be different"},{"from":4644.2,"to":4647.98,"location":2,"content":"structures so you know it depends varies"},{"from":4647.98,"to":4650.23,"location":2,"content":"depending on the circumstances and so on"},{"from":4650.23,"to":4651.97,"location":2,"content":"if you just get humans to parse"},{"from":4651.97,"to":4655.39,"location":2,"content":"sentences and say well what is the"},{"from":4655.39,"to":4657.4,"location":2,"content":"agreement and what they produced you"},{"from":4657.4,"to":4658.99,"location":2,"content":"know maybe you're only getting something"},{"from":4658.99,"to":4662.44,"location":2,"content":"like 92% but you know if you then do an"},{"from":4662.44,"to":4666.07,"location":2,"content":"adjudication phase and you say look at"},{"from":4666.07,"to":4669.64,"location":2,"content":"these differences is one of them right"},{"from":4669.64,"to":4671.35,"location":2,"content":"or wrong there are a lot of them where"},{"from":4671.35,"to":4673.42,"location":2,"content":"you know one of person is effectively"},{"from":4673.42,"to":4674.83,"location":2,"content":"saying are year I goofed"},{"from":4674.83,"to":4677.22,"location":2,"content":"I wasn't paying attention or whatever"},{"from":4677.22,"to":4680.44,"location":2,"content":"and so then what's the residual rate in"},{"from":4680.44,"to":4684.16,"location":2,"content":"which people actually disagree about"},{"from":4684.16,"to":4686.2,"location":2,"content":"possible parses I think that's sort of"},{"from":4686.2,"to":4689.74,"location":2,"content":"more around three percent yeah but there"},{"from":4689.74,"to":4691.69,"location":2,"content":"certainly are cases and that includes"},{"from":4691.69,"to":4693.13,"location":2,"content":"some of the prepositional phrase"},{"from":4693.13,"to":4696.04,"location":2,"content":"attachment ambiguities sometimes there"},{"from":4696.04,"to":4697.99,"location":2,"content":"are multiple attachments that sort of"},{"from":4697.99,"to":4699.55,"location":2,"content":"seem plausible was not really clear"},{"from":4699.55,"to":4701.32,"location":2,"content":"which one is right even though there are"},{"from":4701.32,"to":4703.03,"location":2,"content":"lots of other circumstances where one of"},{"from":4703.03,"to":4710.08,"location":2,"content":"them is very clearly wrong yeah there's"},{"from":4710.08,"to":4712.48,"location":2,"content":"there's still room to do better I mean"},{"from":4712.48,"to":4714.31,"location":2,"content":"at the under labeled attachment score"},{"from":4714.31,"to":4716.08,"location":2,"content":"it's actually Stein get pretty good but"},{"from":4716.08,"to":4719.37,"location":2,"content":"there's still room to do better 
yeah"},{"from":4719.37,"to":4722.95,"location":2,"content":"yeah so beam search a final thing that"},{"from":4722.95,"to":4724.48,"location":2,"content":"they did was that we're not going to"},{"from":4724.48,"to":4725.71,"location":2,"content":"talk about here is this sort of more"},{"from":4725.71,"to":4728.41,"location":2,"content":"global inference to make sure it's"},{"from":4728.41,"to":4733.45,"location":2,"content":"sensible and so that then led to Google"},{"from":4733.45,"to":4735.01,"location":2,"content":"developing these models that they gave"},{"from":4735.01,"to":4737.47,"location":2,"content":"silly names to especially the Parsee"},{"from":4737.47,"to":4743.17,"location":2,"content":"macaws face model of parsing and so yeah"},{"from":4743.17,"to":4745.66,"location":2,"content":"so that then that sort of pushed up the"},{"from":4745.66,"to":4747.76,"location":2,"content":"numbers even further so that they were"},{"from":4747.76,"to":4749.8,"location":2,"content":"sort of getting close to"},{"from":4749.8,"to":4752.2,"location":2,"content":"ninety-five percent unlabeled accuracy"},{"from":4752.2,"to":4754.45,"location":2,"content":"score from these models and actually is"},{"from":4754.45,"to":4757.51,"location":2,"content":"this workers kind of you know deep"},{"from":4757.51,"to":4759.88,"location":2,"content":"learning people like to optimize this"},{"from":4759.88,"to":4762.1,"location":2,"content":"work has continued along in the"},{"from":4762.1,"to":4763.57,"location":2,"content":"intervening two years and the numbers"},{"from":4763.57,"to":4766.96,"location":2,"content":"are soaking a bit higher again but you"},{"from":4766.96,"to":4771.16,"location":2,"content":"know so this actually led to sort of a"},{"from":4771.16,"to":4774.61,"location":2,"content":"new era of sort of better parsers"},{"from":4774.61,"to":4777.46,"location":2,"content":"because sort of effectively this was the"},{"from":4777.46,"to":4781.93,"location":2,"content":"90s the 90s era of parsers that was sort"},{"from":4781.93,"to":4784.81,"location":2,"content":"of we're around ninety percent and then"},{"from":4784.81,"to":4787.33,"location":2,"content":"going into this sort of new generation"},{"from":4787.33,"to":4790.9,"location":2,"content":"of neural transition based dependency"},{"from":4790.9,"to":4793.24,"location":2,"content":"parsers we sort of have gone down that"},{"from":4793.24,"to":4795.55,"location":2,"content":"we've halved that error error rate and"},{"from":4795.55,"to":4797.59,"location":2,"content":"we're now down to sort of about a five"},{"from":4797.59,"to":4800.08,"location":2,"content":"percent error rate yeah I'm basically"},{"from":4800.08,"to":4801.67,"location":2,"content":"out of time now but you know there is"},{"from":4801.67,"to":4803.23,"location":2,"content":"full of the work including here at"},{"from":4803.23,"to":4806.32,"location":2,"content":"Stanford another student Tim Dozier has"},{"from":4806.32,"to":4807.79,"location":2,"content":"some sort of more recent work that's"},{"from":4807.79,"to":4809.68,"location":2,"content":"more accurate than 95 percent right so"},{"from":4809.68,"to":4811.72,"location":2,"content":"it was still going on but I think I'd"},{"from":4811.72,"to":4814.27,"location":2,"content":"better stop here today and that's neural"},{"from":4814.27,"to":4817.14,"location":2,"content":"dependency parsing"}]} \ No newline at end of file diff --git a/bcc-en/6.bcc b/bcc-en/6.bcc new file mode 100644 index 
0000000000000000000000000000000000000000..97fe389ad86e6f630fdfaeb0335fd6ea6e7a35cc --- /dev/null +++ b/bcc-en/6.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":5.21,"to":7.95,"location":2,"content":"Hi, everyone. I'm Abby,"},{"from":7.95,"to":9.54,"location":2,"content":"I'm the head TA for this class,"},{"from":9.54,"to":12.51,"location":2,"content":"and I'm also a PhD student in the Stanford NLP group."},{"from":12.51,"to":14.67,"location":2,"content":"And today I'm gonna be telling you about"},{"from":14.67,"to":17.04,"location":2,"content":"language models and recurrent neural networks."},{"from":17.04,"to":19.98,"location":2,"content":"So, here's an overview of what we're gonna do today."},{"from":19.98,"to":24.89,"location":2,"content":"Today, first, we're going to introduce a new NLP task, that's language modeling,"},{"from":24.89,"to":29.5,"location":2,"content":"and that's going to motivate us to learn about a new family of neural networks,"},{"from":29.5,"to":32.73,"location":2,"content":"that is recurrent neural networks or RNNs."},{"from":32.73,"to":34.44,"location":2,"content":"So, I'd say that these are two of"},{"from":34.44,"to":37.41,"location":2,"content":"the most important ideas you're going to learn for the rest of the course."},{"from":37.41,"to":41.07,"location":2,"content":"So, we're going to be covering some fairly cool material today."},{"from":41.07,"to":44.3,"location":2,"content":"So, let's start off with language modeling."},{"from":44.3,"to":48.77,"location":2,"content":"Language modeling is the task of predicting what word comes next."},{"from":48.77,"to":52.23,"location":2,"content":"So, given this piece of text, \"the students opened their\" blank,"},{"from":52.23,"to":58.37,"location":2,"content":"could anyone shout out a word which you think might be coming next?"},{"from":58.37,"to":59.55,"location":2,"content":"Purpose. [NOISE]."},{"from":59.55,"to":63.84,"location":2,"content":"[OVERLAPPING] Mind, what else? I didn't quite hear them,"},{"from":63.84,"to":66.72,"location":2,"content":"but, uh, yeah, these are all likely things, right?"},{"from":66.72,"to":68.15,"location":2,"content":"So, these are some things which I thought"},{"from":68.15,"to":69.49,"location":2,"content":"students might be opening, uh,"},{"from":69.49,"to":71.2,"location":2,"content":"students open their books, seems likely."},{"from":71.2,"to":73.28,"location":2,"content":"Uh, students open their laptops,"},{"from":73.28,"to":75.04,"location":2,"content":"students open their exams,"},{"from":75.04,"to":76.58,"location":2,"content":"students open their minds, incredibly,"},{"from":76.58,"to":78.7,"location":2,"content":"someone came up with that one just now,"},{"from":78.7,"to":80.33,"location":2,"content":"uh, it's kind of a metaphorical meaning of opening."},{"from":80.33,"to":83.67,"location":2,"content":"So, you are all performing language modeling right now."},{"from":83.67,"to":85.47,"location":2,"content":"And thinking about what word comes next,"},{"from":85.47,"to":87.25,"location":2,"content":"you are being a language model."},{"from":87.25,"to":91.39,"location":2,"content":"So, here's a more formal definition of what a language model is."},{"from":91.39,"to":94.43,"location":2,"content":"Given a sequence of words X1 up to Xt,"},{"from":94.43,"to":97.34,"location":2,"content":"a language model is something that computes"},{"from":97.34,"to":101.2,"location":2,"content":"the probability distribution of the next word, Xt plus 1."},{"from":101.2,"to":104.19,"location":2,"content":"So, a language model comes up with the probability distribution,"},{"from":104.19,"to":109.07,"location":2,"content":"the conditional probability, of what Xt plus 1 is, given the words so far."},{"from":109.07,"to":110.81,"location":2,"content":"And here we're assuming that Xt plus 1"},{"from":110.81,"to":113.96,"location":2,"content":"can be any word w from a fixed vocabulary V."},{"from":113.96,"to":115.52,"location":2,"content":"So we are assuming that there is"},{"from":115.52,"to":118.2,"location":2,"content":"a pre-defined list of words that we're considering."},{"from":118.2,"to":120.14,"location":2,"content":"In this way, you can view language modeling"},{"from":120.14,"to":121.85,"location":2,"content":"as a type of classification task,"},{"from":121.85,"to":124.58,"location":2,"content":"because there's a predefined number of possibilities."},{"from":124.58,"to":129.85,"location":2,"content":"Um, we call a system that does this a language model."},{"from":129.85,"to":132.05,"location":2,"content":"There's an alternative way of thinking"},{"from":132.05,"to":133.72,"location":2,"content":"about a language model as well."},{"from":133.72,"to":135.2,"location":2,"content":"You can think of a language model"},{"from":135.2,"to":139.06,"location":2,"content":"as a system which assigns probability to a piece of text."},{"from":139.06,"to":141.47,"location":2,"content":"So, for example, if we have some piece of text,"},{"from":141.47,"to":143.18,"location":2,"content":"X1 up to X capital T,"},{"from":143.18,"to":145.04,"location":2,"content":"then the probability of this text"},{"from":145.04,"to":147.83,"location":2,"content":"according to the language model can be broken down."},{"from":147.83,"to":149.25,"location":2,"content":"So, just by definition,"},{"from":149.25,"to":151.25,"location":2,"content":"you can say that the probability is equal to"},{"from":151.25,"to":154.53,"location":2,"content":"the product of all of these conditional probabilities."},{"from":154.53,"to":157.38,"location":2,"content":"And, uh, the form inside"},{"from":157.38,"to":160.48,"location":2,"content":"the product is exactly what a language model provides."},{"from":160.48,"to":162.47,"location":2,"content":"So, you can think of these things as somewhat equivalent."},{"from":162.47,"to":164.72,"location":2,"content":"Predicting next words gives you a system"},{"from":164.72,"to":169.27,"location":2,"content":"that can give the probability of a given piece of text."},{"from":169.27,"to":172.75,"location":2,"content":"So, in fact, you use language models every day."},{"from":172.75,"to":176.12,"location":2,"content":"For example, when you're texting on your phone and you're writing a message,"},{"from":176.12,"to":177.61,"location":2,"content":"then most likely, if you have a smartphone,"},{"from":177.61,"to":180.08,"location":2,"content":"it will be predicting what word you might be about to say."},{"from":180.08,"to":181.79,"location":2,"content":"So, if you say, um, I'll meet you at the-"},{"from":181.79,"to":184.25,"location":2,"content":"your phone might suggest perhaps you mean airport or cafe,"},{"from":184.25,"to":186.03,"location":2,"content":"or office, for example."},{"from":186.03,"to":188.91,"location":2,"content":"Another situation in which you use language models every day"},{"from":188.91,"to":191.5,"location":2,"content":"is when you search for something on the internet, for example, Google,"},{"from":191.5,"to":192.83,"location":2,"content":"and you start typing your query,"},{"from":192.83,"to":195.95,"location":2,"content":"then Google tries to complete your query for you, and that's language modeling."},{"from":195.95,"to":200.48,"location":2,"content":"It's predicting what word or words might come next."},{"from":200.48,"to":203.72,"location":2,"content":"So, that's what a language model is,"},{"from":203.72,"to":206.68,"location":2,"content":"and the question is, how would you learn a language model?"},{"from":206.68,"to":209.92,"location":2,"content":"So, if I was to ask that question in the pre-deep learning era,"},{"from":209.92,"to":211.73,"location":2,"content":"which was really only a few years ago,"},{"from":211.73,"to":215,"location":2,"content":"the answer would be, you would learn an n-gram language model."},{"from":215,"to":218.57,"location":2,"content":"So, today first we're going to learn about n-gram language models."},{"from":218.57,"to":221.33,"location":2,"content":"So, before I can tell you what an n-gram language model is,"},{"from":221.33,"to":223.16,"location":2,"content":"you need to know what an n-gram is."},{"from":223.16,"to":227.91,"location":2,"content":"So, by definition, an n-gram is a chunk of n consecutive words."},{"from":227.91,"to":230.4,"location":2,"content":"So, for example, a one-gram or unigram"},{"from":230.4,"to":232.05,"location":2,"content":"is just all of the individual words"},{"from":232.05,"to":235.02,"location":2,"content":"in the sequence: that would be \"the\", \"students\", \"opened\", \"their\"."},{"from":235.02,"to":238.81,"location":2,"content":"A two-gram or bigram would be all of the consecutive chunks of pairs of words,"},{"from":238.81,"to":240.98,"location":2,"content":"\"the students\", \"students opened\", \"opened their\","},{"from":240.98,"to":245.05,"location":2,"content":"and so on for trigrams and four-grams, etc."},
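A quick illustration of that definition, using the running example (a throwaway helper written for these notes):

```python
# An n-gram is just a chunk of n consecutive words.
def ngrams(words, n):
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

words = "the students opened their".split()
print(ngrams(words, 1))   # unigrams: ('the',), ('students',), ...
print(ngrams(words, 2))   # bigrams: ('the', 'students'), ('students', 'opened'), ...
print(ngrams(words, 3))   # trigrams
```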
model"},{"from":248.57,"to":251.03,"location":2,"content":"is that in order to predict what word comes next,"},{"from":251.03,"to":252.81,"location":2,"content":"you're going to collect a bunch of statistics,"},{"from":252.81,"to":254.93,"location":2,"content":"about how frequent different n-grams are,"},{"from":254.93,"to":256.49,"location":2,"content":"from some kind of training data,"},{"from":256.49,"to":258.11,"location":2,"content":"and then you can use those statistics"},{"from":258.11,"to":261.83,"location":2,"content":"to predict what next words might be likely."},{"from":261.83,"to":263.64,"location":2,"content":"Here is some more detail."},{"from":263.64,"to":266.32,"location":2,"content":"So, to make an n-gram language model,"},{"from":266.32,"to":268.49,"location":2,"content":"first you need to make a simplifying assumption,"},{"from":268.49,"to":270.31,"location":2,"content":"and this your assumption."},{"from":270.31,"to":273.35,"location":2,"content":"You say that the next word Xt plus 1"},{"from":273.35,"to":277.54,"location":2,"content":"depends only on the preceding N-1 words."},{"from":277.54,"to":279.9,"location":2,"content":"So, what we're assuming,"},{"from":279.9,"to":281.65,"location":2,"content":"is that the probability distribution,"},{"from":281.65,"to":285.02,"location":2,"content":"the conditional probability of Xt plus 1 given all of the words they follow,"},{"from":285.02,"to":286.16,"location":2,"content":"we're just going to simplify that,"},{"from":286.16,"to":290.49,"location":2,"content":"and say it only depends on the last N-1 words, and that's our assumption."},{"from":290.49,"to":293.95,"location":2,"content":"So, by the definition of conditional probability,"},{"from":293.95,"to":295.6,"location":2,"content":"we can say that this probability,"},{"from":295.6,"to":298.38,"location":2,"content":"is just the ratio of two different probabilities."},{"from":298.38,"to":301.18,"location":2,"content":"So, on the top, you've got the probability of"},{"from":301.18,"to":303.22,"location":2,"content":"a particular n-gram and on the bottom we've"},{"from":303.22,"to":306.19,"location":2,"content":"got the probability of a particular N-1 gram"},{"from":306.19,"to":308.02,"location":2,"content":"This is a little hard to read because of all the superscripts"},{"from":308.02,"to":311.01,"location":2,"content":"but I'm gonna give an example with words on the next slide."},{"from":311.01,"to":315.06,"location":2,"content":"Okay. So, that's the definition of the probability of the next word,"},{"from":315.06,"to":317.14,"location":2,"content":"but the question remains, how do we get all of"},{"from":317.14,"to":319.98,"location":2,"content":"these n-gram and N-1 gram probabilities?"},{"from":319.98,"to":322.3,"location":2,"content":"So, the answer is, we're going to get them by"},{"from":322.3,"to":325.05,"location":2,"content":"counting them in some large corpus of text."},{"from":325.05,"to":326.51,"location":2,"content":"So, we're going to approximate,"},{"from":326.51,"to":329.56,"location":2,"content":"these probabilities just by the count of the number of times that"},{"from":329.56,"to":334.41,"location":2,"content":"these particular n-grams and N-1 grams appeared in our training corpus."},{"from":334.41,"to":337.37,"location":2,"content":"Okay. 
So, here's an example with some words."},{"from":337.37,"to":340.56,"location":2,"content":"Suppose we are trying to learn a 4-gram language model,"},{"from":340.56,"to":342.83,"location":2,"content":"and suppose that we have a piece of text that says,"},{"from":342.83,"to":344.54,"location":2,"content":"\"As the proctor started the clock,"},{"from":344.54,"to":346.1,"location":2,"content":"the students opened their blank\","},{"from":346.1,"to":348.89,"location":2,"content":"and we're trying to predict what word is coming next."},{"from":348.89,"to":351.74,"location":2,"content":"So, because we're learning a 4-gram language model,"},{"from":351.74,"to":355.91,"location":2,"content":"a simplifying assumption is that the next word depends only on the last three words,"},{"from":355.91,"to":357.61,"location":2,"content":"that is, the last N-1 words."},{"from":357.61,"to":361.52,"location":2,"content":"So, we're going to discard all of the context so far except for the last few words,"},{"from":361.52,"to":363.8,"location":2,"content":"which is, \"Students opened their.\""},{"from":363.8,"to":367.62,"location":2,"content":"So, as a reminder, the n-gram language model says that,"},{"from":367.62,"to":368.94,"location":2,"content":"the probability of the next word being,"},{"from":368.94,"to":373.23,"location":2,"content":"some particular word W in the vocabulary is equal to the number of times we saw"},{"from":373.23,"to":375.51,"location":2,"content":"students opened their W divided by the number of"},{"from":375.51,"to":378.65,"location":2,"content":"times we saw students opened their, in the training corpus."},{"from":378.65,"to":381.44,"location":2,"content":"So, let's suppose that in our training corpus,"},{"from":381.44,"to":384.21,"location":2,"content":"we saw the phrase \"students opened their\" 1,000 times."},{"from":384.21,"to":388.34,"location":2,"content":"And suppose that we saw \"students opened their books\" 400 times."},{"from":388.34,"to":392.22,"location":2,"content":"This means that the probability of the next word being books is 0.4."},{"from":392.22,"to":396.81,"location":2,"content":"And uh, similarly, let's suppose that we saw \"students opened their exams\" 100 times,"},{"from":396.81,"to":399.26,"location":2,"content":"this means that the probability of exams given students"},{"from":399.26,"to":401.93,"location":2,"content":"opened their is 0.1. Is there a question?"},{"from":401.93,"to":404.9,"location":2,"content":"[inaudible]."},{"from":404.9,"to":407.01,"location":2,"content":"The question is, does the order of the words matter?"},{"from":407.01,"to":410.34,"location":2,"content":"And the answer is yes, the order of \"students opened their\" does matter."},{"from":410.34,"to":413.19,"location":2,"content":"It's different to \"the students opened.\""},{"from":413.19,"to":416.99,"location":2,"content":"So, the question I want to raise now is,"},{"from":416.99,"to":420.81,"location":2,"content":"was it a good idea for us to discard the proctor context?"},{"from":420.81,"to":423.12,"location":2,"content":"If you look at the actual example that we had,"},{"from":423.12,"to":426.07,"location":2,"content":"the example was as the proctor started the clock,"},{"from":426.07,"to":427.85,"location":2,"content":"the students opened their blank."},{"from":427.85,"to":432.36,"location":2,"content":"So, do we think that books or exams is more likely given the actual context,"},{"from":432.36,"to":434.55,"location":2,"content":"the full context? 
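As a quick sketch of the arithmetic just described, using the hypothetical counts from the lecture's example rather than a real corpus:

```python
# Hypothetical counts taken from the lecture's example, not a real corpus.
count_students_opened_their = 1000   # times the context "students opened their" was seen
count_books = 400                    # times "students opened their books" was seen
count_exams = 100                    # times "students opened their exams" was seen

p_books = count_books / count_students_opened_their   # 0.4
p_exams = count_exams / count_students_opened_their   # 0.1
print(p_books, p_exams)
```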
Yep."},{"from":434.55,"to":435.45,"location":2,"content":"Exams."},{"from":435.45,"to":437.8,"location":2,"content":"Right. Exams is more likely because the proctor and"},{"from":437.8,"to":440.26,"location":2,"content":"the clock heavily imply that it's an exam scenario, so"},{"from":440.26,"to":442.63,"location":2,"content":"they're more likely to be opening the exams than the books,"},{"from":442.63,"to":444.4,"location":2,"content":"unless it's an open book exam."},{"from":444.4,"to":446.83,"location":2,"content":"Uh, but I think, overall, it should be exams."},{"from":446.83,"to":449.89,"location":2,"content":"So, the problem that we're seeing here is that in the training corpus,"},{"from":449.89,"to":451.24,"location":2,"content":"the fact that students were opening"},{"from":451.24,"to":453.99,"location":2,"content":"something means that it's more likely to be books than exams"},{"from":453.99,"to":456.31,"location":2,"content":"because overall, books are more common than exams."},{"from":456.31,"to":458.56,"location":2,"content":"But if we know that the context is,"},{"from":458.56,"to":461.08,"location":2,"content":"the proctor and the clock, then it should be exams."},{"from":461.08,"to":464.24,"location":2,"content":"So, what I'm highlighting here is a problem with our simplifying assumption."},{"from":464.24,"to":465.86,"location":2,"content":"If we throw away too much context,"},{"from":465.86,"to":470.45,"location":2,"content":"then we are not as good at predicting the words as we would be if we kept the context."},{"from":470.45,"to":474.69,"location":2,"content":"Okay. So, that's one problem with n-gram, uh, language models."},{"from":474.69,"to":476.81,"location":2,"content":"Uh, there are some other problems as well."},{"from":476.81,"to":480.47,"location":2,"content":"So, uh, here again is the equation that you saw before."},{"from":480.47,"to":481.88,"location":2,"content":"One problem which we're gonna call"},{"from":481.88,"to":485.46,"location":2,"content":"the sparsity problem is what happens if the number on top,"},{"from":485.46,"to":488.38,"location":2,"content":"the numerator, what if that count is equal to zero."},{"from":488.38,"to":491.21,"location":2,"content":"So, what if for some particular word W,"},{"from":491.21,"to":494.45,"location":2,"content":"the phrase students opened their W never occurred in the data."},{"from":494.45,"to":497.24,"location":2,"content":"So, for example, let's suppose \"students opened their petri dishes\""},{"from":497.24,"to":499.88,"location":2,"content":"is fairly uncommon and it never appears in the data,"},{"from":499.88,"to":504.36,"location":2,"content":"then that means our probability of the next word being petri dishes will be zero."},{"from":504.36,"to":507.39,"location":2,"content":"And this is bad, because it might be uncommon but it is,"},{"from":507.39,"to":509.38,"location":2,"content":"a valid scenario, right?"},{"from":509.38,"to":511.09,"location":2,"content":"If you're a biology student for example."},{"from":511.09,"to":514.09,"location":2,"content":"So, this is a problem and we call it the sparsity problem,"},{"from":514.09,"to":517.79,"location":2,"content":"because the problem is that if we'd never seen an event happen in the training data,"},{"from":517.79,"to":521.49,"location":2,"content":"then our model assigns zero probability to that event."},{"from":521.49,"to":526.41,"location":2,"content":"So, one partial solution to this problem is that maybe we should add a small 
delta,"},{"from":526.41,"to":528.29,"location":2,"content":"small number delta to the count,"},{"from":528.29,"to":530.42,"location":2,"content":"for every word in the vocabulary."},{"from":530.42,"to":533.92,"location":2,"content":"And then this way, every possible word that could come next"},{"from":533.92,"to":536.25,"location":2,"content":"has at least some small probability."},{"from":536.25,"to":539.09,"location":2,"content":"So, petri dishes will have some small probability,"},{"from":539.09,"to":542.41,"location":2,"content":"but then so will all of the other words, which are possibly bad choices."},{"from":542.41,"to":545.58,"location":2,"content":"So, this, uh, technique is called smoothing, because the idea is,"},{"from":545.58,"to":546.95,"location":2,"content":"you're going from a very, uh,"},{"from":546.95,"to":550.05,"location":2,"content":"sparse probability distribution, which is zero almost everywhere,"},{"from":550.05,"to":551.55,"location":2,"content":"with a few spikes where there are,"},{"from":551.55,"to":553.45,"location":2,"content":"uh, n-grams that we've seen,"},{"from":553.45,"to":556.1,"location":2,"content":"to being a smoother probability distribution"},{"from":556.1,"to":559.62,"location":2,"content":"where everything has at least a small probability on it."},{"from":559.62,"to":564.27,"location":2,"content":"So, the second sparsity problem which is possibly worse than the first one is,"},{"from":564.27,"to":568.13,"location":2,"content":"what happens if the number in the denominator is zero?"},{"from":568.13,"to":570.2,"location":2,"content":"So, in our example, that would mean,"},{"from":570.2,"to":574.65,"location":2,"content":"what if we never even saw the trigram \"students opened their\" in the training data."},{"from":574.65,"to":578.48,"location":2,"content":"If that happens, then we can't even calculate this probability distribution at"},{"from":578.48,"to":582.82,"location":2,"content":"all for any word W because we never even saw this context before."},{"from":582.82,"to":585.83,"location":2,"content":"So, a possible solution to this is that"},{"from":585.83,"to":588.45,"location":2,"content":"if you can't find \"students opened their\" in the corpus,"},{"from":588.45,"to":591.94,"location":2,"content":"then you should back off to just conditioning on the last two words,"},{"from":591.94,"to":593.54,"location":2,"content":"rather than the last three words."},{"from":593.54,"to":595.9,"location":2,"content":"So, now you'd be looking at times when you'd seen,"},{"from":595.9,"to":598.46,"location":2,"content":"uh, \"opened their\" and seeing what's come next."},{"from":598.46,"to":601.35,"location":2,"content":"So, this is called back-off because in this failure case,"},{"from":601.35,"to":604.02,"location":2,"content":"when you have no data for your 4-gram language model,"},{"from":604.02,"to":606.02,"location":2,"content":"you're backing off to a trigram language model."},{"from":606.02,"to":612.31,"location":2,"content":"Are there any questions at this point?"},{"from":612.31,"to":617.57,"location":2,"content":"Okay. So, um, another thing to note is that these sparsity problems"},{"from":617.57,"to":622.1,"location":2,"content":"get worse if you increase N. 
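A minimal sketch of add-delta smoothing combined with back-off, under the assumption that `ngram_counts` and `context_counts` are Counters over word tuples holding counts for every order (so backing off always has something to fall back on); all names here are hypothetical, not from the lecture:

```python
from collections import Counter

def ngram_prob(word, context, ngram_counts, context_counts, vocab, delta=0.0):
    """P(word | context) from counts, with add-delta smoothing and back-off.

    context is a tuple of the preceding words, e.g. ("students", "opened", "their").
    Assumes context_counts[()] > 0, i.e. a non-empty training corpus.
    """
    numerator = ngram_counts[context + (word,)] + delta
    denominator = context_counts[context] + delta * len(vocab)
    if denominator == 0 and context:
        # Back-off: this context was never seen at all, so condition on one
        # word fewer, e.g. from "students opened their" down to "opened their".
        return ngram_prob(word, context[1:], ngram_counts, context_counts, vocab, delta)
    return numerator / denominator
```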
If you make N larger in your n-gram language model,"},{"from":622.1,"to":623.87,"location":2,"content":"and you might want to do this, for example,"},{"from":623.87,"to":626.39,"location":2,"content":"you might think, uh, I want to have a larger context,"},{"from":626.39,"to":628.57,"location":2,"content":"so I can pay attention to words that"},{"from":628.57,"to":630.89,"location":2,"content":"happened longer ago and that's gonna make it a better predictor."},{"from":630.89,"to":633.27,"location":2,"content":"So, you might think making N bigger is a good idea."},{"from":633.27,"to":636.41,"location":2,"content":"But the problem is if you do that then the sparsity problems get worse."},{"from":636.41,"to":637.7,"location":2,"content":"Because, let's suppose you say,"},{"from":637.7,"to":639.22,"location":2,"content":"I want a 10-gram language model."},{"from":639.22,"to":640.91,"location":2,"content":"Then the problem is that you're going to be counting"},{"from":640.91,"to":643.48,"location":2,"content":"how often you've seen particular 9-grams and 10-grams."},{"from":643.48,"to":645.49,"location":2,"content":"But 9-grams and 10-grams, there's so many of them,"},{"from":645.49,"to":647.62,"location":2,"content":"that the one you are interested in probably never occurred,"},{"from":647.62,"to":651.15,"location":2,"content":"in your training data, which means that the whole thing becomes dysfunctional."},{"from":651.15,"to":656.17,"location":2,"content":"So, in practice, we usually can't have N much bigger than five."},{"from":656.17,"to":658.49,"location":2,"content":"Okay. So, that was, uh,"},{"from":658.49,"to":660.88,"location":2,"content":"two sparsity problems with n-gram language models."},{"from":660.88,"to":662.77,"location":2,"content":"Here is a problem with storage."},{"from":662.77,"to":664.71,"location":2,"content":"So, if we look at this equation, uh,"},{"from":664.71,"to":666.78,"location":2,"content":"you have to think about what you need to"},{"from":666.78,"to":669.37,"location":2,"content":"store in order to use your n-gram language model."},{"from":669.37,"to":672.02,"location":2,"content":"You need to store this count number,"},{"from":672.02,"to":674.09,"location":2,"content":"for all of the n-grams that you observed in"},{"from":674.09,"to":677.22,"location":2,"content":"the corpus when you were going through the training corpus counting them."},{"from":677.22,"to":679.44,"location":2,"content":"And the problem is that as you increase N,"},{"from":679.44,"to":683.48,"location":2,"content":"then this number of n-grams that you have to store and count increases."},{"from":683.48,"to":687.51,"location":2,"content":"So, another problem with increasing N is that the size of your model,"},{"from":687.51,"to":691.49,"location":2,"content":"or your n-gram model, uh, gets bigger."},{"from":691.49,"to":697.22,"location":2,"content":"Okay, so n-gram Language Models in practice. 
Let's look at an example."},{"from":697.22,"to":702.54,"location":2,"content":"You can actually build a simple trigram Language Model over a 1.7 million word corpus,"},{"from":702.54,"to":704.33,"location":2,"content":"uh, in a few seconds on your laptop."},{"from":704.33,"to":706.14,"location":2,"content":"And in fact, the corpus that I used to do this"},{"from":706.14,"to":707.97,"location":2,"content":"was the same one that you met in assignment one."},{"from":707.97,"to":709.61,"location":2,"content":"It's the Reuters corpus, which is,"},{"from":709.61,"to":711.18,"location":2,"content":"uh, business and financial news."},{"from":711.18,"to":712.38,"location":2,"content":"So, if you want to do this yourself,"},{"from":712.38,"to":715,"location":2,"content":"you can follow that link at the bottom of the slide later."},{"from":715,"to":717,"location":2,"content":"So, uh, this is, uh,"},{"from":717,"to":719.28,"location":2,"content":"something which I ran on my laptop in a few seconds."},{"from":719.28,"to":722.79,"location":2,"content":"So I gave it the context of the bigram \"today the\","},{"from":722.79,"to":726.48,"location":2,"content":"and then I asked the trigram Language Model what word is likely to come next."},{"from":726.48,"to":729.86,"location":2,"content":"So, the Language Model said that the top next most likely words are"},{"from":729.86,"to":733.46,"location":2,"content":"company, bank, price, Italian, emirate, et cetera."},{"from":733.46,"to":737.64,"location":2,"content":"So already just looking at these probabilities that are assigned to these different words,"},{"from":737.64,"to":739.59,"location":2,"content":"uh, you can see that there is a sparsity problem."},{"from":739.59,"to":741.84,"location":2,"content":"For example, the top two most likely words have"},{"from":741.84,"to":744.72,"location":2,"content":"the exact same probability and the reason for that is"},{"from":744.72,"to":746.76,"location":2,"content":"that this number is 4 over 26."},{"from":746.76,"to":748.8,"location":2,"content":"So these are quite small integers, uh,"},{"from":748.8,"to":750.27,"location":2,"content":"meaning that we only saw, uh,"},{"from":750.27,"to":753,"location":2,"content":"\"today the company\" and \"today the bank\" four times each."},{"from":753,"to":754.56,"location":2,"content":"So, uh, this is an example of"},{"from":754.56,"to":757.29,"location":2,"content":"the sparsity problem because overall these are quite low counts,"},{"from":757.29,"to":759.16,"location":2,"content":"we haven't seen that many different, uh,"},{"from":759.16,"to":760.5,"location":2,"content":"versions of this event,"},{"from":760.5,"to":763.88,"location":2,"content":"so we don't have a very granular probability distribution."},{"from":763.88,"to":766.38,"location":2,"content":"But in any case ignoring the sparsity problem,"},{"from":766.38,"to":767.76,"location":2,"content":"I would say that overall,"},{"from":767.76,"to":772.6,"location":2,"content":"these, uh, top suggestions look pretty reasonable."},{"from":772.6,"to":775.67,"location":2,"content":"So you can actually use a Language Model to"},{"from":775.67,"to":778.3,"location":2,"content":"generate text and this is how you would do it."},{"from":778.3,"to":780.74,"location":2,"content":"So let's suppose you have your first two words already, uh,"},{"from":780.74,"to":784.56,"location":2,"content":"you condition on this and you ask your Language Model what's likely to come next."},{"from":784.56,"to":787.3,"location":2,"content":"So then given this 
probability distribution over the words,"},{"from":787.3,"to":788.85,"location":2,"content":"you can sample from it, that is,"},{"from":788.85,"to":791.87,"location":2,"content":"select some word with, you know, its associated probability."},{"from":791.87,"to":794.24,"location":2,"content":"So let's suppose that gives us the word price."},{"from":794.24,"to":797.73,"location":2,"content":"So then price is your next word, and then you just condition on the last two words,"},{"from":797.73,"to":800.38,"location":2,"content":"which in this example is now \"the price\"."},{"from":800.38,"to":803.79,"location":2,"content":"So now you get a new probability distribution and you can continue this process,"},{"from":803.79,"to":807.96,"location":2,"content":"uh, sampling and then conditioning again and sampling."},{"from":807.96,"to":810.15,"location":2,"content":"So if you do this long enough,"},{"from":810.15,"to":811.35,"location":2,"content":"you will get a piece of text,"},{"from":811.35,"to":813.69,"location":2,"content":"so this is the actual text that I got when"},{"from":813.69,"to":817,"location":2,"content":"I ran this generation process with this trigram Language Model."},{"from":817,"to":820.26,"location":2,"content":"So it says, \"Today the price of gold per ton,"},{"from":820.26,"to":823.26,"location":2,"content":"while production of shoe lasts and shoe industry,"},{"from":823.26,"to":826.23,"location":2,"content":"the bank intervened just after it considered and rejected"},{"from":826.23,"to":829.37,"location":2,"content":"an IMF demand to rebuild depleted European stocks,"},{"from":829.37,"to":832.81,"location":2,"content":"September, 30th end primary 76 counts a share.''"},{"from":832.81,"to":835.25,"location":2,"content":"Okay. So, uh, what do we think about this text?"},{"from":835.25,"to":839.2,"location":2,"content":"Do we think it's good? Are we, uh, surprised?"},{"from":839.2,"to":842.37,"location":2,"content":"Um, I would say that in some ways it is good,"},{"from":842.37,"to":844.62,"location":2,"content":"it's kind of surprisingly grammatical, you know,"},{"from":844.62,"to":847.86,"location":2,"content":"it mostly, uh, kind of parses,"},{"from":847.86,"to":849.15,"location":2,"content":"uh, but you would definitely say that it,"},{"from":849.15,"to":850.5,"location":2,"content":"it doesn't really make any sense."},{"from":850.5,"to":852.18,"location":2,"content":"It's pretty incoherent."},{"from":852.18,"to":854.58,"location":2,"content":"And we shouldn't be surprised that it's incoherent I"},{"from":854.58,"to":857.72,"location":2,"content":"think because if you remember this is a trigram Language Model,"},{"from":857.72,"to":860.26,"location":2,"content":"it has a memory of just the last well,"},{"from":860.26,"to":862.63,"location":2,"content":"three or two words depending on how you look at it."},{"from":862.63,"to":864.51,"location":2,"content":"So clearly we need to consider"},{"from":864.51,"to":867.99,"location":2,"content":"more than three words at a time if we want to model language well."},{"from":867.99,"to":872.26,"location":2,"content":"But as we already know, increasing n makes the sparsity problem worse,"},{"from":872.26,"to":878.37,"location":2,"content":"for n-gram Language Models, and it also increases model size. 
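A toy sketch of the sample-then-recondition loop just described; the lecture used the Reuters corpus and the script linked on the slide, whereas the corpus and function names here are made up for illustration:

```python
import random
from collections import Counter, defaultdict

def train_trigram_lm(tokens):
    """Map each bigram context (w1, w2) to a Counter of observed next words."""
    continuations = defaultdict(Counter)
    for w1, w2, w3 in zip(tokens, tokens[1:], tokens[2:]):
        continuations[(w1, w2)][w3] += 1
    return continuations

def generate(continuations, w1, w2, length=15):
    """Repeatedly sample a next word, then re-condition on the last two words."""
    out = [w1, w2]
    for _ in range(length):
        counts = continuations[tuple(out[-2:])]
        if not counts:
            break   # unseen context: a real system would smooth or back off
        words = list(counts)
        weights = [counts[w] for w in words]   # sample proportionally to counts
        out.append(random.choices(words, weights=weights)[0])
    return " ".join(out)

# Tiny made-up corpus standing in for the 1.7 million word Reuters corpus.
corpus = "today the price of gold rose and today the bank said the price fell".split()
print(generate(train_trigram_lm(corpus), "today", "the"))
```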
Is that a question?"},{"from":878.37,"to":880.32,"location":2,"content":"How does it [inaudible] [NOISE]"},{"from":880.32,"to":883.38,"location":2,"content":"So the question is, how does the n-gram Language Model know when to put commas."},{"from":883.38,"to":885.15,"location":2,"content":"Uh, so you can,"},{"from":885.15,"to":890.4,"location":2,"content":"[NOISE] decide that commas and other punctuation are just another kind of word,"},{"from":890.4,"to":891.71,"location":2,"content":"as it were, or token,"},{"from":891.71,"to":894.51,"location":2,"content":"and then, to the Language Model it doesn't really make much difference."},{"from":894.51,"to":897.71,"location":2,"content":"It just uses that as another possible word that can be, um, predicted,"},{"from":897.71,"to":899.45,"location":2,"content":"that's why we've got the weird spacing around the,"},{"from":899.45,"to":901.77,"location":2,"content":"the commas, because they were essentially viewed as separate words."},{"from":901.77,"to":906.13,"location":2,"content":"[NOISE] Okay."},{"from":906.13,"to":909.2,"location":2,"content":"So this course is called NLP with Deep Learning."},{"from":909.2,"to":912.76,"location":2,"content":"So you're probably thinking how do we build a neural Language Model?"},{"from":912.76,"to":915.45,"location":2,"content":"So let's just recap, uh, in case you forgot."},{"from":915.45,"to":917.94,"location":2,"content":"Remember that a Language Model is something that takes"},{"from":917.94,"to":920.76,"location":2,"content":"input, which is a sequence of words X1 up to Xt,"},{"from":920.76,"to":927.47,"location":2,"content":"and then it outputs a probability distribution of what the next word might be, Xt plus 1."},{"from":927.47,"to":932.07,"location":2,"content":"Okay, so let's think about what kind of neural models we've met in this course so far."},{"from":932.07,"to":934.54,"location":2,"content":"Uh, we've already met window-based neural models."},{"from":934.54,"to":936.78,"location":2,"content":"And in lecture three, we saw how you could apply"},{"from":936.78,"to":940.03,"location":2,"content":"a window-based neural model to named entity recognition."},{"from":940.03,"to":943.05,"location":2,"content":"So in that scenario you take some kind of window around the word that you"},{"from":943.05,"to":946.13,"location":2,"content":"care about, which in this example is Paris, and then, uh,"},{"from":946.13,"to":948.78,"location":2,"content":"you get the word embeddings for those, concatenate them, put them through"},{"from":948.78,"to":952.89,"location":2,"content":"some layers, and then you get your decision which is that Paris is a location, not,"},{"from":952.89,"to":955.42,"location":2,"content":"you know, a person or organization."},{"from":955.42,"to":957.9,"location":2,"content":"So that's a recap of what we saw in lecture three."},{"from":957.9,"to":963.79,"location":2,"content":"How would we apply a model like this to language modeling? 
So here's how you would do it."},{"from":963.79,"to":966.93,"location":2,"content":"Here's an example of a fixed-window neural language model."},{"from":966.93,"to":969.42,"location":2,"content":"So, again, we have some kind of context"},{"from":969.42,"to":972.06,"location":2,"content":"which is, \"as the proctor started the clock, the students opened their\","},{"from":972.06,"to":975.23,"location":2,"content":"um, we're trying to guess what word might come next."},{"from":975.23,"to":978.45,"location":2,"content":"So we have to make a similar simplifying assumption to before."},{"from":978.45,"to":981.25,"location":2,"content":"Uh, because it's a fixed size window, uh,"},{"from":981.25,"to":985.5,"location":2,"content":"we have to discard the context except for the window that we're conditioning on."},{"from":985.5,"to":989.07,"location":2,"content":"So let's suppose that our fixed window is of size four."},{"from":989.07,"to":994.39,"location":2,"content":"So what we'll do is similar to the, ah, NER model."},{"from":994.39,"to":998.4,"location":2,"content":"We're going to represent these words with one-hot vectors,"},{"from":998.4,"to":1002.75,"location":2,"content":"and then we'll use those to look up the word embeddings for these words using the,"},{"from":1002.75,"to":1004.89,"location":2,"content":"uh, embedding lookup matrix."},{"from":1004.89,"to":1008.08,"location":2,"content":"So then we get all of our word embeddings e_1, e_2, e_3, e_4,"},{"from":1008.08,"to":1011.27,"location":2,"content":"and then we concatenate them together to get e. We put this through"},{"from":1011.27,"to":1015.22,"location":2,"content":"a linear layer and a nonlinearity function f to get some kind of hidden layer,"},{"from":1015.22,"to":1017.72,"location":2,"content":"and then we put it through another linear layer and"},{"from":1017.72,"to":1021.86,"location":2,"content":"the softmax function and now we have an output probability distribution y hat."},{"from":1021.86,"to":1025.92,"location":2,"content":"And in our case because we're trying to predict what word comes next, ah, ah,"},{"from":1025.92,"to":1028.43,"location":2,"content":"vector y hat will be of length v where v is"},{"from":1028.43,"to":1030.02,"location":2,"content":"the vocabulary size, and it will contain"},{"from":1030.02,"to":1032.56,"location":2,"content":"the probabilities of all the different words in the vocabulary."},{"from":1032.56,"to":1035.6,"location":2,"content":"So here I've represented that as a bar chart where, if you suppose"},{"from":1035.6,"to":1038.69,"location":2,"content":"you've got all of the words listed alphabetically from a to z,"},{"from":1038.69,"to":1041.3,"location":2,"content":"and then there's the different probabilities of the words."},{"from":1041.3,"to":1042.85,"location":2,"content":"So if everything goes well,"},{"from":1042.85,"to":1044.48,"location":2,"content":"then this language model should tell us that"},{"from":1044.48,"to":1047.93,"location":2,"content":"some likely next words are books and laptops, for example."},{"from":1047.93,"to":1049.94,"location":2,"content":"So none of this should be, um,"},{"from":1049.94,"to":1051.77,"location":2,"content":"unfamiliar to you because you saw it all last week."},{"from":1051.77,"to":1056.47,"location":2,"content":"We're just applying a window-based model to a different task, in this case language modeling."},{"from":1056.47,"to":1058.94,"location":2,"content":"Okay, so what are"},{"from":1058.94,"to":1062.24,"location":2,"content":"some good things about this model 
compared to n-gram language models?"},{"from":1062.24,"to":1066.31,"location":2,"content":"So one, ah, advantage I'd say is that there's no sparsity problem."},{"from":1066.31,"to":1069.69,"location":2,"content":"If you remember an n-gram language model has a sparsity problem"},{"from":1069.69,"to":1073.2,"location":2,"content":"which is that if you've never seen a particular n-gram in training, then"},{"from":1073.2,"to":1075.01,"location":2,"content":"you can't assign any probability to it."},{"from":1075.01,"to":1076.44,"location":2,"content":"You don't have any data on it."},{"from":1076.44,"to":1079.34,"location":2,"content":"Whereas at least here you can take any, you know, for example,"},{"from":1079.34,"to":1082.12,"location":2,"content":"4-gram you want and you can feed it into the, ah,"},{"from":1082.12,"to":1083.8,"location":2,"content":"the neural net and it will give you"},{"from":1083.8,"to":1086.15,"location":2,"content":"an output distribution of what it thinks the next word would be."},{"from":1086.15,"to":1090.24,"location":2,"content":"It might not be a good prediction but at least it will, it will run."},{"from":1090.24,"to":1092.93,"location":2,"content":"Another advantage is you don't need to store"},{"from":1092.93,"to":1095.09,"location":2,"content":"all of the observed n-grams that you ever saw."},{"from":1095.09,"to":1097.28,"location":2,"content":"So, uh, this is an advantage; by, uh,"},{"from":1097.28,"to":1099.23,"location":2,"content":"comparison, you just have to store"},{"from":1099.23,"to":1102.15,"location":2,"content":"all of the word vectors for all the words in your vocabulary."},{"from":1102.15,"to":1106.09,"location":2,"content":"Uh, but there are quite a lot of problems with this fixed-window language model."},{"from":1106.09,"to":1109.16,"location":2,"content":"So here are some remaining problems: Uh,"},{"from":1109.16,"to":1111.47,"location":2,"content":"one is that your fixed window is probably too small."},{"from":1111.47,"to":1113.88,"location":2,"content":"No matter how big you make your fixed window, uh,"},{"from":1113.88,"to":1115.64,"location":2,"content":"you're probably going to be losing some kind of"},{"from":1115.64,"to":1118.49,"location":2,"content":"useful context that you would want to use sometimes."},{"from":1118.49,"to":1121.74,"location":2,"content":"And in fact, if you try to enlarge the window size,"},{"from":1121.74,"to":1124.17,"location":2,"content":"then you also have to enlarge the size of your,"},{"from":1124.17,"to":1125.48,"location":2,"content":"uh, weight factor, sorry,"},{"from":1125.48,"to":1127.58,"location":2,"content":"your weight matrix W. Uh,"},{"from":1127.58,"to":1129.59,"location":2,"content":"so the width of W, because you're multiplying it"},{"from":1129.59,"to":1132.11,"location":2,"content":"by e, which is the concatenation of your word embeddings."},{"from":1132.11,"to":1136.39,"location":2,"content":"The width of W grows as you increase the size of your window."},{"from":1136.39,"to":1141.28,"location":2,"content":"So in conclusion, really, your window can never be large enough."},{"from":1141.28,"to":1145.46,"location":2,"content":"Another problem with this model which is more of a subtle point is that"},{"from":1145.46,"to":1148.82,"location":2,"content":"X1 and X2 and really all of the words in the window they're,"},{"from":1148.82,"to":1151.1,"location":2,"content":"uh, multiplied by completely different weights in"},{"from":1151.1,"to":1154.57,"location":2,"content":"W. 
So to demonstrate this you could draw a picture."},{"from":1154.57,"to":1157.61,"location":2,"content":"So the problem is that if you have"},{"from":1157.61,"to":1161.72,"location":2,"content":"your weight matrix W and then you have"},{"from":1161.72,"to":1166.91,"location":2,"content":"your concatenation of embeddings e and we have, uh, four embeddings."},{"from":1166.91,"to":1170.39,"location":2,"content":"So we have e_1, e_2, e_3,"},{"from":1170.39,"to":1173.13,"location":2,"content":"e_4, and you multiply, uh,"},{"from":1173.13,"to":1176.62,"location":2,"content":"the concatenated embeddings by the weight matrix."},{"from":1176.62,"to":1179.12,"location":2,"content":"So really you can see that there are essentially"},{"from":1179.12,"to":1182.45,"location":2,"content":"kind of four sections of the weight matrix,"},{"from":1182.45,"to":1185.57,"location":2,"content":"and the first word embedding e_1 is only"},{"from":1185.57,"to":1188.83,"location":2,"content":"ever multiplied by the weights for it in this section,"},{"from":1188.83,"to":1193.03,"location":2,"content":"and that's completely separate from the weights that multiply e_2 and so forth."},{"from":1193.03,"to":1196.7,"location":2,"content":"So the problem with this is that what you"},{"from":1196.7,"to":1200.06,"location":2,"content":"learn in the weight matrix in one section is not shared with the others."},{"from":1200.06,"to":1203.98,"location":2,"content":"You're kind of learning a lot of similar functions four times."},{"from":1203.98,"to":1207.91,"location":2,"content":"So the reason why we think this is a problem is because there should be a lot of"},{"from":1207.91,"to":1212.13,"location":2,"content":"commonalities in how you process the incoming word embeddings."},{"from":1212.13,"to":1214.88,"location":2,"content":"So what you learn about how to process, you know,"},{"from":1214.88,"to":1218.38,"location":2,"content":"the third embedding, some of it at least should be shared with all of the embeddings."},{"from":1218.38,"to":1221.96,"location":2,"content":"So what I'm saying is it's kind of inefficient that we're learning, uh,"},{"from":1221.96,"to":1224.3,"location":2,"content":"all of these separate weights for these different words"},{"from":1224.3,"to":1229.84,"location":2,"content":"when there's a lot of commonalities between them. Is there a question?"},{"from":1229.84,"to":1231.18,"location":2,"content":"So that's why [inaudible] [NOISE]."},{"from":1231.18,"to":1231.96,"location":2,"content":"Okay-"},{"from":1231.96,"to":1238.28,"location":2,"content":"Yeah, hopefully- hopefully the verbal description is enough."},{"from":1238.28,"to":1242.31,"location":2,"content":"So, in conclusion, I'd say that the biggest problem that we've got with"},{"from":1242.31,"to":1245.28,"location":2,"content":"this fixed-size neural model is that clearly we"},{"from":1245.28,"to":1248.36,"location":2,"content":"need some kind of neural architecture that can process any length input,"},{"from":1248.36,"to":1251.07,"location":2,"content":"because most of the problems here come from the fact that we had to make"},{"from":1251.07,"to":1256.67,"location":2,"content":"this simplifying assumption that there was a fixed window."},{"from":1256.67,"to":1260.04,"location":2,"content":"Okay. 
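A rough sketch of the fixed-window model's forward pass just described, illustrating how each window position gets its own block of columns in W; all sizes and names here are hypothetical, not from the lecture:

```python
import numpy as np

rng = np.random.default_rng(0)
V, d, n_hidden, window = 10_000, 100, 200, 4   # hypothetical sizes

E = rng.normal(size=(V, d)) * 0.01             # embedding matrix
# W has a separate d-wide block of columns per window position, so the
# weights that touch e_1 are never shared with the ones that touch e_2, etc.
W = rng.normal(size=(n_hidden, window * d)) * 0.01
U = rng.normal(size=(V, n_hidden)) * 0.01      # output layer weights

def fixed_window_lm(word_ids):
    """Forward pass of a fixed-window neural LM (biases omitted for brevity)."""
    e = np.concatenate([E[i] for i in word_ids])   # concatenate the 4 embeddings
    hidden = np.tanh(W @ e)                        # linear layer + nonlinearity
    logits = U @ hidden
    exp = np.exp(logits - logits.max())            # stable softmax over the vocab
    return exp / exp.sum()

y_hat = fixed_window_lm([11, 42, 7, 99])   # e.g. ids for "the students opened their"
```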
So this motivates, uh,"},{"from":1260.04,"to":1262.59,"location":2,"content":"us to introduce this new family of neural architectures,"},{"from":1262.59,"to":1265.52,"location":2,"content":"it's called recurrent neural networks or RNNs."},{"from":1265.52,"to":1269.1,"location":2,"content":"So, this is a simplified diagram that shows you the most important,"},{"from":1269.1,"to":1271.32,"location":2,"content":"um, features of an RNN."},{"from":1271.32,"to":1275.07,"location":2,"content":"So we have again an input sequence of X1, X2,"},{"from":1275.07,"to":1280.24,"location":2,"content":"et cetera, but you can assume that this sequence is of any arbitrary length you like."},{"from":1280.24,"to":1284.46,"location":2,"content":"The idea is that you have a sequence of hidden states instead of just having,"},{"from":1284.46,"to":1287.17,"location":2,"content":"for example, one hidden state as we did in the previous model."},{"from":1287.17,"to":1290.94,"location":2,"content":"We have a sequence of hidden states and we have as many of them as we have inputs."},{"from":1290.94,"to":1295.44,"location":2,"content":"And the important thing is that each hidden state ht is computed based"},{"from":1295.44,"to":1300.32,"location":2,"content":"on the previous hidden state and also the input on that step."},{"from":1300.32,"to":1304.05,"location":2,"content":"So the reason why they're called hidden states is because you could think of"},{"from":1304.05,"to":1307.42,"location":2,"content":"this as a single state that's mutating over time."},{"from":1307.42,"to":1310.26,"location":2,"content":"It's kind of like several versions of the same thing."},{"from":1310.26,"to":1313.83,"location":2,"content":"And for this reason, we often call these time-steps, right?"},{"from":1313.83,"to":1315.54,"location":2,"content":"So these steps that go left to right,"},{"from":1315.54,"to":1318.95,"location":2,"content":"we often call them time-steps."},{"from":1318.95,"to":1321.87,"location":2,"content":"So the really important thing is that"},{"from":1321.87,"to":1327.21,"location":2,"content":"the same weight matrix W is applied on every time-step of this RNN."},{"from":1327.21,"to":1331.37,"location":2,"content":"That's what makes us able to process any length input we want."},{"from":1331.37,"to":1333.93,"location":2,"content":"It's because we don't have to have different weights on every step,"},{"from":1333.93,"to":1338.87,"location":2,"content":"because we just apply the exact same transformation on every step."},{"from":1338.87,"to":1342.69,"location":2,"content":"So additionally, you can also have some outputs from the RNN."},{"from":1342.69,"to":1343.99,"location":2,"content":"So these y hats,"},{"from":1343.99,"to":1346.15,"location":2,"content":"these are the outputs on each step."},{"from":1346.15,"to":1348.73,"location":2,"content":"And they're optional because you don't have to compute them"},{"from":1348.73,"to":1351.21,"location":2,"content":"or you can compute them on just some steps and not others."},{"from":1351.21,"to":1354.92,"location":2,"content":"It depends on what you want to use your RNN to do."},{"from":1354.92,"to":1358.26,"location":2,"content":"Okay. 
So that's a simple diagram of an RNN."},{"from":1358.26,"to":1359.85,"location":2,"content":"Uh, here I'm going to give you a bit more detail."},{"from":1359.85,"to":1363.63,"location":2,"content":"So here's how you would apply an RNN to do language modeling."},{"from":1363.63,"to":1368.17,"location":2,"content":"So, uh, again, let's suppose that we have some kind of text so far."},{"from":1368.17,"to":1370.86,"location":2,"content":"My text is only four words long,"},{"from":1370.86,"to":1373.32,"location":2,"content":"but you can assume that it could be any length, right?"},{"from":1373.32,"to":1375.42,"location":2,"content":"It's just short because we can't fit more on the slide."},{"from":1375.42,"to":1378.39,"location":2,"content":"So you have some sequence of text, which could be kind of long."},{"from":1378.39,"to":1382.02,"location":2,"content":"And again, we're going to represent these by some kind of one-hot vectors and"},{"from":1382.02,"to":1386.46,"location":2,"content":"use those to look up the word embeddings from our embedding matrix."},{"from":1386.46,"to":1390.37,"location":2,"content":"So then to compute the first hidden state H1,"},{"from":1390.37,"to":1394.3,"location":2,"content":"we need to compute it based on the previous hidden state and the current input."},{"from":1394.3,"to":1396.62,"location":2,"content":"We already have the current input, that's E1,"},{"from":1396.62,"to":1399.57,"location":2,"content":"but the question is where do we get this first hidden state from?"},{"from":1399.57,"to":1401.16,"location":2,"content":"All right, what comes before H1?"},{"from":1401.16,"to":1404.67,"location":2,"content":"So we often call the initial hidden state H0, uh, yes,"},{"from":1404.67,"to":1408.02,"location":2,"content":"we call it the initial hidden state, and it can either be something that you learn,"},{"from":1408.02,"to":1412.07,"location":2,"content":"like it's a parameter of the network and you learn how to initialize it,"},{"from":1412.07,"to":1415.39,"location":2,"content":"or you can assume something like maybe it's the zero vector."},{"from":1415.39,"to":1420.49,"location":2,"content":"So the formula we use to compute the new hidden state based on the previous one,"},{"from":1420.49,"to":1423.19,"location":2,"content":"and also the current input, is written on the left."},{"from":1423.19,"to":1426.69,"location":2,"content":"So you do a linear transformation on the previous hidden state and on"},{"from":1426.69,"to":1428.64,"location":2,"content":"the current input and then you add some kind of"},{"from":1428.64,"to":1430.92,"location":2,"content":"bias and then put it through a non-linearity,"},{"from":1430.92,"to":1432.99,"location":2,"content":"like for example, the sigmoid function."},{"from":1432.99,"to":1436.67,"location":2,"content":"And that gives you a new hidden state."},{"from":1436.67,"to":1439.47,"location":2,"content":"Okay. So, once you've done that,"},{"from":1439.47,"to":1441.48,"location":2,"content":"then you can compute the next hidden state and you"},{"from":1441.48,"to":1443.85,"location":2,"content":"can keep unrolling the network like this."},{"from":1443.85,"to":1446.03,"location":2,"content":"And that's, uh, yeah,"},{"from":1446.03,"to":1447.45,"location":2,"content":"that's called unrolling because you're kind of"},{"from":1447.45,"to":1450.27,"location":2,"content":"computing each step given the previous one."},{"from":1450.27,"to":1452.16,"location":2,"content":"All right. 
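A minimal sketch of the recurrence just described, with the same W_h, W_e, and bias reused on every time-step; the sizes and variable names are hypothetical, not from the slides:

```python
import numpy as np

rng = np.random.default_rng(0)
d, h = 100, 200                            # hypothetical embedding / hidden sizes
W_h = rng.normal(size=(h, h)) * 0.01       # applied to the previous hidden state
W_e = rng.normal(size=(h, d)) * 0.01       # applied to the current input embedding
b_1 = np.zeros(h)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def unroll(embeddings, h_0=None):
    """Unroll the RNN: the same weights are applied on every time-step."""
    h_t = np.zeros(h) if h_0 is None else h_0   # h_0: zero vector, or a learned parameter
    states = []
    for e_t in embeddings:                      # works for a sequence of any length
        h_t = sigmoid(W_h @ h_t + W_e @ e_t + b_1)
        states.append(h_t)
    return states

states = unroll([rng.normal(size=d) * 0.01 for _ in range(4)])  # "the students opened their"
```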
So finally, if you remember,"},{"from":1452.16,"to":1453.33,"location":2,"content":"we're trying to do language modeling."},{"from":1453.33,"to":1457.53,"location":2,"content":"So we're trying to predict which word should come next after \"the students opened their\"."},{"from":1457.53,"to":1459.87,"location":2,"content":"So on this fourth step over here,"},{"from":1459.87,"to":1461.2,"location":2,"content":"we can use, uh,"},{"from":1461.2,"to":1462.83,"location":2,"content":"the current hidden state, H4,"},{"from":1462.83,"to":1467.43,"location":2,"content":"and put it through a linear layer and put it through a softmax function and then we get"},{"from":1467.43,"to":1472.8,"location":2,"content":"our output distribution Y-hat 4 which is a distribution over the vocabulary."},{"from":1472.8,"to":1474.72,"location":2,"content":"And again, hopefully, we'll get some kind of"},{"from":1474.72,"to":1478.08,"location":2,"content":"sensible estimates for what the next word might be."},{"from":1478.08,"to":1483.21,"location":2,"content":"Any questions at this point? Yep?"},{"from":1483.21,"to":1487.65,"location":2,"content":"Is the- the number of hidden states gonna be the number of words in your input?"},{"from":1487.65,"to":1490.85,"location":2,"content":"The question is, is the number of hidden states the number of words in your input?"},{"from":1490.85,"to":1493.48,"location":2,"content":"Yeah, in this setting here, uh, yes,"},{"from":1493.48,"to":1498.4,"location":2,"content":"or you could say more generally the number of hidden states is the number of inputs. Yep."},{"from":1498.4,"to":1499.95,"location":2,"content":"And just as with the n-gram model,"},{"from":1499.95,"to":1505.59,"location":2,"content":"could we use the output as the input on the next step?"},{"from":1505.59,"to":1507,"location":2,"content":"Yeah, so the question is,"},{"from":1507,"to":1508.65,"location":2,"content":"as with the n-gram language model,"},{"from":1508.65,"to":1510.57,"location":2,"content":"could we use the output as the input on the next step?"},{"from":1510.57,"to":1512.71,"location":2,"content":"And the answer is yes, and I'll show you that in a minute."},{"from":1512.71,"to":1515.7,"location":2,"content":"Any other questions? Yeah."},{"from":1515.7,"to":1517.99,"location":2,"content":"Are you learning the embedding?"},{"from":1517.99,"to":1520.56,"location":2,"content":"The question is, are you learning the embeddings?"},{"from":1520.56,"to":1521.92,"location":2,"content":"Um, that's a choice."},{"from":1521.92,"to":1523.77,"location":2,"content":"You could have the embeddings be for example,"},{"from":1523.77,"to":1527.37,"location":2,"content":"pre-trained embeddings that you download and you use those and they're frozen,"},{"from":1527.37,"to":1528.75,"location":2,"content":"or maybe you could download them,"},{"from":1528.75,"to":1530.19,"location":2,"content":"but then you could fine-tune them."},{"from":1530.19,"to":1532.2,"location":2,"content":"That is, allow them to be changed as parameters of"},{"from":1532.2,"to":1535.17,"location":2,"content":"the network or you could initialize them to,"},{"from":1535.17,"to":1538.56,"location":2,"content":"you know, small, uh, random values and learn them from scratch."},{"from":1538.56,"to":1540.57,"location":2,"content":"Any other questions? 
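Continuing the sketch above (reusing rng, h, and states), the output step being described: a linear layer plus softmax over the vocabulary, where U and b_2 are hypothetical names for the output-layer parameters:

```python
# Continuation of the previous sketch; requires rng, h, and states from above.
V = 10_000                               # hypothetical vocabulary size
U = rng.normal(size=(V, h)) * 0.01       # output projection
b_2 = np.zeros(V)

def softmax(z):
    z = z - z.max()                      # shift for numerical stability
    exp_z = np.exp(z)
    return exp_z / exp_z.sum()

# y-hat_4: put the fourth hidden state through a linear layer and a softmax
# to get a probability distribution over the whole vocabulary.
y_hat_4 = softmax(U @ states[-1] + b_2)
```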
Yeah."},{"from":1540.57,"to":1543.69,"location":2,"content":"So you said you use the same weight matrix,"},{"from":1543.69,"to":1545.49,"location":2,"content":"like, when you do back propagation,"},{"from":1545.49,"to":1548.03,"location":2,"content":"does that mean you only update, like, WE,"},{"from":1548.03,"to":1551.08,"location":2,"content":"or do you update both WH and WE?"},{"from":1551.08,"to":1556.09,"location":2,"content":"So the question is, you say we reuse the matrix, do we update WE and WH, or just one?"},{"from":1556.09,"to":1558.98,"location":2,"content":"So you certainly learn both WE and WH."},{"from":1558.98,"to":1561.41,"location":2,"content":"I suppose I was emphasizing WH more, but yeah,"},{"from":1561.41,"to":1564.09,"location":2,"content":"they're both matrices that are applied repeatedly."},{"from":1564.09,"to":1565.5,"location":2,"content":"There was also a question about back-prop,"},{"from":1565.5,"to":1567.67,"location":2,"content":"but we're going to cover that later in this lecture."},{"from":1567.67,"to":1572.25,"location":2,"content":"Okay, moving on for now. Um, so,"},{"from":1572.25,"to":1577.53,"location":2,"content":"what are some advantages and disadvantages of this RNN language model?"},{"from":1577.53,"to":1583.01,"location":2,"content":"So here are some advantages that we can see in comparison to the fixed window one."},{"from":1583.01,"to":1588.21,"location":2,"content":"So an obvious advantage is that this RNN can process any length of input."},{"from":1588.21,"to":1591.18,"location":2,"content":"Another advantage is that the computation for"},{"from":1591.18,"to":1595.05,"location":2,"content":"step t can in theory use information from many steps back."},{"from":1595.05,"to":1596.73,"location":2,"content":"So in our motivation example,"},{"from":1596.73,"to":1598.65,"location":2,"content":"which was as the proctor started the clock,"},{"from":1598.65,"to":1599.97,"location":2,"content":"the students opened their."},{"from":1599.97,"to":1602.25,"location":2,"content":"We think that proctor and maybe clock are"},{"from":1602.25,"to":1605.34,"location":2,"content":"both pretty important hints for what might be coming up next."},{"from":1605.34,"to":1607.28,"location":2,"content":"So, at least in theory,"},{"from":1607.28,"to":1609.39,"location":2,"content":"the hidden state at the end"},{"from":1609.39,"to":1615.35,"location":2,"content":"can have access to the information from the input from many steps ago."},{"from":1615.35,"to":1619.79,"location":2,"content":"Another advantage is that the model size doesn't increase for longer inputs."},{"from":1619.79,"to":1622.48,"location":2,"content":"So, uh, the size of the model is actually fixed."},{"from":1622.48,"to":1625.01,"location":2,"content":"It's just WH and WE,"},{"from":1625.01,"to":1629.4,"location":2,"content":"and then also the biases and also the embedding matrix, if you're counting that."},{"from":1629.4,"to":1633,"location":2,"content":"None of those get bigger if you want to apply it to more,"},{"from":1633,"to":1638.03,"location":2,"content":"uh, longer inputs because you just apply the same weights repeatedly."},{"from":1638.03,"to":1643.99,"location":2,"content":"And another advantage is that you have the same weights applied on every time-step."},{"from":1643.99,"to":1649.42,"location":2,"content":"So I said this thing before about how the fixed-size window neural model,"},{"from":1649.42,"to":1651.72,"location":2,"content":"it was less efficient because it was 
applying"},{"from":1651.72,"to":1654.27,"location":2,"content":"different weights of the weight matrix to the different,"},{"from":1654.27,"to":1655.9,"location":2,"content":"uh, words in the window."},{"from":1655.9,"to":1658.47,"location":2,"content":"And the advantage about this RNN is that it's"},{"from":1658.47,"to":1661.65,"location":2,"content":"applying the exact same transformation to each of the inputs."},{"from":1661.65,"to":1665.84,"location":2,"content":"So this means that if it learns a good way to process one input,"},{"from":1665.84,"to":1668.01,"location":2,"content":"that is applied to every input in the sequence."},{"from":1668.01,"to":1671.48,"location":2,"content":"So you can see it as more efficient in that way."},{"from":1671.48,"to":1674.81,"location":2,"content":"Okay, so what are the disadvantages of this model?"},{"from":1674.81,"to":1678.27,"location":2,"content":"One is that recurrent computation is pretty slow."},{"from":1678.27,"to":1679.99,"location":2,"content":"Uh, as you saw before,"},{"from":1679.99,"to":1683.87,"location":2,"content":"you have to compute the hidden state based on the previous hidden state."},{"from":1683.87,"to":1686.92,"location":2,"content":"So this means that you can't compute all of the hidden states in parallel."},{"from":1686.92,"to":1688.66,"location":2,"content":"You have to compute them in sequence."},{"from":1688.66,"to":1693.12,"location":2,"content":"So, especially if you're trying to compute an RNN over a pretty long sequence of inputs,"},{"from":1693.12,"to":1696.66,"location":2,"content":"this means that the RNN can be pretty slow to compute."},{"from":1696.66,"to":1700.42,"location":2,"content":"Another disadvantage of RNNs is that it turns out,"},{"from":1700.42,"to":1704.17,"location":2,"content":"in practice, it's quite difficult to access information from many steps back."},{"from":1704.17,"to":1706.29,"location":2,"content":"So even though I said we should be able to remember about"},{"from":1706.29,"to":1708.93,"location":2,"content":"the proctor and the clock and use that to predict exams rather than books,"},{"from":1708.93,"to":1710.43,"location":2,"content":"it turns out that RNNs,"},{"from":1710.43,"to":1712.47,"location":2,"content":"at least the ones that I've presented in this lecture,"},{"from":1712.47,"to":1715.31,"location":2,"content":"are not as good at that as you would think."},{"from":1715.31,"to":1719.3,"location":2,"content":"Um, we're gonna learn more about both of these disadvantages later in the course,"},{"from":1719.3,"to":1722.61,"location":2,"content":"and we're going to learn something about how you can try to fix them."},{"from":1722.61,"to":1726.9,"location":2,"content":"Have we gotten any questions at this point? 
Yep."},{"from":1726.9,"to":1728.01,"location":2,"content":"Why do we assume that WH are the same?"},{"from":1728.01,"to":1731.27,"location":2,"content":"Sorry, can you speak up?"},{"from":1731.27,"to":1735.9,"location":2,"content":"Why do we assume that the WH should be the same?"},{"from":1735.9,"to":1739.63,"location":2,"content":"So the question is, why should you assume that the WH are the same?"},{"from":1739.63,"to":1741.45,"location":2,"content":"I suppose, it's not exactly an assumption,"},{"from":1741.45,"to":1744.39,"location":2,"content":"it's more a deliberate decision in the design of an RNN."},{"from":1744.39,"to":1746.46,"location":2,"content":"So, an RNN is by definition,"},{"from":1746.46,"to":1750.45,"location":2,"content":"a network where you apply the exact same weights on every step."},{"from":1750.45,"to":1753.8,"location":2,"content":"So, I suppose the question why do you assume maybe should be,"},{"from":1753.8,"to":1755.22,"location":2,"content":"why is that a good idea?"},{"from":1755.22,"to":1757.52,"location":2,"content":"Um, so I spoke a little bit about why it's a good idea,"},{"from":1757.52,"to":1758.69,"location":2,"content":"and this list of advantages,"},{"from":1758.69,"to":1764.56,"location":2,"content":"I suppose, are the reasons why you'd want to do that. Does that answer your question?"},{"from":1764.56,"to":1769.02,"location":2,"content":"Open their books, right? If you assume that WH are the same,"},{"from":1769.02,"to":1771.42,"location":2,"content":"you mean that like, uh,"},{"from":1771.42,"to":1774.66,"location":2,"content":"Markov chain, it's like a Markov chain."},{"from":1774.66,"to":1777.78,"location":2,"content":"Uh, the trans-, uh,"},{"from":1777.78,"to":1782.95,"location":2,"content":"transition probability for the, um, word \"opened\","},{"from":1782.95,"to":1784.89,"location":2,"content":"they are the same,"},{"from":1784.89,"to":1790.94,"location":2,"content":"but actually the Markov chain."},{"from":1790.94,"to":1796.54,"location":2,"content":"The model, [inaudible] the transition probability for that is the same,"},{"from":1796.54,"to":1800.89,"location":2,"content":"so [inaudible] probability,"},{"from":1800.89,"to":1807.11,"location":2,"content":"it- it's just an approximation but it's another test."},{"from":1807.11,"to":1808.24,"location":2,"content":"Okay. So I think that [OVERLAPPING]"},{"from":1808.24,"to":1810.81,"location":2,"content":"If you assume WH could be the same,"},{"from":1810.81,"to":1814.72,"location":2,"content":"it's good because you reduce the number of parameters,"},{"from":1814.72,"to":1820.56,"location":2,"content":"but this is just an, this is just an approximation."},{"from":1820.56,"to":1823.41,"location":2,"content":"The underlying transition, uh,"},{"from":1823.41,"to":1825.66,"location":2,"content":"probability, it shouldn't be the same. Especially [OVERLAPPING]"},{"from":1825.66,"to":1828.84,"location":2,"content":"Okay. 
Um, so I think the question is saying that given the- these"},{"from":1828.84,"to":1830.54,"location":2,"content":"words, \"the students opened their\","},{"from":1830.54,"to":1832.49,"location":2,"content":"are all different and they're happening in different contexts,"},{"from":1832.49,"to":1835.85,"location":2,"content":"then why should we be applying the same transformation each time?"},{"from":1835.85,"to":1837.44,"location":2,"content":"So that's a- that's a good question."},{"from":1837.44,"to":1841.67,"location":2,"content":"I think, uh, the idea is that you are learning a general function, not just, you know,"},{"from":1841.67,"to":1843.54,"location":2,"content":"how to deal with \"students\","},{"from":1843.54,"to":1846.09,"location":2,"content":"the one word \"students\", in this one context."},{"from":1846.09,"to":1848.52,"location":2,"content":"We're trying to learn a general function of how you"},{"from":1848.52,"to":1851.07,"location":2,"content":"should deal with a word given the words so far."},{"from":1851.07,"to":1855.09,"location":2,"content":"You're trying to learn a general representation of language and context so far,"},{"from":1855.09,"to":1857.06,"location":2,"content":"which is indeed a very difficult problem."},{"from":1857.06,"to":1860.17,"location":2,"content":"Um, I think you also mentioned something about an approximation."},{"from":1860.17,"to":1861.78,"location":2,"content":"Uh, another thing to note is that all of"},{"from":1861.78,"to":1864.57,"location":2,"content":"the hidden states are vectors, they're not just single numbers, right?"},{"from":1864.57,"to":1866.67,"location":2,"content":"They are vectors of length, I don't know, 500 or something?"},{"from":1866.67,"to":1869.61,"location":2,"content":"So they have quite a large capacity to hold lots of information about"},{"from":1869.61,"to":1873.53,"location":2,"content":"different things in all of their different, um, positions."},{"from":1873.53,"to":1875.63,"location":2,"content":"So, I think the idea is that you can"},{"from":1875.63,"to":1878.26,"location":2,"content":"store a lot of different information in different contexts,"},{"from":1878.26,"to":1879.83,"location":2,"content":"in different parts of the hidden state,"},{"from":1879.83,"to":1881.96,"location":2,"content":"but it is indeed an approximation and there is"},{"from":1881.96,"to":1884.58,"location":2,"content":"some kind of limit to how much information you can store."},{"from":1884.58,"to":1886.85,"location":2,"content":"Okay, any other questions? 
Yes."},{"from":1886.85,"to":1889.41,"location":2,"content":"Since you can kinda process any length of input,"},{"from":1889.41,"to":1891.13,"location":2,"content":"what length do you use during your training?"},{"from":1891.13,"to":1895.04,"location":2,"content":"And does the length you use for training affect WH?"},{"from":1895.04,"to":1899.36,"location":2,"content":"Okay, so, the question is, given that you can have any length input,"},{"from":1899.36,"to":1901.95,"location":2,"content":"what length is the input during training?"},{"from":1901.95,"to":1904.18,"location":2,"content":"So, I suppose in practice,"},{"from":1904.18,"to":1906.51,"location":2,"content":"you choose how long the inputs are in"},{"from":1906.51,"to":1909.63,"location":2,"content":"training either based on what your data is or maybe based on,"},{"from":1909.63,"to":1912.62,"location":2,"content":"uh, your efficiency concerns so maybe you make it artificially"},{"from":1912.62,"to":1915.9,"location":2,"content":"shorter by chopping it up. Um, what was the other question?"},{"from":1915.9,"to":1918.36,"location":2,"content":"Uh, does WH depend on that?"},{"from":1918.36,"to":1921.26,"location":2,"content":"Okay. So the question was, does WH depend on the length you used?"},{"from":1921.26,"to":1924.08,"location":2,"content":"So, no, and that's one of the good things in the advantages list."},{"from":1924.08,"to":1927.16,"location":2,"content":"It's that the model size doesn't increase for longer input,"},{"from":1927.16,"to":1929.04,"location":2,"content":"because we just unroll the RNN"},{"from":1929.04,"to":1931.24,"location":2,"content":"applying the same weights again and again for as long as we'd like."},{"from":1931.24,"to":1933.93,"location":2,"content":"There's no need to have more weights just because you have a longer input."},{"from":1933.93,"to":1936.8,"location":2,"content":"[NOISE] Yeah."},{"from":1936.8,"to":1944.23,"location":2,"content":"So how the ratios that you mentioned are [inaudible] the number of words."},{"from":1944.23,"to":1948.4,"location":2,"content":"[NOISE] Are you asking about capital E or the lowercase E?"},{"from":1948.4,"to":1949.48,"location":2,"content":"Uh, lowercase E."},{"from":1949.48,"to":1950.79,"location":2,"content":"Okay. So, the question is,"},{"from":1950.79,"to":1952.89,"location":2,"content":"how do we choose the dimension of the lowercase Es?"},{"from":1952.89,"to":1954.3,"location":2,"content":"Uh, so, you could, for example,"},{"from":1954.3,"to":1957.12,"location":2,"content":"assume that those are just pre-trained word vectors like the ones that you,"},{"from":1957.12,"to":1958.82,"location":2,"content":"uh, used in assignment one."},{"from":1958.82,"to":1959.71,"location":2,"content":"More like word2vec."},{"from":1959.71,"to":1961.14,"location":2,"content":"Yeah. For example, word2vec,"},{"from":1961.14,"to":1962.61,"location":2,"content":"and you just download them and use them,"},{"from":1962.61,"to":1964.38,"location":2,"content":"or maybe you learn them from scratch, in which case,"},{"from":1964.38,"to":1966.93,"location":2,"content":"you decide at the beginning of training how big you want those vectors to be."},{"from":1966.93,"to":1969.21,"location":2,"content":"[NOISE] Okay. 
I'm gonna move on for now."},{"from":1969.21,"to":1974.89,"location":2,"content":"[NOISE] So, we've learned what an RNN language model is and we've learned how you would,"},{"from":1974.89,"to":1976.85,"location":2,"content":"uh, run one forward, but the question remains,"},{"from":1976.85,"to":1979.08,"location":2,"content":"how would you train an RNN language model?"},{"from":1979.08,"to":1982.23,"location":2,"content":"How would you learn it? [NOISE]"},{"from":1982.23,"to":1983.85,"location":2,"content":"So, as always, in machine learning,"},{"from":1983.85,"to":1986.67,"location":2,"content":"our answer starts with, you're going to get a big corpus of text,"},{"from":1986.67,"to":1991.23,"location":2,"content":"and we're gonna call that just a sequence of words X1 up to X capital T. So,"},{"from":1991.23,"to":1995.12,"location":2,"content":"you feed the sequence of words into the RNN language model, and then,"},{"from":1995.12,"to":1999.62,"location":2,"content":"the idea is that you compute the output distribution Y-hat T for every step T. So,"},{"from":1999.62,"to":2001.7,"location":2,"content":"I know that the picture I showed on the previous, uh,"},{"from":2001.7,"to":2003.56,"location":2,"content":"slide [NOISE] only showed us doing this on the last step,"},{"from":2003.56,"to":2006.14,"location":2,"content":"but the idea is, you would actually compute this on every step."},{"from":2006.14,"to":2008.42,"location":2,"content":"So, this means that you're actually predicting"},{"from":2008.42,"to":2011,"location":2,"content":"the probability of the next word on every step."},{"from":2011,"to":2013.13,"location":2,"content":"[NOISE] Okay."},{"from":2013.13,"to":2015.52,"location":2,"content":"So, once you've done that, then you can define the loss function,"},{"from":2015.52,"to":2017.12,"location":2,"content":"and this should be familiar to you by now."},{"from":2017.12,"to":2019.19,"location":2,"content":"Uh, this is the cross-entropy between [NOISE]"},{"from":2019.19,"to":2023.91,"location":2,"content":"our predicted probability distribution Y-hat T and the true, uh,"},{"from":2023.91,"to":2027.26,"location":2,"content":"distribution, which is Y-hat- sorry, just YT,"},{"from":2027.26,"to":2029.57,"location":2,"content":"which is a one-hot vector, uh,"},{"from":2029.57,"to":2031.06,"location":2,"content":"representing the true next [NOISE] word,"},{"from":2031.06,"to":2032.49,"location":2,"content":"which is XT plus one."},{"from":2032.49,"to":2034.49,"location":2,"content":"So, as you've seen before, this, uh,"},{"from":2034.49,"to":2037.1,"location":2,"content":"cross-entropy [NOISE] between those two vectors can be written"},{"from":2037.1,"to":2040.64,"location":2,"content":"also as a negative log probability."},{"from":2040.64,"to":2045.63,"location":2,"content":"And then, lastly, if you average this cross-entropy loss across every step, uh,"},{"from":2045.63,"to":2048.74,"location":2,"content":"every time step T in the corpus, then,"},{"from":2048.74,"to":2051.8,"location":2,"content":"uh, this gives you your overall loss for the entire training set."},{"from":2051.8,"to":2056.36,"location":2,"content":"[NOISE] Okay."},{"from":2056.36,"to":2058.47,"location":2,"content":"So, just to make that even more clear with a picture,"},{"from":2058.47,"to":2060.08,"location":2,"content":"uh, suppose that our corpus is,"},{"from":2060.08,"to":2061.37,"location":2,"content":"the students open their exams,"},{"from":2061.37,"to":2063.02,"location":2,"content":"et cetera, and it goes on for a 
long time."},{"from":2063.02,"to":2064.55,"location":2,"content":"Then, what we'd be doing is,"},{"from":2064.55,"to":2066.98,"location":2,"content":"we'd be running our RNN over this text, and then,"},{"from":2066.98,"to":2070.53,"location":2,"content":"on every step, we would be predicting the probability [NOISE] distribution Y-hats,"},{"from":2070.53,"to":2071.78,"location":2,"content":"and then, from each of those,"},{"from":2071.78,"to":2073.31,"location":2,"content":"you can calculate what your loss is,"},{"from":2073.31,"to":2076.4,"location":2,"content":"which is the JT, and then, uh, on the first step,"},{"from":2076.4,"to":2078.97,"location":2,"content":"the loss would be the negative log probability of the next word,"},{"from":2078.97,"to":2080.06,"location":2,"content":"which is, in this example,"},{"from":2080.06,"to":2082.04,"location":2,"content":"students, [NOISE] and so on."},{"from":2082.04,"to":2085.07,"location":2,"content":"Each of those is the negative log probability of the next word."},{"from":2085.07,"to":2087.51,"location":2,"content":"[NOISE] And then, once you've computed all of those,"},{"from":2087.51,"to":2089.59,"location":2,"content":"you can add them [NOISE] all up and average them,"},{"from":2089.59,"to":2091.16,"location":2,"content":"and then, this gives you your final loss."},{"from":2091.16,"to":2096.26,"location":2,"content":"[NOISE] Okay. So, there's a caveat here."},{"from":2096.26,"to":2099.93,"location":2,"content":"Um, computing the loss and gradients across the entire corpus,"},{"from":2099.93,"to":2102.35,"location":2,"content":"all of those words X1 up to X capital T is too"},{"from":2102.35,"to":2104.84,"location":2,"content":"expensive [NOISE] because your corpus is probably really big."},{"from":2104.84,"to":2107.81,"location":2,"content":"[NOISE] So, um, as a student asked earlier,"},{"from":2107.81,"to":2110.55,"location":2,"content":"uh, in practice, what do you actually regard as your sequence?"},{"from":2110.55,"to":2112.58,"location":2,"content":"So, in practice, you might regard your sequence as, uh,"},{"from":2112.58,"to":2114.59,"location":2,"content":"something like a sentence or a document,"},{"from":2114.59,"to":2117.43,"location":2,"content":"some shorter unit of text."},{"from":2117.43,"to":2120.89,"location":2,"content":"So, uh, another thing you'll do [NOISE] is, if you remember,"},{"from":2120.89,"to":2123.78,"location":2,"content":"stochastic gradient descent allows you to compute gradients"},{"from":2123.78,"to":2126.98,"location":2,"content":"for small chunks of data rather than the whole corpus at a time."},{"from":2126.98,"to":2129.28,"location":2,"content":"So, in practice, if you're training a language model,"},{"from":2129.28,"to":2132.83,"location":2,"content":"what you're actually likely to be doing is computing the loss for a sentence,"},{"from":2132.83,"to":2135.29,"location":2,"content":"but that's actually a batch of sentences, and then,"},{"from":2135.29,"to":2137.95,"location":2,"content":"you compute the gradients with respect to that batch of sentences,"},{"from":2137.95,"to":2139.76,"location":2,"content":"update your weights, and repeat."},{"from":2139.76,"to":2146.41,"location":2,"content":"Any questions at this point? 
[NOISE] Okay."},{"from":2146.41,"to":2148.04,"location":2,"content":"So, uh, moving onto backprop."},{"from":2148.04,"to":2151.05,"location":2,"content":"Don't worry, there won't be as much backprop as there was last week,"},{"from":2151.05,"to":2153.23,"location":2,"content":"but, uh, there's an interesting question here, right?"},{"from":2153.23,"to":2155.9,"location":2,"content":"So, the, uh, characteristic thing about RNNs"},{"from":2155.9,"to":2158.97,"location":2,"content":"is that they apply the same weight matrix repeatedly."},{"from":2158.97,"to":2160.28,"location":2,"content":"So, the question is,"},{"from":2160.28,"to":2162.22,"location":2,"content":"[NOISE] what's the derivative of our loss function,"},{"from":2162.22,"to":2163.61,"location":2,"content":"let's say, on step T?"},{"from":2163.61,"to":2168.64,"location":2,"content":"What's the derivative of that loss with respect to the repeated weight matrix WH?"},{"from":2168.64,"to":2173.57,"location":2,"content":"So, the answer is that the derivative of the loss, uh,"},{"from":2173.57,"to":2176.39,"location":2,"content":"the gradient with respect to the repeated weight is"},{"from":2176.39,"to":2179.78,"location":2,"content":"the sum of the gradient with respect to each time it appears,"},{"from":2179.78,"to":2181.36,"location":2,"content":"and that's what that equation says."},{"from":2181.36,"to":2185.61,"location":2,"content":"So, on the right, the notation with the vertical line and the I is saying, uh,"},{"from":2185.61,"to":2190.67,"location":2,"content":"the derivative of the loss with respect to WH when it appears on the Ith step."},{"from":2190.67,"to":2192.77,"location":2,"content":"Okay. So, so, why is that true?"},{"from":2192.77,"to":2195.26,"location":2,"content":"[NOISE] Uh, to sketch why this is true,"},{"from":2195.26,"to":2197.84,"location":2,"content":"uh, [NOISE] I'm gonna remind you of the multivariable chain rule."},{"from":2197.84,"to":2202.53,"location":2,"content":"So, uh, this is a screenshot from a Khan Academy article on the multivariable chain rule,"},{"from":2202.53,"to":2204.44,"location":2,"content":"and, uh, I advise you check it out if you"},{"from":2204.44,"to":2206.63,"location":2,"content":"want to learn more because it's very easy to understand."},{"from":2206.63,"to":2208.22,"location":2,"content":"Uh, and what it says is,"},{"from":2208.22,"to":2212.05,"location":2,"content":"given a function F [NOISE] which depends on X and Y,"},{"from":2212.05,"to":2216.14,"location":2,"content":"which are both themselves functions of some variable T, then,"},{"from":2216.14,"to":2219.43,"location":2,"content":"if you want to get the derivative of F with respect to T,"},{"from":2219.43,"to":2224.38,"location":2,"content":"then you need to do the chain ru- rule across X and Y separately and then add them up."},{"from":2224.38,"to":2227.02,"location":2,"content":"[NOISE] So, that's the multivariable chain rule,"},{"from":2227.02,"to":2230.51,"location":2,"content":"[NOISE] and if we apply this to our scenario with trying to take"},{"from":2230.51,"to":2234.89,"location":2,"content":"the derivative of the loss JT with respect to our weight matrix WH,"},{"from":2234.89,"to":2239.3,"location":2,"content":"then you could view it as this kind of diagram [NOISE] where WH has, uh,"},{"from":2239.3,"to":2242.81,"location":2,"content":"a relationship with all of these individual appearances of WH,"},{"from":2242.81,"to":2243.86,"location":2,"content":"but it's a [NOISE] simple 
relationship,"},{"from":2243.86,"to":2245.49,"location":2,"content":"it's just equality, and then,"},{"from":2245.49,"to":2249.69,"location":2,"content":"each of those appearances of WH affect the loss in different ways."},{"from":2249.69,"to":2254.08,"location":2,"content":"So, then, if we apply the multivariable chain rule,"},{"from":2254.08,"to":2257.47,"location":2,"content":"then it says that the derivative of the loss with respect to"},{"from":2257.47,"to":2261.19,"location":2,"content":"WH is the sum of those chain rule things,"},{"from":2261.19,"to":2265.6,"location":2,"content":"but the expression on the right is just one because it's an equality relation,"},{"from":2265.6,"to":2270.48,"location":2,"content":"[NOISE] and then, that gives us the equation that I wrote on the previous slide."},{"from":2270.48,"to":2275.24,"location":2,"content":"So, this is a proof sketch for why the derivative of the loss with"},{"from":2275.24,"to":2280.57,"location":2,"content":"respect to our recurrent matrix is the sum of the derivatives each time it appears."},{"from":2280.57,"to":2283.19,"location":2,"content":"Okay. So, suppose you believe me on that, that is,"},{"from":2283.19,"to":2284.55,"location":2,"content":"how you compute the, uh,"},{"from":2284.55,"to":2286.47,"location":2,"content":"gradient with respect to the recurrent weight."},{"from":2286.47,"to":2288.44,"location":2,"content":"So, a remaining question is, well,"},{"from":2288.44,"to":2290.72,"location":2,"content":"how [NOISE] do we actually calculate this in practice?"},{"from":2290.72,"to":2296.66,"location":2,"content":"[NOISE] So, the answer is that you're going to calculate this sum by doing backprop,"},{"from":2296.66,"to":2299.39,"location":2,"content":"uh, backwards, kind of right to left, um,"},{"from":2299.39,"to":2303.59,"location":2,"content":"through the RNN, and you're going to accumulate this sum as you go."},{"from":2303.59,"to":2304.94,"location":2,"content":"So, the important thing is,"},{"from":2304.94,"to":2308.43,"location":2,"content":"you shouldn't compute each of those things separately, uh,"},{"from":2308.43,"to":2310.88,"location":2,"content":"you should compute them by accumulating, like,"},{"from":2310.88,"to":2314.36,"location":2,"content":"each one can be computed in form- in terms of the previous one."},{"from":2314.36,"to":2319.13,"location":2,"content":"[NOISE] So, this algorithm of computing each of these,"},{"from":2319.13,"to":2321.32,"location":2,"content":"uh, each of these gradients with respect to"},{"from":2321.32,"to":2324.3,"location":2,"content":"the previous one is called backpropagation through time."},{"from":2324.3,"to":2327.65,"location":2,"content":"And, um, I always think that this sounds way more sci-fi than it is."},{"from":2327.65,"to":2329.03,"location":2,"content":"It sounds like it's time travel or something,"},{"from":2329.03,"to":2330.56,"location":2,"content":"but it's actually pretty simple."},{"from":2330.56,"to":2333.29,"location":2,"content":"Uh, it's just the name you give to"},{"from":2333.29,"to":2337.96,"location":2,"content":"applying the backprop algorithm to a recurrent neural network."},{"from":2337.96,"to":2342.35,"location":2,"content":"Any questions at this point? Yep. 
[NOISE]"},{"from":2342.35,"to":2347.24,"location":2,"content":"So, it seems that how you break up the batches matter your end result."},{"from":2347.24,"to":2355.7,"location":2,"content":"[inaudible]."},{"from":2355.7,"to":2361.46,"location":2,"content":"So, if you break it into much more [inaudible]."},{"from":2361.46,"to":2363.61,"location":2,"content":"Okay. So the question is, um, surely,"},{"from":2363.61,"to":2367.86,"location":2,"content":"how you decide to break up your batches affects how you learn, right?"},{"from":2367.86,"to":2369.56,"location":2,"content":"Because if you choose, uh,"},{"from":2369.56,"to":2371.66,"location":2,"content":"one set of data to be your batch, right, then,"},{"from":2371.66,"to":2373.88,"location":2,"content":"you will make your update based on that, and then,"},{"from":2373.88,"to":2376.76,"location":2,"content":"you only update the next one based on [NOISE] where you go from there."},{"from":2376.76,"to":2378.95,"location":2,"content":"So, if you decided to put different data in the batch,"},{"from":2378.95,"to":2380.49,"location":2,"content":"then you would have made a different step."},{"from":2380.49,"to":2382.91,"location":2,"content":"So, that's true, [NOISE] and that is why"},{"from":2382.91,"to":2385.91,"location":2,"content":"stochastic gradient descent is only an approximation of"},{"from":2385.91,"to":2389.66,"location":2,"content":"true gradient descent because the gradient that you compute with"},{"from":2389.66,"to":2393.95,"location":2,"content":"respect to one batch is just an approximation of the true gradient with respect to the,"},{"from":2393.95,"to":2396.09,"location":2,"content":"uh, the loss over the whole corpus."},{"from":2396.09,"to":2398.16,"location":2,"content":"So, yes, it's true that it's an approximation"},{"from":2398.16,"to":2400.58,"location":2,"content":"and how [NOISE] you choose to batch up your data can matter,"},{"from":2400.58,"to":2403.04,"location":2,"content":"and that's why, for example, shuffling your data is a good idea,"},{"from":2403.04,"to":2405.57,"location":2,"content":"and shuffling it differently, each epoch, is a good idea."},{"from":2405.57,"to":2409.13,"location":2,"content":"Uh, but the, the core idea of SGD is [NOISE] that, um,"},{"from":2409.13,"to":2412.09,"location":2,"content":"it should be a good enough approximation that over many steps,"},{"from":2412.09,"to":2414.74,"location":2,"content":"you will, uh, minimize your loss."},{"from":2414.74,"to":2433.01,"location":2,"content":"[NOISE] Any other questions? [NOISE] Yeah."},{"from":2433.01,"to":2435.41,"location":2,"content":"[NOISE] So, is, uh, is the question,"},{"from":2435.41,"to":2437.18,"location":2,"content":"as you compute forward prop,"},{"from":2437.18,"to":2440.34,"location":2,"content":"do you start computing backprop before you've even, like, got to the loss?"},{"from":2440.34,"to":2441.62,"location":2,"content":"Is that the question? [NOISE]"},{"from":2441.62,"to":2442.32,"location":2,"content":"Yes."},{"from":2442.32,"to":2445.64,"location":2,"content":"I didn't think so, right? 
Because you need to know what the loss is in"},{"from":2445.64,"to":2449.03,"location":2,"content":"order to compute the derivative of the loss with respect to something."},{"from":2449.03,"to":2450.56,"location":2,"content":"So, I think you need to get to the end."},{"from":2450.56,"to":2451.76,"location":2,"content":"So, if we assume, for simplicity,"},{"from":2451.76,"to":2454.49,"location":2,"content":"that there is only one loss which you get at the end of several steps,"},{"from":2454.49,"to":2455.59,"location":2,"content":"then you need to get to the end,"},{"from":2455.59,"to":2459.36,"location":2,"content":"compute the loss before you can compute the derivatives."},{"from":2459.36,"to":2462.2,"location":2,"content":"But I suppose you, you, you could compute the derivative of two,"},{"from":2462.2,"to":2464.24,"location":2,"content":"kind of, adjacent things of one with respect to the other."},{"from":2464.24,"to":2465.47,"location":2,"content":"[OVERLAPPING] But, yeah. [NOISE]"},{"from":2465.47,"to":2467.78,"location":2,"content":"As you're going forward, do- you need to sort of keep a track of what,"},{"from":2467.78,"to":2473.72,"location":2,"content":"what you would have [inaudible] the one you eventually get the loss. [inaudible]"},{"from":2473.72,"to":2475.86,"location":2,"content":"Yes. So, when you forward prop,"},{"from":2475.86,"to":2479.66,"location":2,"content":"you certainly have to hang on to all of the intervening factors."},{"from":2479.66,"to":2480.68,"location":2,"content":"[NOISE] Okay. I'm gonna move on for now."},{"from":2480.68,"to":2484.79,"location":2,"content":"Uh, so, that was a maths-heavy bit but,"},{"from":2484.79,"to":2487.13,"location":2,"content":"um, now, we're getting on to text generation,"},{"from":2487.13,"to":2488.68,"location":2,"content":"which someone asked about earlier."},{"from":2488.68,"to":2492.97,"location":2,"content":"So, um, just as we use the n-gram language model to generate text,"},{"from":2492.97,"to":2496.11,"location":2,"content":"you can also use an RNN language model to generate text,"},{"from":2496.11,"to":2498.65,"location":2,"content":"uh, via the same repeated sampling technique [see the code sketch below]."},{"from":2498.65,"to":2501.05,"location":2,"content":"Um, so, here's a picture of how that would work."},{"from":2501.05,"to":2503.99,"location":2,"content":"You start off with your initial hidden state H0, uh,"},{"from":2503.99,"to":2506.33,"location":2,"content":"which, uh, we have either as a parameter of"},{"from":2506.33,"to":2509.06,"location":2,"content":"the model or we initialize it to zero, or something like that."},{"from":2509.06,"to":2511.34,"location":2,"content":"So, let's suppose that we have the first word my,"},{"from":2511.34,"to":2514.24,"location":2,"content":"and let's suppose I, um, supply that to the model."},{"from":2514.24,"to":2517.24,"location":2,"content":"So, then, using the inputs and the initial hidden state,"},{"from":2517.24,"to":2519.2,"location":2,"content":"we can get our first hidden state H1."},{"from":2519.2,"to":2521.55,"location":2,"content":"And then from there, we can compute the, er,"},{"from":2521.55,"to":2524.76,"location":2,"content":"probability distribution Y hat one of what's coming next,"},{"from":2524.76,"to":2527.43,"location":2,"content":"and then we can use that distribution to sample some word."},{"from":2527.43,"to":2529.39,"location":2,"content":"So let's suppose that we sampled the word favorite."},{"from":2529.39,"to":2534.2,"location":2,"content":"So, the idea is that we use the 
outputted word as the input on the next step."},{"from":2534.2,"to":2536.96,"location":2,"content":"So, we feed favorite into the second step of the RNN,"},{"from":2536.96,"to":2538.22,"location":2,"content":"we get a new hidden state,"},{"from":2538.22,"to":2540.78,"location":2,"content":"and again we get a new probability distribution,"},{"from":2540.78,"to":2542.89,"location":2,"content":"and from that we can sample a new word."},{"from":2542.89,"to":2545.68,"location":2,"content":"So, we can just continue doing this process again and again,"},{"from":2545.68,"to":2547.68,"location":2,"content":"and in this way we can generate some text."},{"from":2547.68,"to":2549.5,"location":2,"content":"So, uh, here we've generated the text,"},{"from":2549.5,"to":2550.76,"location":2,"content":"My favorite season is Spring,"},{"from":2550.76,"to":2556.06,"location":2,"content":"and we can keep going for as long as we'd like."},{"from":2556.06,"to":2559.13,"location":2,"content":"Okay, so, uh, let's have some fun with this."},{"from":2559.13,"to":2561.39,"location":2,"content":"Uh, you can generate,"},{"from":2561.39,"to":2563.89,"location":2,"content":"uh, text using an RNN language model."},{"from":2563.89,"to":2568.07,"location":2,"content":"If you train the RNN language model on any kind of text,"},{"from":2568.07,"to":2571.34,"location":2,"content":"then you can use it to generate text in that style."},{"from":2571.34,"to":2573.38,"location":2,"content":"And in fact, this has become a whole kind of"},{"from":2573.38,"to":2575.78,"location":2,"content":"genre of internet humor that you might've seen."},{"from":2575.78,"to":2577.59,"location":2,"content":"So, uh, for example,"},{"from":2577.59,"to":2580.93,"location":2,"content":"here is an RNN language model trained on Obama speeches,"},{"from":2580.93,"to":2583.1,"location":2,"content":"and I found this in a blog post online."},{"from":2583.1,"to":2587.12,"location":2,"content":"So, here's the text that the RNN language model generated."},{"from":2587.12,"to":2591.35,"location":2,"content":"\"The United States will step up to the cost of a new challenges of"},{"from":2591.35,"to":2595.52,"location":2,"content":"the American people that will share the fact that we created the problem."},{"from":2595.52,"to":2599.63,"location":2,"content":"They were attacked and so that they have to say that"},{"from":2599.63,"to":2604.19,"location":2,"content":"all the task of the final days of war that I will not be able to get this done.\""},{"from":2604.19,"to":2607.13,"location":2,"content":"[LAUGHTER] Okay."},{"from":2607.13,"to":2610.2,"location":2,"content":"So, if we look at this and"},{"from":2610.2,"to":2612.23,"location":2,"content":"especially think about what did"},{"from":2612.23,"to":2614.57,"location":2,"content":"that text look like that we got from the n-gram language model,"},{"from":2614.57,"to":2616.16,"location":2,"content":"the one about the, the price of gold."},{"from":2616.16,"to":2619.72,"location":2,"content":"Um, I'd say that this is kind of recognizably better than that."},{"from":2619.72,"to":2621.62,"location":2,"content":"It seems more fluent overall."},{"from":2621.62,"to":2623.69,"location":2,"content":"Uh, I'd say it has a more of"},{"from":2623.69,"to":2628.53,"location":2,"content":"a sustained context in that it kind of makes sense for longer stretches at a time,"},{"from":2628.53,"to":2631.67,"location":2,"content":"and I'd say it does sound totally like Obama as 
well."},{"from":2631.67,"to":2633.03,"location":2,"content":"So, all of that's pretty good,"},{"from":2633.03,"to":2635.74,"location":2,"content":"but you can see that it's still pretty incoherent overall,"},{"from":2635.74,"to":2638.93,"location":2,"content":"like i- it was quite difficult to read it because it didn't really make sense, right?"},{"from":2638.93,"to":2640.13,"location":2,"content":"So I had to read the words carefully."},{"from":2640.13,"to":2642.89,"location":2,"content":"Um, so, yeah, I think this shows"},{"from":2642.89,"to":2646.31,"location":2,"content":"some of the progress you can get from using RNNs to generate text but still,"},{"from":2646.31,"to":2649.61,"location":2,"content":"um, very far from human level. Here are some more examples."},{"from":2649.61,"to":2653.28,"location":2,"content":"Uh, here's an RNN language model that was trained on the Harry Potter books."},{"from":2653.28,"to":2657.09,"location":2,"content":"And here's what it said. \"Sorry.\" Harry shouted, panicking."},{"from":2657.09,"to":2659.6,"location":2,"content":"\"I'll leave those brooms in London.\" Are they?"},{"from":2659.6,"to":2661.88,"location":2,"content":"\"No idea.\" said Nearly Headless Nick,"},{"from":2661.88,"to":2663.74,"location":2,"content":"casting low close by Cedric,"},{"from":2663.74,"to":2666.98,"location":2,"content":"carrying the last bit of treacle Charms from Harry's shoulder."},{"from":2666.98,"to":2669.29,"location":2,"content":"And to answer him the common room perched upon it,"},{"from":2669.29,"to":2673.03,"location":2,"content":"four arms held a shining knob from when the Spider hadn't felt it seemed."},{"from":2673.03,"to":2674.86,"location":2,"content":"He reached the teams too.\""},{"from":2674.86,"to":2678.07,"location":2,"content":"So, again, I'd say that this is fairly fluent."},{"from":2678.07,"to":2680,"location":2,"content":"It sounds totally like the Harry Potter books."},{"from":2680,"to":2681.71,"location":2,"content":"In fact, I'm pretty impressed by how much it does"},{"from":2681.71,"to":2684.17,"location":2,"content":"sound like in the voice of the Harry Potter books."},{"from":2684.17,"to":2686.51,"location":2,"content":"You even got some character attributes,"},{"from":2686.51,"to":2690.39,"location":2,"content":"I'd say that Harry the character does often panic in the book so that seems right."},{"from":2690.39,"to":2694.52,"location":2,"content":"Um, [LAUGHTER] but some bad things are that we have,"},{"from":2694.52,"to":2698.66,"location":2,"content":"for example, a pretty long run-on sentence in the second paragraph that's hard to read."},{"from":2698.66,"to":2701.49,"location":2,"content":"Uh, you have some nonsensical things that really make no sense."},{"from":2701.49,"to":2703.2,"location":2,"content":"Like, I don't know what a treacle charm is."},{"from":2703.2,"to":2704.89,"location":2,"content":"It sounds delicious but I don't think it's real,"},{"from":2704.89,"to":2707.79,"location":2,"content":"uh, and overall it's just pretty nonsensical."},{"from":2707.79,"to":2712.86,"location":2,"content":"Here's another example. 
Here is an RNN language model that was trained on recipes."},{"from":2712.86,"to":2716,"location":2,"content":"So, uh, [LAUGHTER] this one's pretty bizarre,"},{"from":2716,"to":2718.57,"location":2,"content":"the title is 'chocolate ranch barbecue',"},{"from":2718.57,"to":2720.95,"location":2,"content":"It contains Parmesan cheese,"},{"from":2720.95,"to":2725.55,"location":2,"content":"coconut milk, eggs, and the recipe says place each pasta over layers of lumps,"},{"from":2725.55,"to":2729.5,"location":2,"content":"shape mixture into the moderate oven and simmer until firm."},{"from":2729.5,"to":2731.21,"location":2,"content":"Serve hot in bodied fresh,"},{"from":2731.21,"to":2732.57,"location":2,"content":"mustard orange and cheese."},{"from":2732.57,"to":2735.82,"location":2,"content":"Combine the cheese and salt together the dough in a large skillet;"},{"from":2735.82,"to":2738.14,"location":2,"content":"add the ingredients and stir in the chocolate and pepper."},{"from":2738.14,"to":2741.64,"location":2,"content":"[LAUGHTER] Um, so, one thing that I think is"},{"from":2741.64,"to":2745.34,"location":2,"content":"even more clear here in the recipes example than the prose example,"},{"from":2745.34,"to":2749.41,"location":2,"content":"is the inability to remember what's [NOISE] what's happening overall, right?"},{"from":2749.41,"to":2753.02,"location":2,"content":"Cuz a recipe you could say is pretty challenging because you need to remember"},{"from":2753.02,"to":2757.1,"location":2,"content":"the title of what you're trying to make which in this case is chocolate ranch barbecue,"},{"from":2757.1,"to":2759.47,"location":2,"content":"and you need to actually, you know, make that thing by the end."},{"from":2759.47,"to":2761.06,"location":2,"content":"Uh, you also need to remember what were the ingredients"},{"from":2761.06,"to":2762.5,"location":2,"content":"in the beginning and did you use them."},{"from":2762.5,"to":2765.23,"location":2,"content":"And in a recipe, if you make something and put it in the oven,"},{"from":2765.23,"to":2767.72,"location":2,"content":"you need to take it out later, a- and stuff like that, right?"},{"from":2767.72,"to":2769.4,"location":2,"content":"So, clearly it's not really"},{"from":2769.4,"to":2771.89,"location":2,"content":"remembering what's happening overall or what it's trying to do,"},{"from":2771.89,"to":2773.91,"location":2,"content":"it seems to be just generating kind of"},{"from":2773.91,"to":2777.78,"location":2,"content":"generic recipe sentences and putting them in a random order."},{"from":2777.78,"to":2780.64,"location":2,"content":"Uh, but again, I mean, we can see that it's fairly fluent,"},{"from":2780.64,"to":2783.35,"location":2,"content":"it's grammatically right, it kind of sounds like a recipe."},{"from":2783.35,"to":2785.86,"location":2,"content":"Uh, but the problem is it's just nonsensical."},{"from":2785.86,"to":2788.3,"location":2,"content":"Like for example, shape mixture into"},{"from":2788.3,"to":2791.34,"location":2,"content":"the moderate oven is grammatical but it doesn't make any sense."},{"from":2791.34,"to":2793.3,"location":2,"content":"Okay, last example."},{"from":2793.3,"to":2797.51,"location":2,"content":"So, here's an RNN language model that's trained on paint-color names."},{"from":2797.51,"to":2801.2,"location":2,"content":"And this is an example of a character-level language model because"},{"from":2801.2,"to":2804.84,"location":2,"content":"it's predicting what character comes next not what word comes 
next."},{"from":2804.84,"to":2807.65,"location":2,"content":"And this is why it's able to come up with new words."},{"from":2807.65,"to":2809.84,"location":2,"content":"Another thing to note is that this language model was"},{"from":2809.84,"to":2812.09,"location":2,"content":"trained to be conditioned on some kind of input."},{"from":2812.09,"to":2815.78,"location":2,"content":"So here, the input is the color itself I think represented by the three numbers,"},{"from":2815.78,"to":2817.14,"location":2,"content":"that's probably RGB numbers."},{"from":2817.14,"to":2820.93,"location":2,"content":"And it generated some names for the colors."},{"from":2820.93,"to":2822.14,"location":2,"content":"And I think these are pretty funny."},{"from":2822.14,"to":2824.06,"location":2,"content":"My favorite one is Stanky Bean,"},{"from":2824.06,"to":2825.14,"location":2,"content":"which is in the bottom right."},{"from":2825.14,"to":2827.93,"location":2,"content":"[LAUGHTER] Um, so, it's pretty creative,"},{"from":2827.93,"to":2830.21,"location":2,"content":"[LAUGHTER] and I think these do sound kind of"},{"from":2830.21,"to":2833.36,"location":2,"content":"like paint colors but often they're quite bizarre."},{"from":2833.36,"to":2840.91,"location":2,"content":"[LAUGHTER] Light of Blast is pretty good too."},{"from":2840.91,"to":2843.5,"location":2,"content":"So, uh, you're gonna learn more about"},{"from":2843.5,"to":2845.76,"location":2,"content":"character-level language models in a future lecture,"},{"from":2845.76,"to":2848.87,"location":2,"content":"and you're also going to learn more about how to condition a language model"},{"from":2848.87,"to":2852.44,"location":2,"content":"based on some kind of input such as the color, um, code."},{"from":2852.44,"to":2854.33,"location":2,"content":"So, these are pretty funny,"},{"from":2854.33,"to":2855.89,"location":2,"content":"uh, but I do want to say a warning."},{"from":2855.89,"to":2858.92,"location":2,"content":"Um, you'll find a lot of these kinds of articles online,"},{"from":2858.92,"to":2860.59,"location":2,"content":"uh, often with headlines like,"},{"from":2860.59,"to":2863,"location":2,"content":"\"We forced a bot to watch, you know,"},{"from":2863,"to":2866.7,"location":2,"content":"1000 hours of sci-fi movies and it wrote a script,\" something like that."},{"from":2866.7,"to":2870.8,"location":2,"content":"Um, so, my advice is you have to take these with a big pinch of salt, because often,"},{"from":2870.8,"to":2873.08,"location":2,"content":"uh, the examples that people put online were"},{"from":2873.08,"to":2875.38,"location":2,"content":"hand selected by humans to be the funniest examples."},{"from":2875.38,"to":2878.66,"location":2,"content":"Like I think all of the examples I've shown today were definitely hand selected"},{"from":2878.66,"to":2882.2,"location":2,"content":"by humans as the funniest examples that the RNN came up with."},{"from":2882.2,"to":2885.45,"location":2,"content":"And in some cases they might even have been edited by a human."},{"from":2885.45,"to":2888.56,"location":2,"content":"So, uh, yeah, you do need to be a little bit skeptical when you look at these examples."},{"from":2888.56,"to":2890.2,"location":2,"content":"[OVERLAPPING] Yep."},{"from":2890.2,"to":2892.93,"location":2,"content":"So, uh, in the Harry Potter one,"},{"from":2892.93,"to":2896.63,"location":2,"content":"there was a opening quote and then there was a closing quote."},{"from":2896.63,"to":2898.74,"location":2,"content":"So, like do you expect 
the RNN,"},{"from":2898.74,"to":2902,"location":2,"content":"like when it puts that opening quote and keeps putting more words,"},{"from":2902,"to":2908.82,"location":2,"content":"do you expect the probability of a closing quote to like increase as you're going or decrease?"},{"from":2908.82,"to":2911.15,"location":2,"content":"That's a great question. So, uh,"},{"from":2911.15,"to":2912.51,"location":2,"content":"the question was, uh,"},{"from":2912.51,"to":2914.45,"location":2,"content":"we noticed that in the Harry Potter example,"},{"from":2914.45,"to":2916.3,"location":2,"content":"there were some open quotes and some closed quotes."},{"from":2916.3,"to":2918.41,"location":2,"content":"And it looks like the model didn't screw up, right?"},{"from":2918.41,"to":2920.07,"location":2,"content":"All of these open quotes and closed quotes,"},{"from":2920.07,"to":2921.82,"location":2,"content":"uh, are in the correct places."},{"from":2921.82,"to":2924.45,"location":2,"content":"So, the question is, do we expect the model to put"},{"from":2924.45,"to":2928.78,"location":2,"content":"a higher probability on closing the quote given that it is inside a quo- quote passage?"},{"from":2928.78,"to":2931.11,"location":2,"content":"So, I should say definitely yes and"},{"from":2931.11,"to":2934.22,"location":2,"content":"that's most- mostly the explanation for why this works."},{"from":2934.22,"to":2936.5,"location":2,"content":"Um, there's been some really interesting work in trying"},{"from":2936.5,"to":2938.54,"location":2,"content":"to look inside the hidden states of, uh,"},{"from":2938.54,"to":2941.34,"location":2,"content":"language models to see whether it's tracking things like,"},{"from":2941.34,"to":2943.61,"location":2,"content":"are we inside an open quote or a close quote?"},{"from":2943.61,"to":2946.43,"location":2,"content":"And there has been some limited evidence to show that"},{"from":2946.43,"to":2949.37,"location":2,"content":"maybe there is a certain neuron or neurons inside the hidden state,"},{"from":2949.37,"to":2950.9,"location":2,"content":"which are tracking things like,"},{"from":2950.9,"to":2952.55,"location":2,"content":"are we currently inside a quote or not?"},{"from":2952.55,"to":2953.86,"location":2,"content":"[NOISE]. Yeah."},{"from":2953.86,"to":2958.37,"location":2,"content":"So, so, like do you think the probability would increase as you go more to the right [OVERLAPPING]?"},{"from":2958.37,"to":2962.27,"location":2,"content":"So, the question is as the quote passage goes on for longer,"},{"from":2962.27,"to":2963.74,"location":2,"content":"do you think the priority or"},{"from":2963.74,"to":2966.77,"location":2,"content":"the probability of outputting a closed quote should increase?"},{"from":2966.77,"to":2968.05,"location":2,"content":"Um, I don't know."},{"from":2968.05,"to":2971.42,"location":2,"content":"Maybe. Um, that would be good, I suppose,"},{"from":2971.42,"to":2972.98,"location":2,"content":"because you don't want an infinite quote,"},{"from":2972.98,"to":2975.65,"location":2,"content":"uh, but I wouldn't be surprised if that didn't happen."},{"from":2975.65,"to":2979.4,"location":2,"content":"Like I wouldn't be surprised if maybe some other worse-trained language models,"},{"from":2979.4,"to":2981.39,"location":2,"content":"just opened quotes and never closed them."},{"from":2981.39,"to":2984.82,"location":2,"content":"Uh, any other questions? 
Yeah."},{"from":2984.82,"to":2987.61,"location":2,"content":"What are the dimensions of the W metric?"},{"from":2987.61,"to":2990.71,"location":2,"content":"Okay. So, the question is what are the dimensions of the W metric?"},{"from":2990.71,"to":2992.48,"location":2,"content":"So we're going back to the online stuff."},{"from":2992.48,"to":2995.9,"location":2,"content":"Uh, okay. You're asking me about W_h or W_e or something else?"},{"from":2995.9,"to":2996.61,"location":2,"content":"Yeah."},{"from":2996.61,"to":2998.96,"location":2,"content":"So, W_h will be,"},{"from":2998.96,"to":3001.43,"location":2,"content":"uh, if we say that the hidden size has size n,"},{"from":3001.43,"to":3007.24,"location":2,"content":"then W_h will be n by n. And if we suppose that the embeddings have size d,"},{"from":3007.24,"to":3008.64,"location":2,"content":"then W_e will be, uh,"},{"from":3008.64,"to":3012.55,"location":2,"content":"d by n, n by d, maybe."},{"from":3012.55,"to":3019.99,"location":2,"content":"Does that answer your question? [NOISE] Uh,"},{"from":3019.99,"to":3023.38,"location":2,"content":"any other questions about generating or anything? Yep."},{"from":3023.38,"to":3028.03,"location":2,"content":"So, you said that there was a long sentence in the Harry Potter-related text?"},{"from":3028.03,"to":3028.43,"location":2,"content":"Yeah."},{"from":3028.43,"to":3033.64,"location":2,"content":"Is it ever sort of practical to combine RNNs with like in this hand written rules?"},{"from":3033.64,"to":3035.39,"location":2,"content":"Sorry. Is it ever practical to combine-"},{"from":3035.39,"to":3037.81,"location":2,"content":"RNNs with a written list of hand-written rules."},{"from":3037.81,"to":3038.83,"location":2,"content":"[OVERLAPPING]"},{"from":3038.83,"to":3039.88,"location":2,"content":"Okay. Yeah. That's a great question."},{"from":3039.88,"to":3042.22,"location":2,"content":"So the question was, is it ever practical to"},{"from":3042.22,"to":3044.98,"location":2,"content":"combine RNNs with a list of hand-written rules?"},{"from":3044.98,"to":3049.28,"location":2,"content":"For example, don't let your sentence be longer than this many words."},{"from":3049.28,"to":3050.53,"location":2,"content":"Um, so yeah."},{"from":3050.53,"to":3054.07,"location":2,"content":"I'd say it probably is practical maybe especially if you're interested in, uh,"},{"from":3054.07,"to":3056.26,"location":2,"content":"making sure that certain bad things don't happen,"},{"from":3056.26,"to":3061.9,"location":2,"content":"you might apply some hacky rules like yeah forcing it to end, uh, early."},{"from":3061.9,"to":3063.58,"location":2,"content":"I mean, okay. 
So there's this thing called Beam Search"},{"from":3063.58,"to":3065.34,"location":2,"content":"which we're going to learn about in a later lecture,"},{"from":3065.34,"to":3066.64,"location":2,"content":"which essentially doesn't just,"},{"from":3066.64,"to":3069.34,"location":2,"content":"um, choose one word in each step and continue."},{"from":3069.34,"to":3072.32,"location":2,"content":"It explores many different options for words you could generate."},{"from":3072.32,"to":3074.41,"location":2,"content":"And you can apply some kinds of rules on that"},{"from":3074.41,"to":3076.54,"location":2,"content":"where if you have lots of different things to choose from,"},{"from":3076.54,"to":3078.25,"location":2,"content":"then you can maybe get rid of"},{"from":3078.25,"to":3081.26,"location":2,"content":"some options if you don't like them because they break some of your rules."},{"from":3081.26,"to":3089.49,"location":2,"content":"But, um, it can be difficult to do. Any other questions?"},{"from":3089.49,"to":3098.38,"location":2,"content":"Okay. Um, so we've talked about generating from language models."},{"from":3098.38,"to":3100.63,"location":2,"content":"Uh, so unfortunately, you can't just use"},{"from":3100.63,"to":3104.14,"location":2,"content":"generation as your evaluation metric for the language models."},{"from":3104.14,"to":3107.24,"location":2,"content":"You do need some kind of, um, measurable metric."},{"from":3107.24,"to":3112.01,"location":2,"content":"So, the standard evaluation metric for language models is called perplexity."},{"from":3112.01,"to":3114.25,"location":2,"content":"And, uh, perplexity is defined as"},{"from":3114.25,"to":3118.48,"location":2,"content":"the inverse probability of the corpus according to the language model."},{"from":3118.48,"to":3122.2,"location":2,"content":"So, if you look at it you can see that that's what this formula is saying."},{"from":3122.2,"to":3124.07,"location":2,"content":"It's saying that for every, uh,"},{"from":3124.07,"to":3127.55,"location":2,"content":"word xt, lowercase t, in the corpus, uh,"},{"from":3127.55,"to":3130.42,"location":2,"content":"we're computing the probability of that word given"},{"from":3130.42,"to":3133.63,"location":2,"content":"everything that came so far but its inverse is one over that."},{"from":3133.63,"to":3136.6,"location":2,"content":"And then lastly, when normalizing this big,"},{"from":3136.6,"to":3139.96,"location":2,"content":"uh, product by the number of words,"},{"from":3139.96,"to":3143.99,"location":2,"content":"which is capital T. 
And the reason why we're doing that is because if we didn't do that,"},{"from":3143.99,"to":3148.2,"location":2,"content":"then perplexity would just get smaller and smaller as your corpus got bigger."},{"from":3148.2,"to":3151.14,"location":2,"content":"So we need to normalize by that factor."},{"from":3151.14,"to":3153.91,"location":2,"content":"So, you can actually show that this, uh,"},{"from":3153.91,"to":3158.47,"location":2,"content":"perplexity is equal to the exponential of the cross-entropy loss J Theta."},{"from":3158.47,"to":3161.47,"location":2,"content":"So if you remember, cross-entropy loss J Theta is, uh,"},{"from":3161.47,"to":3164.3,"location":2,"content":"the training objective that we're using to train the language model."},{"from":3164.3,"to":3166.55,"location":2,"content":"And, uh, by rearranging things a little bit,"},{"from":3166.55,"to":3170.89,"location":2,"content":"you can see that perplexity is actually the exponential of the cross-entropy."},{"from":3170.89,"to":3172.75,"location":2,"content":"And this is a good thing, uh,"},{"from":3172.75,"to":3175.75,"location":2,"content":"because if we're training the language model to, uh,"},{"from":3175.75,"to":3178.9,"location":2,"content":"minimize the cross-entropy loss,"},{"from":3178.9,"to":3184.8,"location":2,"content":"then you are training it to optimize the perplexity as well."},{"from":3184.8,"to":3188.86,"location":2,"content":"So you should remember that the lower perplexity is better,"},{"from":3188.86,"to":3192.64,"location":2,"content":"uh, because perplexity is the inverse probability of the corpus."},{"from":3192.64,"to":3197.97,"location":2,"content":"So, uh, if you want your language model to assign high probability to the corpus, right?"},{"from":3197.97,"to":3201.6,"location":2,"content":"Then that means you want to get low perplexity."},{"from":3201.6,"to":3208.48,"location":2,"content":"Uh, any questions? [NOISE] Okay."},{"from":3208.48,"to":3216.22,"location":2,"content":"Uh, so RNNs have been pretty successful in recent years in improving perplexity."},{"from":3216.22,"to":3219.88,"location":2,"content":"So, uh, this is a results table from a recent,"},{"from":3219.88,"to":3223.63,"location":2,"content":"uh, Facebook research paper about RNN language models."},{"from":3223.63,"to":3226.6,"location":2,"content":"And, uh, you don't have to understand all of the details of this table,"},{"from":3226.6,"to":3228.05,"location":2,"content":"but what it's telling you is that,"},{"from":3228.05,"to":3230.78,"location":2,"content":"on the, uh, top we have an n-gram language model."},{"from":3230.78,"to":3232.24,"location":2,"content":"And then in the subsequent rows,"},{"from":3232.24,"to":3235.74,"location":2,"content":"we have some increasingly complex and large RNNs."},{"from":3235.74,"to":3238.95,"location":2,"content":"And you can see that the perplexity numbers are decreasing,"},{"from":3238.95,"to":3240.47,"location":2,"content":"because lower is better."},{"from":3240.47,"to":3242.77,"location":2,"content":"So RNNs have been really great for"},{"from":3242.77,"to":3248.91,"location":2,"content":"making more effective language models in the last few years."},{"from":3248.91,"to":3251.7,"location":2,"content":"Okay. 
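In symbols, the equivalence just stated can be written out in one line (this is the standard derivation, using the same per-word conditional probabilities described above):

```latex
\text{perplexity}
  = \prod_{t=1}^{T}
    \left( \frac{1}{P_{\text{LM}}\!\left(x^{(t+1)} \mid x^{(t)},\dots,x^{(1)}\right)} \right)^{1/T}
  = \exp\!\left( \frac{1}{T} \sum_{t=1}^{T}
      -\log P_{\text{LM}}\!\left(x^{(t+1)} \mid x^{(t)},\dots,x^{(1)}\right) \right)
  = \exp\!\left( J(\theta) \right)
```

So minimizing the cross-entropy loss during training and reporting perplexity at evaluation time are measuring the same quantity, just on different scales.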
So to zoom out a little bit,"},{"from":3251.7,"to":3253.12,"location":2,"content":"you might be thinking, uh,"},{"from":3253.12,"to":3255.46,"location":2,"content":"why should I care about Language Modelling?"},{"from":3255.46,"to":3257.35,"location":2,"content":"Why is it important? I'd say there are"},{"from":3257.35,"to":3259.74,"location":2,"content":"two main reasons why Language Modelling is important."},{"from":3259.74,"to":3261.16,"location":2,"content":"Uh, so the first one is,"},{"from":3261.16,"to":3263.62,"location":2,"content":"that language modelling is a benchmark task that"},{"from":3263.62,"to":3266.77,"location":2,"content":"helps us measure our progress on understanding language."},{"from":3266.77,"to":3268.54,"location":2,"content":"So, you could view language modeling as"},{"from":3268.54,"to":3271.99,"location":2,"content":"a pretty general language understanding task, right?"},{"from":3271.99,"to":3275.43,"location":2,"content":"Because predicting what word comes next, given any,"},{"from":3275.43,"to":3277.8,"location":2,"content":"any kind of, uh, generic text."},{"from":3277.8,"to":3280.97,"location":2,"content":"Um, that's quite a difficult and general problem."},{"from":3280.97,"to":3283.33,"location":2,"content":"And in order to be good at language modelling,"},{"from":3283.33,"to":3285.34,"location":2,"content":"you have to understand a lot of things, right?"},{"from":3285.34,"to":3286.78,"location":2,"content":"You have to understand grammar,"},{"from":3286.78,"to":3288.11,"location":2,"content":"you have to understand syntax,"},{"from":3288.11,"to":3289.61,"location":2,"content":"and you have to understand,"},{"from":3289.61,"to":3291.11,"location":2,"content":"uh, logic and reasoning."},{"from":3291.11,"to":3292.57,"location":2,"content":"And you have to understand something about,"},{"from":3292.57,"to":3293.84,"location":2,"content":"you know, real-world knowledge."},{"from":3293.84,"to":3295.72,"location":2,"content":"You have to understand a lot of things in order to be"},{"from":3295.72,"to":3297.97,"location":2,"content":"able to do language modelling properly."},{"from":3297.97,"to":3299.53,"location":2,"content":"So, the reason why we care about it as"},{"from":3299.53,"to":3302.35,"location":2,"content":"a benchmark task is because if you're able to build a model,"},{"from":3302.35,"to":3305.05,"location":2,"content":"which is a better language model than the ones that came before it,"},{"from":3305.05,"to":3307.93,"location":2,"content":"then you must have made some kind of progress on at"},{"from":3307.93,"to":3311.62,"location":2,"content":"least some of those sub-components of natural language understanding."},{"from":3311.62,"to":3314.47,"location":2,"content":"So, another more tangible reason why you might"},{"from":3314.47,"to":3316.93,"location":2,"content":"care about language modelling is that it's a sub-component of"},{"from":3316.93,"to":3319.99,"location":2,"content":"many many NLP tasks especially those which involve"},{"from":3319.99,"to":3323.56,"location":2,"content":"generating text or estimating the probability of text."},{"from":3323.56,"to":3325.68,"location":2,"content":"So, here's a bunch of examples."},{"from":3325.68,"to":3327.22,"location":2,"content":"Uh, one is predictive typing."},{"from":3327.22,"to":3329.17,"location":2,"content":"That's the example that we showed at the beginning of the lecture"},{"from":3329.17,"to":3331.45,"location":2,"content":"with typing on your phone or searching on 
Google."},{"from":3331.45,"to":3335.18,"location":2,"content":"Uh, this is also very useful for people who have movement disabilities, uh,"},{"from":3335.18,"to":3339.59,"location":2,"content":"because they are these systems that help people communicate using fewer movements."},{"from":3339.59,"to":3341.92,"location":2,"content":"Uh, another example is speech recognition."},{"from":3341.92,"to":3343.6,"location":2,"content":"So, in speech recognition you have"},{"from":3343.6,"to":3345.82,"location":2,"content":"some kind of audio recording of a person saying something"},{"from":3345.82,"to":3349.97,"location":2,"content":"and often it's kind of noisy and hard to make out what they're saying and you need to,"},{"from":3349.97,"to":3351.7,"location":2,"content":"uh, figure out what words did they say."},{"from":3351.7,"to":3355.3,"location":2,"content":"So this an example where you have to estimate the probability of different,"},{"from":3355.3,"to":3358.21,"location":2,"content":"uh, different options of what, what it is they could have said."},{"from":3358.21,"to":3360.45,"location":2,"content":"And in the same way, handwriting recognition,"},{"from":3360.45,"to":3362.41,"location":2,"content":"is an example where there's a lot of noise"},{"from":3362.41,"to":3365.47,"location":2,"content":"and you have to figure out what the person intended to say."},{"from":3365.47,"to":3367.81,"location":2,"content":"Uh, spelling and grammar correction is yet"},{"from":3367.81,"to":3370.7,"location":2,"content":"another example where it's all about trying to figure out what someone meant."},{"from":3370.7,"to":3372.34,"location":2,"content":"And that means you actually understand how"},{"from":3372.34,"to":3374.7,"location":2,"content":"likely it is that they were saying different things."},{"from":3374.7,"to":3379.55,"location":2,"content":"Uh, an interesting, an interesting application is authorship identification."},{"from":3379.55,"to":3382.48,"location":2,"content":"So suppose that you have a piece of text and you're trying to"},{"from":3382.48,"to":3385.49,"location":2,"content":"figure out who likely wrote it and maybe you have,"},{"from":3385.49,"to":3389.83,"location":2,"content":"uh, several different authors and you have text written by those different authors."},{"from":3389.83,"to":3391.28,"location":2,"content":"So you could, for example,"},{"from":3391.28,"to":3394.72,"location":2,"content":"train a separate language model on each of the different authors' texts."},{"from":3394.72,"to":3396.16,"location":2,"content":"And then, because, remember,"},{"from":3396.16,"to":3399.8,"location":2,"content":"a language model can tell you the probability of a given piece of text."},{"from":3399.8,"to":3402.43,"location":2,"content":"Then you could ask all the different language models,"},{"from":3402.43,"to":3405.79,"location":2,"content":"um, how likely the texts and the question is,"},{"from":3405.79,"to":3409.72,"location":2,"content":"and then if a certain author's language model says that it's likely then that"},{"from":3409.72,"to":3415,"location":2,"content":"means that text the texts and the question is more likely to be written by that author."},{"from":3415,"to":3417.82,"location":2,"content":"Um, other examples include machine translation."},{"from":3417.82,"to":3419.2,"location":2,"content":"This is a huge, uh,"},{"from":3419.2,"to":3421.39,"location":2,"content":"application of language models,"},{"from":3421.39,"to":3423.57,"location":2,"content":"uh, because it's all about generating 
text."},{"from":3423.57,"to":3425.74,"location":2,"content":"Uh, similarly, summarization is"},{"from":3425.74,"to":3429.28,"location":2,"content":"a task where we need to generate some text given some input text."},{"from":3429.28,"to":3431.18,"location":2,"content":"Uh, dialogue as well,"},{"from":3431.18,"to":3434.98,"location":2,"content":"not all dialogue agents necessarily are RNN language models but you can"},{"from":3434.98,"to":3439.28,"location":2,"content":"build a dialogue agent that generates the text using an RNN language model."},{"from":3439.28,"to":3441.56,"location":2,"content":"And there are more examples as well."},{"from":3441.56,"to":3445.36,"location":2,"content":"Any questions on this? [LAUGHTER] Yep."},{"from":3445.36,"to":3467.88,"location":2,"content":"So, I know that [inaudible]"},{"from":3467.88,"to":3469.95,"location":2,"content":"Great question. So, the question was,"},{"from":3469.95,"to":3471.47,"location":2,"content":"uh, for some of these examples, uh,"},{"from":3471.47,"to":3475.32,"location":2,"content":"such as speech recognition or maybe [NOISE] image captioning,"},{"from":3475.32,"to":3479.29,"location":2,"content":"the input is audio or image or something that is not text, right?"},{"from":3479.29,"to":3481.78,"location":2,"content":"So, you can't represent it in the way that we've talked about so far."},{"from":3481.78,"to":3484.18,"location":2,"content":"Um, so, [NOISE] in those examples,"},{"from":3484.18,"to":3486.46,"location":2,"content":"you will have some way of representing the input,"},{"from":3486.46,"to":3488.72,"location":2,"content":"some way of encoding the audio or the image or whatever."},{"from":3488.72,"to":3493.32,"location":2,"content":"Uh, the reason I brought it up now in terms of language models is that that's the input,"},{"from":3493.32,"to":3495.68,"location":2,"content":"but you use the language model to get the output, right?"},{"from":3495.68,"to":3497.17,"location":2,"content":"So, the language model, [NOISE] uh, generates"},{"from":3497.17,"to":3499.34,"location":2,"content":"the output in the way that we saw earlier, uh,"},{"from":3499.34,"to":3502.12,"location":2,"content":"but we're gonna learn more about those conditional language [NOISE] models later."},{"from":3502.12,"to":3505.09,"location":2,"content":"[NOISE] Anyone else?"},{"from":3505.09,"to":3509.02,"location":2,"content":"[NOISE] Okay."},{"from":3509.02,"to":3512.97,"location":2,"content":"[NOISE] So, uh, here's a recap."},{"from":3512.97,"to":3516.73,"location":2,"content":"If I've lost you somewhere in this lecture, uh, or you got tired,"},{"from":3516.73,"to":3518.77,"location":2,"content":"um, now's a great time to jump back in"},{"from":3518.77,"to":3521.05,"location":2,"content":"because things are gonna get a little bit more accessible."},{"from":3521.05,"to":3523.05,"location":2,"content":"Okay. 
So, here's a recap of what we've done today."},{"from":3523.05,"to":3526.21,"location":2,"content":"Uh, a language model is a system that predicts the next word,"},{"from":3526.21,"to":3528.46,"location":2,"content":"[NOISE] and a recurrent neural network,"},{"from":3528.46,"to":3530.59,"location":2,"content":"is a new family, oh, new to us,"},{"from":3530.59,"to":3533.71,"location":2,"content":"a family of neural networks that takes sequential input"},{"from":3533.71,"to":3537.18,"location":2,"content":"of any length and it applies the same weights on every step,"},{"from":3537.18,"to":3539.62,"location":2,"content":"and it can optionally produce some kind of output on"},{"from":3539.62,"to":3542.02,"location":2,"content":"each step or some of the steps or none of the steps."},{"from":3542.02,"to":3544.95,"location":2,"content":"[NOISE] So, don't be confused."},{"from":3544.95,"to":3548.3,"location":2,"content":"A recurrent neural network is not [NOISE] the same thing as a language model."},{"from":3548.3,"to":3552.97,"location":2,"content":"Uh, we've seen today that an RNN is a great way to build a language model, but actually,"},{"from":3552.97,"to":3555.01,"location":2,"content":"it turns out that you can use RNNs for,"},{"from":3555.01,"to":3557.71,"location":2,"content":"uh, a lot of other different things that are not language modeling."},{"from":3557.71,"to":3559.84,"location":2,"content":"[NOISE] So, here's a few examples of that."},{"from":3559.84,"to":3564.09,"location":2,"content":"[NOISE] Uh, you can use an RNN to do a tagging task."},{"from":3564.09,"to":3566.32,"location":2,"content":"So, some examples of tagging tasks are"},{"from":3566.32,"to":3569.26,"location":2,"content":"part-of-speech tagging and named entity recognition."},{"from":3569.26,"to":3572.59,"location":2,"content":"So, pictured here is part-of-speech tagging, and this is the task."},{"from":3572.59,"to":3575.24,"location":2,"content":"We have some kind of input text such as, uh,"},{"from":3575.24,"to":3577.64,"location":2,"content":"the startled cat knocked over the vase,"},{"from":3577.64,"to":3579.39,"location":2,"content":"and your job is to, uh,"},{"from":3579.39,"to":3582.09,"location":2,"content":"label or tag each word with its part of speech."},{"from":3582.09,"to":3585.16,"location":2,"content":"So, for example, cat is a noun and knocked is a verb."},{"from":3585.16,"to":3588.2,"location":2,"content":"So, you can use an RNN to do this task in,"},{"from":3588.2,"to":3590.35,"location":2,"content":"in the way that we've pictured, which is that you, uh,"},{"from":3590.35,"to":3592.72,"location":2,"content":"feed the text into the RNN, [NOISE] and then,"},{"from":3592.72,"to":3593.91,"location":2,"content":"on each step of the RNN,"},{"from":3593.91,"to":3595.7,"location":2,"content":"you, uh, have an output,"},{"from":3595.7,"to":3597.79,"location":2,"content":"probably a distribution over what, uh,"},{"from":3597.79,"to":3601.78,"location":2,"content":"tag you think it is, and then, uh, you can tag it in that way."},{"from":3601.78,"to":3604.05,"location":2,"content":"And then, also for named entity recognition,"},{"from":3604.05,"to":3605.19,"location":2,"content":"that's all about, um,"},{"from":3605.19,"to":3608.09,"location":2,"content":"tagging each of the words with what named entity type they are."},{"from":3608.09,"to":3611.82,"location":2,"content":"So, you do it in the same way. 
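A minimal sketch of the tagging setup just described (the `step` function and the tag-level output layer `U_tag`, `b_tag` are illustrative assumptions): the only change from the language model is that the per-step output ranges over tags instead of the vocabulary.

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def tag_sequence(step, U_tag, b_tag, h0, word_ids, tag_names):
    """Part-of-speech tagging with an RNN: one output on every step.

    step(h, word_id) -> next hidden state wraps one application of the RNN
    equations; U_tag and b_tag map each hidden state to scores over tags.
    """
    h, tags = h0, []
    for word in word_ids:
        h = step(h, word)                        # same recurrent weights each step
        probs = softmax(U_tag @ h + b_tag)       # distribution over tags, not words
        tags.append(tag_names[int(np.argmax(probs))])
    return tags
```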
[NOISE] Okay."},{"from":3611.82,"to":3613.47,"location":2,"content":"Here's another thing you can use RNNs for,"},{"from":3613.47,"to":3616.2,"location":2,"content":"uh, you can use them for sentence classification."},{"from":3616.2,"to":3619.08,"location":2,"content":"So, sentence classification is just a general term to mean"},{"from":3619.08,"to":3622.17,"location":2,"content":"any kind of task where you want to take sentence or other piece of text,"},{"from":3622.17,"to":3624.95,"location":2,"content":"and then, you want to classify it into one of several classes."},{"from":3624.95,"to":3628.12,"location":2,"content":"So, an example of that is sentiment classification."},{"from":3628.12,"to":3630.4,"location":2,"content":"Uh, sentiment classification is when you have some kind"},{"from":3630.4,"to":3632.68,"location":2,"content":"of input text such as, let's say, overall,"},{"from":3632.68,"to":3634.51,"location":2,"content":"I enjoyed the movie a lot, and then,"},{"from":3634.51,"to":3635.77,"location":2,"content":"you're trying to classify that as being"},{"from":3635.77,"to":3638.09,"location":2,"content":"positive or negative or [NOISE] neutral sentiment."},{"from":3638.09,"to":3640.09,"location":2,"content":"So, in this example, this is positive sentiment."},{"from":3640.09,"to":3645.4,"location":2,"content":"[NOISE] So, one way you might use an RNN to tackle this task is, uh,"},{"from":3645.4,"to":3649.45,"location":2,"content":"you might encode the text using the RNN, and then,"},{"from":3649.45,"to":3653.35,"location":2,"content":"really what you want is some kind of sentence encoding so that you"},{"from":3653.35,"to":3657.26,"location":2,"content":"can output your label for the sentence, right?"},{"from":3657.26,"to":3659.68,"location":2,"content":"And it'll be useful if you would have a single vector to"},{"from":3659.68,"to":3662.97,"location":2,"content":"represent the sentence rather than all of these separate vectors."},{"from":3662.97,"to":3664.87,"location":2,"content":"So, how would you do this?"},{"from":3664.87,"to":3667,"location":2,"content":"How would you get the sentence encoding from the RNN?"},{"from":3667,"to":3670.54,"location":2,"content":"[NOISE] Uh, one thing you could do [NOISE] is,"},{"from":3670.54,"to":3674.29,"location":2,"content":"you could use the final hidden state as your sentence encoding."},{"from":3674.29,"to":3678.46,"location":2,"content":"So, um, the reason why you might think this is a good idea is because,"},{"from":3678.46,"to":3679.81,"location":2,"content":"for example, in the RNN,"},{"from":3679.81,"to":3682.68,"location":2,"content":"we regard the, the final hidden state as,"},{"from":3682.68,"to":3685.74,"location":2,"content":"um, this is the thing you use to predict what's coming next, right?"},{"from":3685.74,"to":3688.3,"location":2,"content":"So, we're assuming that the final hidden state contains"},{"from":3688.3,"to":3691.47,"location":2,"content":"information about all of the text that has come so far, right?"},{"from":3691.47,"to":3694.99,"location":2,"content":"So, for that reason, you might suppose that this is a good sentence encoding,"},{"from":3694.99,"to":3696.46,"location":2,"content":"and we could use that [NOISE] to predict, you know,"},{"from":3696.46,"to":3699.04,"location":2,"content":"what, uh, what sentiment is this sentence."},{"from":3699.04,"to":3701.35,"location":2,"content":"And it turns out that usually, a better way to do this,"},{"from":3701.35,"to":3702.59,"location":2,"content":"usually a more 
effective way,"},{"from":3702.59,"to":3706.24,"location":2,"content":"is to do something like maybe take an element-wise max or"},{"from":3706.24,"to":3710.08,"location":2,"content":"an element-wise mean of all these hidden states to get your sentence encoding,"},{"from":3710.08,"to":3712.34,"location":2,"content":"um, [NOISE] and, uh,"},{"from":3712.34,"to":3714.64,"location":2,"content":"this tends to work better than just using the final hidden state."},{"from":3714.64,"to":3719.31,"location":2,"content":"[NOISE] Uh, there are some other more advanced things you can do as well."},{"from":3719.31,"to":3722.22,"location":2,"content":"Okay. [NOISE] Another thing that you can use RNNs for"},{"from":3722.22,"to":3725.34,"location":2,"content":"is as a general purpose encoder module."},{"from":3725.34,"to":3728.47,"location":2,"content":"Uh, so, here's an example that's question answering,"},{"from":3728.47,"to":3730.48,"location":2,"content":"but really this idea of RNNs as"},{"from":3730.48,"to":3735.09,"location":2,"content":"a general purpose encoder module is very common [NOISE] and use it in lots of different,"},{"from":3735.09,"to":3737.59,"location":2,"content":"um, deep learning [NOISE] architectures for NLP."},{"from":3737.59,"to":3741.18,"location":2,"content":"[NOISE] So, here's an example which is question answering."},{"from":3741.18,"to":3743.41,"location":2,"content":"Uh, so, let's suppose that the, the task is,"},{"from":3743.41,"to":3744.67,"location":2,"content":"you've got some kind of context,"},{"from":3744.67,"to":3746.11,"location":2,"content":"which, in this, uh, situation,"},{"from":3746.11,"to":3749.36,"location":2,"content":"is the Wikipedia article on Beethoven, and then,"},{"from":3749.36,"to":3751.21,"location":2,"content":"you have a question which is asking,"},{"from":3751.21,"to":3753.07,"location":2,"content":"what nationality was Beethoven?"},{"from":3753.07,"to":3756.4,"location":2,"content":"Uh, and this is actually taken from the SQuAD Challenge,"},{"from":3756.4,"to":3758.68,"location":2,"content":"which is the subject of the Default Final Project."},{"from":3758.68,"to":3761.77,"location":2,"content":"So, um, if you choose to do- to do the Default Final Project,"},{"from":3761.77,"to":3764.95,"location":2,"content":"you're going to be building systems that solve this problem."},{"from":3764.95,"to":3769.93,"location":2,"content":"So, what you might do is, you might use an RNN to process the question,"},{"from":3769.93,"to":3771.97,"location":2,"content":"what nationality was [NOISE] Beethoven?"},{"from":3771.97,"to":3776.22,"location":2,"content":"And then, you might use those hidden states that you get from this, uh,"},{"from":3776.22,"to":3780.28,"location":2,"content":"RNN of the question as a representation of the question."},{"from":3780.28,"to":3783.58,"location":2,"content":"And I'm being intentionally vague here [NOISE] about what might happen next, uh,"},{"from":3783.58,"to":3785.2,"location":2,"content":"but the idea is that you have [NOISE]"},{"from":3785.2,"to":3788.5,"location":2,"content":"both the context and the question are going to be fed some way,"},{"from":3788.5,"to":3790.9,"location":2,"content":"and maybe you'll use an RNN on context as well,"},{"from":3790.9,"to":3794.49,"location":2,"content":"and you're going to have lots more neural architecture in order to get your answer,"},{"from":3794.49,"to":3795.89,"location":2,"content":"which is, uh, German."},{"from":3795.89,"to":3801.36,"location":2,"content":"So, the point here is 
that the RNN is acting as an encoder for the question,"},{"from":3801.36,"to":3803.92,"location":2,"content":"that is, the hidden states that you get from running"},{"from":3803.92,"to":3806.65,"location":2,"content":"the RNN over the question, represent the question."},{"from":3806.65,"to":3811.81,"location":2,"content":"[NOISE] Uh, so, the encoder is part of a larger neural system,"},{"from":3811.81,"to":3813.94,"location":2,"content":"[NOISE] and it's the, the hidden states themselves"},{"from":3813.94,"to":3816.3,"location":2,"content":"that you're interested in because they contain the information."},{"from":3816.3,"to":3818.14,"location":2,"content":"So, you could have, um, taken,"},{"from":3818.14,"to":3819.7,"location":2,"content":"uh, element-wise max or mean,"},{"from":3819.7,"to":3821.01,"location":2,"content":"like we showed in the previous slide,"},{"from":3821.01,"to":3824.17,"location":2,"content":"to get a single vector for the question, but often, you don't do that."},{"from":3824.17,"to":3828.16,"location":2,"content":"Often, you'll, uh, do something else which uses the hidden states directly."},{"from":3828.16,"to":3833.44,"location":2,"content":"So, the general point here is that RNNs are quite powerful as a way to represent,"},{"from":3833.44,"to":3834.93,"location":2,"content":"uh, a sequence of text,"},{"from":3834.93,"to":3838.17,"location":2,"content":"uh, for further computation."},{"from":3838.17,"to":3842.93,"location":2,"content":"Okay. Last example. So, going back to RNN language models again, [NOISE] uh,"},{"from":3842.93,"to":3844.57,"location":2,"content":"they can be used to generate text,"},{"from":3844.57,"to":3847.3,"location":2,"content":"and there are lots of different, uh, applications for this."},{"from":3847.3,"to":3851.02,"location":2,"content":"So, for example, speech recognition, uh, you will have your input,"},{"from":3851.02,"to":3853.34,"location":2,"content":"which is the audio, and as a student asked earlier,"},{"from":3853.34,"to":3855.86,"location":2,"content":"this will be, uh, represented in some way,"},{"from":3855.86,"to":3859.48,"location":2,"content":"and then, uh, maybe you'll do a neural encoding of that, [NOISE] and then,"},{"from":3859.48,"to":3862.61,"location":2,"content":"you use your RNN language model to generate the output,"},{"from":3862.61,"to":3864.35,"location":2,"content":"which, in this case, is going to be a transcription"},{"from":3864.35,"to":3866.28,"location":2,"content":"of what the audio recording is saying."},{"from":3866.28,"to":3868.03,"location":2,"content":"So, you will have some way of conditioning,"},{"from":3868.03,"to":3869.83,"location":2,"content":"and we're gonna talk more about how this works, uh,"},{"from":3869.83,"to":3871.78,"location":2,"content":"in a later lecture, but you have some way of"},{"from":3871.78,"to":3875.23,"location":2,"content":"conditioning your RNN language model on the input."},{"from":3875.23,"to":3878.92,"location":2,"content":"So, you'll use that to generate your text, [NOISE] and in this case,"},{"from":3878.92,"to":3881.34,"location":2,"content":"the utterance might be something like, what's the weather,"},{"from":3881.34,"to":3884.59,"location":2,"content":"question mark. [OVERLAPPING] [NOISE]"},{"from":3884.59,"to":3894.22,"location":2,"content":"Yeah. [NOISE]"},{"from":3894.22,"to":3898.12,"location":2,"content":"In speech recognition, [inaudible]."},{"from":3898.12,"to":3900.1,"location":2,"content":"Okay. 
So, the question is, in speech recognition,"},{"from":3900.1,"to":3902.76,"location":2,"content":"we often use word error rates to evaluate,"},{"from":3902.76,"to":3904.69,"location":2,"content":"but would you use perplexity to evaluate?"},{"from":3904.69,"to":3907.69,"location":2,"content":"[NOISE] Um, I don't actually know much about that. Do you know, Chris,"},{"from":3907.69,"to":3909.25,"location":2,"content":"what they use in, uh,"},{"from":3909.25,"to":3915.01,"location":2,"content":"speech recognition as an eval metric? [NOISE]"},{"from":3915.01,"to":3923.59,"location":2,"content":"[inaudible] word error rate [inaudible]."},{"from":3923.59,"to":3925.38,"location":2,"content":"The answer is, you often use WER,"},{"from":3925.38,"to":3927.55,"location":2,"content":"uh, for eval, but you might also use perplexity."},{"from":3927.55,"to":3929.5,"location":2,"content":"Yeah. Any other questions?"},{"from":3929.5,"to":3935.57,"location":2,"content":"[NOISE] Okay. So, um,"},{"from":3935.57,"to":3938.35,"location":2,"content":"this is an example of a conditional language model,"},{"from":3938.35,"to":3939.97,"location":2,"content":"and it's called a conditional language model"},{"from":3939.97,"to":3941.72,"location":2,"content":"because we have the language model component,"},{"from":3941.72,"to":3944.74,"location":2,"content":"but crucially, we're conditioning it on some kind of input."},{"from":3944.74,"to":3948.58,"location":2,"content":"So, unlike the, uh, fun examples like with the Harry Potter text where we were just, uh,"},{"from":3948.58,"to":3951.46,"location":2,"content":"generating text basically unconditionally, you know,"},{"from":3951.46,"to":3952.75,"location":2,"content":"we trained it on the training data, and then,"},{"from":3952.75,"to":3954.82,"location":2,"content":"we just started [NOISE] with some kind of random seed,"},{"from":3954.82,"to":3956.3,"location":2,"content":"and then, it generates unconditionally."},{"from":3956.3,"to":3958.54,"location":2,"content":"This is called a conditional language model"},{"from":3958.54,"to":3961.61,"location":2,"content":"because there's some kind of input that we need to condition on."},{"from":3961.61,"to":3965.98,"location":2,"content":"Uh, machine translation is an example [NOISE] also of a conditional language model,"},{"from":3965.98,"to":3967.78,"location":2,"content":"and we're going to see that in much more detail in"},{"from":3967.78,"to":3969.52,"location":2,"content":"the lecture next week on machine translation."},{"from":3969.52,"to":3972.89,"location":2,"content":"[NOISE] All right. 
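One simple way to realize the conditioning being described, sketched below under assumptions (PyTorch; greedy decoding; the sizes and the choice of token id 0 as a start symbol are made up), is to feed an encoding of the input in as the initial hidden state of the RNN language model. This is just one possible wiring, not the specific architecture of any of the systems mentioned above.

```python
import torch
import torch.nn as nn

hidden_dim, vocab_size, emb_dim = 128, 10_000, 64
embed = nn.Embedding(vocab_size, emb_dim)
cell = nn.RNNCell(emb_dim, hidden_dim)
out = nn.Linear(hidden_dim, vocab_size)

input_encoding = torch.randn(1, hidden_dim)  # stand-in for the encoded audio/image/etc.
h = input_encoding                           # condition the LM on the input
word = torch.tensor([0])                     # assumed <start> token id
generated = []
for _ in range(10):                          # generate 10 words greedily
    h = cell(embed(word), h)                 # one RNN step
    word = out(h).argmax(dim=-1)             # most probable next word
    generated.append(word.item())
```

In an unconditional language model you would instead start from a zero (or learned) initial hidden state, which is exactly the difference the lecture is pointing at.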
Are there any more questions?"},{"from":3972.89,"to":3974.32,"location":2,"content":"You have a bit of extra time, I think."},{"from":3974.32,"to":3977.66,"location":2,"content":"[NOISE] Yeah."},{"from":3977.66,"to":3980.35,"location":2,"content":"I have a question about RNNs in general."},{"from":3980.35,"to":3985.34,"location":2,"content":"[NOISE] Do people ever combine the RNN,"},{"from":3985.34,"to":3987.22,"location":2,"content":"uh, patterns of architecture,"},{"from":3987.22,"to":3989.97,"location":2,"content":"um, with other neural networks?"},{"from":3989.97,"to":3991.89,"location":2,"content":"Say, [NOISE] you have, um, you know,"},{"from":3991.89,"to":3994.28,"location":2,"content":"N previous layers that could be doing anything,"},{"from":3994.28,"to":3995.41,"location":2,"content":"and at the end of your network,"},{"from":3995.41,"to":3996.88,"location":2,"content":"you wanna run them through,"},{"from":3996.88,"to":3999.16,"location":2,"content":"uh, five recurrent layers."},{"from":3999.16,"to":4000.81,"location":2,"content":"Do people mix and match like that,"},{"from":4000.81,"to":4002.19,"location":2,"content":"or these, uh, [inaudible]. [NOISE]"},{"from":4002.19,"to":4006.09,"location":2,"content":"Uh, the question is,"},{"from":4006.09,"to":4008.58,"location":2,"content":"do you ever combine RNN for the other types of architecture?"},{"from":4008.58,"to":4009.87,"location":2,"content":"So, I think the answer is yes."},{"from":4009.87,"to":4011.59,"location":2,"content":"[NOISE] Uh, you might, [NOISE] you know, uh,"},{"from":4011.59,"to":4015.21,"location":2,"content":"have- you might have other types of architectures, uh,"},{"from":4015.21,"to":4018.54,"location":2,"content":"to produce the vectors that are going to be the input to RNN,"},{"from":4018.54,"to":4020.28,"location":2,"content":"or you might use the output of your RNN"},{"from":4020.28,"to":4026.39,"location":2,"content":"[NOISE] and feed that into a different type of neural network."},{"from":4026.39,"to":4028.62,"location":2,"content":"So, yes. 
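As a concrete illustration of that mix-and-match pattern, here is a small assumed sketch (PyTorch; all sizes and the three-class sentiment setup are made up) that also revisits the earlier sentence-classification point: an RNN encodes the sequence, its hidden states are pooled with an element-wise mean, and a separate feed-forward network does the classifying.

```python
import torch
import torch.nn as nn

rnn = nn.RNN(input_size=64, hidden_size=128, batch_first=True)
classifier = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 3))

x = torch.randn(8, 20, 64)           # batch of 8 sequences of 20 input vectors
hiddens, _ = rnn(x)                  # (8, 20, 128): one hidden state per step
sentence_enc = hiddens.mean(dim=1)   # element-wise mean over steps
# hiddens.max(dim=1).values would give the element-wise max alternative
logits = classifier(sentence_enc)    # (8, 3) class logits
```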
[NOISE] Any other questions?"},{"from":4028.62,"to":4031.82,"location":2,"content":"[NOISE] Okay."},{"from":4031.82,"to":4035.51,"location":2,"content":"Uh, so, before we finish, uh, I have a note on terminology."},{"from":4035.51,"to":4037.49,"location":2,"content":"Uh, when you're reading papers,"},{"from":4037.49,"to":4040.91,"location":2,"content":"you might find often this phrase vanilla RNN,"},{"from":4040.91,"to":4043.07,"location":2,"content":"and when you see the phrase vanilla RNN,"},{"from":4043.07,"to":4044.53,"location":2,"content":"that usually means, uh,"},{"from":4044.53,"to":4046.91,"location":2,"content":"the RNNs that are described in this lecture."},{"from":4046.91,"to":4050.46,"location":2,"content":"So, the reason why those are called vanilla RNNs is"},{"from":4050.46,"to":4054.76,"location":2,"content":"because there are actually other more complex kinds of RNN flavors."},{"from":4054.76,"to":4058.01,"location":2,"content":"So, for example, there's GRU and LSTM,"},{"from":4058.01,"to":4060.33,"location":2,"content":"and we're gonna learn about both of those next week."},{"from":4060.33,"to":4062.61,"location":2,"content":"And another thing we're going to learn about next week"},{"from":4062.61,"to":4065.09,"location":2,"content":"[NOISE] is that you can actually get some multi-layer RNNs,"},{"from":4065.09,"to":4068.25,"location":2,"content":"which is when you stack multiple RNNs on top of each other."},{"from":4068.25,"to":4070.93,"location":2,"content":"[NOISE] So, uh, you're gonna learn about those,"},{"from":4070.93,"to":4073.88,"location":2,"content":"but we hope that by the time you reach the end of this course,"},{"from":4073.88,"to":4076.91,"location":2,"content":"you're going to be able to read a research paper and see a phrase like"},{"from":4076.91,"to":4081.15,"location":2,"content":"stacked bidirectional LSTM with residual connections and self-attention,"},{"from":4081.15,"to":4082.68,"location":2,"content":"and you'll know exactly what that is."},{"from":4082.68,"to":4084.84,"location":2,"content":"[NOISE] That's just an RNN with all of the toppings."},{"from":4084.84,"to":4087.84,"location":2,"content":"[LAUGHTER] All right. Thank you. That's it for today."},{"from":4087.84,"to":4095.91,"location":2,"content":"[NOISE] Uh, next time- [APPLAUSE] next time,"},{"from":4095.91,"to":4098.34,"location":2,"content":"we're learning about problems [NOISE] and fancy RNNs."},{"from":4098.34,"to":4104.77,"location":2,"content":"[NOISE]"}]} \ No newline at end of file diff --git a/bcc-en/7.bcc b/bcc-en/7.bcc new file mode 100644 index 0000000000000000000000000000000000000000..e840e05b9009463e524e33c8e27929a2f6239fb3 --- /dev/null +++ b/bcc-en/7.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":4.7,"to":7.14,"location":2,"content":"Hi, everyone. I'm Abby."},{"from":7.14,"to":8.36,"location":2,"content":"If you weren't here last week,"},{"from":8.36,"to":10.19,"location":2,"content":"I'm the head TA of this course."},{"from":10.19,"to":13.32,"location":2,"content":"And this is the second [NOISE] of three lectures that I'm"},{"from":13.32,"to":17.36,"location":2,"content":"going to be giving on RNNs and related topics."},{"from":17.36,"to":20.43,"location":2,"content":"Okay. 
So, welcome to week four."},{"from":20.43,"to":22.96,"location":2,"content":"Today, we're going to be learning about vanishing gradients,"},{"from":22.96,"to":25.12,"location":2,"content":"and some more complex types of RNNs."},{"from":25.12,"to":26.82,"location":2,"content":"So, before we get started,"},{"from":26.82,"to":28,"location":2,"content":"I've got a few announcements."},{"from":28,"to":33.15,"location":2,"content":"Uh, the first announcement is that assignment four is released today, uh,"},{"from":33.15,"to":35.84,"location":2,"content":"it's due Thursday of next week, not Tuesday,"},{"from":35.84,"to":39.8,"location":2,"content":"so that means you have two days more to do it than you did for all the other homeworks."},{"from":39.8,"to":41.41,"location":2,"content":"And the reason for that is assignment four is"},{"from":41.41,"to":43.36,"location":2,"content":"probably more work than the other homework so far,"},{"from":43.36,"to":45.41,"location":2,"content":"so don't be surprised by that."},{"from":45.41,"to":48.58,"location":2,"content":"Uh, assignment four is all about Neural Machine Translation."},{"from":48.58,"to":52.13,"location":2,"content":"Uh, we're gonna learn about NMT on Thursday's lecture this week."},{"from":52.13,"to":54.27,"location":2,"content":"And, uh, this is really exciting,"},{"from":54.27,"to":57.6,"location":2,"content":"because actually CS 224 has never had an NMT assignment before,"},{"from":57.6,"to":58.74,"location":2,"content":"so this is all new this year,"},{"from":58.74,"to":62.42,"location":2,"content":"and you're gonna be the first year students who are going to be doing an NMT assignment."},{"from":62.42,"to":64.91,"location":2,"content":"Uh, something else that's different about"},{"from":64.91,"to":67.61,"location":2,"content":"assignment four is that you're going to be using Azure, which is, uh,"},{"from":67.61,"to":69.17,"location":2,"content":"a cloud computing service,"},{"from":69.17,"to":73.14,"location":2,"content":"in order to train your NMT systems on a virtual machine with a GPU."},{"from":73.14,"to":77.36,"location":2,"content":"And, uh, this is necessary in order to be able to do it in a reasonable amount of time."},{"from":77.36,"to":79.25,"location":2,"content":"So, I have a warning which is,"},{"from":79.25,"to":80.93,"location":2,"content":"if you're a person who perhaps doesn't have, ah,"},{"from":80.93,"to":84.14,"location":2,"content":"learnt- a lot of experience working on remote machines,"},{"from":84.14,"to":86.36,"location":2,"content":"so for example if you're not very familiar with SSH,"},{"from":86.36,"to":88.88,"location":2,"content":"or tmux, or remote text editing,"},{"from":88.88,"to":91.58,"location":2,"content":"then I advise you to budget some extra time for assignment four,"},{"from":91.58,"to":94.94,"location":2,"content":"because that's probably gonna take you a little while to set up and get used to."},{"from":94.94,"to":97.28,"location":2,"content":"So, again, I'm going to emphasize,"},{"from":97.28,"to":100.25,"location":2,"content":"do get started early on assignment four, because, uh,"},{"from":100.25,"to":103.78,"location":2,"content":"the NMT system takes about four hours to train on your virtual machine,"},{"from":103.78,"to":107.23,"location":2,"content":"so you really can't start it the night before and expect to get it in on time."},{"from":107.23,"to":111.25,"location":2,"content":"Uh, and assignment four is really quite a lot more complicated than assignment 
three."},{"from":111.25,"to":115.25,"location":2,"content":"So, uh, don't get into a false sense of security if you found assignment three easy."},{"from":115.25,"to":120.64,"location":2,"content":"Um, so Thursday's slides on NMT are ready on the website today,"},{"from":120.64,"to":122.61,"location":2,"content":"so you can even start looking at it today if you"},{"from":122.61,"to":125.91,"location":2,"content":"want- if you wanna get started on assignment four early."},{"from":125.91,"to":128.31,"location":2,"content":"Uh, so, I have a few more announcements, uh,"},{"from":128.31,"to":130.08,"location":2,"content":"on the subject of projects, uh,"},{"from":130.08,"to":131.86,"location":2,"content":"next week's lectures are going to be all about projects."},{"from":131.86,"to":134.63,"location":2,"content":"So, you're going to hear about, uh, question answering,"},{"from":134.63,"to":136.44,"location":2,"content":"and the default final projects,"},{"from":136.44,"to":138.62,"location":2,"content":"and then you're also gonna get some tips about how you might,"},{"from":138.62,"to":141.21,"location":2,"content":"uh, choose and define your own custom projects."},{"from":141.21,"to":144.18,"location":2,"content":"So, it's fine if you're not thinking about a project this week, that's okay."},{"from":144.18,"to":147.13,"location":2,"content":"You can delay until next week to start thinking about it for the first time."},{"from":147.13,"to":149.63,"location":2,"content":"But if you are a person who is already thinking about your projects,"},{"from":149.63,"to":152.2,"location":2,"content":"for example, if you're trying to choose your custom projects, uh,"},{"from":152.2,"to":154.29,"location":2,"content":"then you should check out the website's project page,"},{"from":154.29,"to":156.23,"location":2,"content":"because it has quite a lot of information about, uh,"},{"from":156.23,"to":159.17,"location":2,"content":"how to choose your projects, and also some inspiration."},{"from":159.17,"to":161.65,"location":2,"content":"And that includes- we've collected some, uh,"},{"from":161.65,"to":164.65,"location":2,"content":"project ideas from various members of the Stanford AI Lab."},{"from":164.65,"to":168.16,"location":2,"content":"So, these are faculty and PhD students and postdocs,"},{"from":168.16,"to":169.76,"location":2,"content":"who have ideas for, uh,"},{"from":169.76,"to":171.56,"location":2,"content":"NLP deep learning projects that they would like"},{"from":171.56,"to":174.15,"location":2,"content":"CS224n students such as yourself to work on."},{"from":174.15,"to":177.09,"location":2,"content":"So, especially, if you're looking to maybe get into research later,"},{"from":177.09,"to":179.18,"location":2,"content":"this is a really great opportunity, uh,"},{"from":179.18,"to":180.91,"location":2,"content":"to work with someone in the Stanford AI Lab,"},{"from":180.91,"to":183.7,"location":2,"content":"and maybe get some mentorship as well."},{"from":183.7,"to":186.84,"location":2,"content":"Okay. 
So here's an overview."},{"from":186.84,"to":190.08,"location":2,"content":"Uh, last week, we learned about Recurrent Neural Networks,"},{"from":190.08,"to":192.72,"location":2,"content":"um, we learned about why they're really great for Language Modeling."},{"from":192.72,"to":195.34,"location":2,"content":"And today, we're gonna learn about some problems with RNNs,"},{"from":195.34,"to":196.88,"location":2,"content":"and we're gonna learn about how to fix them."},{"from":196.88,"to":202.03,"location":2,"content":"And this is gonna motiva- motivate us to learn about some more complex RNN variants."},{"from":202.03,"to":204.3,"location":2,"content":"And then, uh, next lecture on Thursday,"},{"from":204.3,"to":207.57,"location":2,"content":"we're going to, uh, have some more application-based, uh, contents,"},{"from":207.57,"to":209.75,"location":2,"content":"so we are going to be learning about Neural Machine Translation,"},{"from":209.75,"to":211.7,"location":2,"content":"which is a really important task in, uh,"},{"from":211.7,"to":213.83,"location":2,"content":"NLP and deep learning, and in particular,"},{"from":213.83,"to":217.44,"location":2,"content":"we're gonna learn about this architecture called sequence-to-sequence with attention."},{"from":217.44,"to":220.08,"location":2,"content":"But in more detail,"},{"from":220.08,"to":221.64,"location":2,"content":"today's lecture, uh, first,"},{"from":221.64,"to":223.65,"location":2,"content":"we are going to learn about the vanishing gradient problem."},{"from":223.65,"to":226.22,"location":2,"content":"And this is gonna motivate us to learn about two new types of"},{"from":226.22,"to":228.89,"location":2,"content":"RNN called Long Short-Term Memory,"},{"from":228.89,"to":230.85,"location":2,"content":"and Gated Recurrent Unit."},{"from":230.85,"to":232.94,"location":2,"content":"We're also going to learn about some other kind of"},{"from":232.94,"to":235.64,"location":2,"content":"miscellaneous fixes for the vanishing gradient problem,"},{"from":235.64,"to":237.19,"location":2,"content":"or the exploding gradient problem."},{"from":237.19,"to":238.37,"location":2,"content":"Uh, so in particular,"},{"from":238.37,"to":239.79,"location":2,"content":"we're going to learn about gradient clipping,"},{"from":239.79,"to":242.19,"location":2,"content":"which is, uh, fairly simple, but quite important."},{"from":242.19,"to":244.71,"location":2,"content":"Uh, we're also going to learn about skip connections,"},{"from":244.71,"to":247.08,"location":2,"content":"which is a fairly new neural architecture,"},{"from":247.08,"to":248.15,"location":2,"content":"which tries to, uh,"},{"from":248.15,"to":249.35,"location":2,"content":"fix the vanishing gradient problem."},{"from":249.35,"to":251.64,"location":2,"content":"[NOISE] And then, at the end of the lecture,"},{"from":251.64,"to":254.21,"location":2,"content":"we're gonna learn about some more fancy RNN variants such as, uh,"},{"from":254.21,"to":257.78,"location":2,"content":"bidirectional RN- RNNs, those are the ones which go not just left to right,"},{"from":257.78,"to":258.98,"location":2,"content":"but also right to left,"},{"from":258.98,"to":261.74,"location":2,"content":"and we're going to learn about multi-layer RNNs."},{"from":261.74,"to":264.88,"location":2,"content":"And that's when you stack multiple RNNs on top of each other."},{"from":264.88,"to":267.56,"location":2,"content":"So, there's a lot of important definitions today."},{"from":267.56,"to":269.87,"location":2,"content":"Um, 
so, you're gonna find that the information in"},{"from":269.87,"to":271.16,"location":2,"content":"this lecture is pretty important for"},{"from":271.16,"to":275.45,"location":2,"content":"assignment four and probably for your project as well."},{"from":275.45,"to":280.25,"location":2,"content":"Okay. So, let's get started thinking about the vanishing gradients."},{"from":280.25,"to":282.35,"location":2,"content":"Uh, so here we have an RNN,"},{"from":282.35,"to":284.39,"location":2,"content":"with, let say, ah, four steps,"},{"from":284.39,"to":286.95,"location":2,"content":"and suppose that we have some kind of loss that's, uh,"},{"from":286.95,"to":290.92,"location":2,"content":"J4, and that's computed based on the four hidden states."},{"from":290.92,"to":296.3,"location":2,"content":"So, let's suppose we're interested in asking what is the derivative of this loss J4,"},{"from":296.3,"to":298.34,"location":2,"content":"with respect to the hidden states,"},{"from":298.34,"to":300.87,"location":2,"content":"uh, h1, the first hidden state?"},{"from":300.87,"to":303.39,"location":2,"content":"So, I'm representing that with this, uh,"},{"from":303.39,"to":305.72,"location":2,"content":"blue arrow notation to kind of represent how we have"},{"from":305.72,"to":309.07,"location":2,"content":"to make the gradients flow backwards in order to complete this."},{"from":309.07,"to":311.75,"location":2,"content":"So, if we're interested in what this gradient is,"},{"from":311.75,"to":313.85,"location":2,"content":"we can apply the chain rule and say, \"Well,"},{"from":313.85,"to":315.04,"location":2,"content":"it's the product of the, uh,"},{"from":315.04,"to":317.56,"location":2,"content":"gradient of the loss with respect to h2,"},{"from":317.56,"to":320.98,"location":2,"content":"and then gradient of h2, with respect to h1.\""},{"from":320.98,"to":323.67,"location":2,"content":"And then, similarly, we can decompose that"},{"from":323.67,"to":327.25,"location":2,"content":"again using the chain rule, and we can do it again."},{"from":327.25,"to":331.79,"location":2,"content":"So, what we've done here is we've decomposed the gradient that we were interested in,"},{"from":331.79,"to":335.38,"location":2,"content":"into the products of these various intermediate gradients."},{"from":335.38,"to":339.29,"location":2,"content":"And in particular, we're seeing all these ht by ht minus 1,"},{"from":339.29,"to":341.98,"location":2,"content":"uh, adjacent gradients of the hidden states."},{"from":341.98,"to":343.91,"location":2,"content":"So, the thing I want to ask you is,"},{"from":343.91,"to":347.13,"location":2,"content":"what happens if these gradients are small?"},{"from":347.13,"to":349.02,"location":2,"content":"Given that there's a lot of them,"},{"from":349.02,"to":352.13,"location":2,"content":"uh, what happens if they're small in magnitude?"},{"from":352.13,"to":356.87,"location":2,"content":"So, the overall problem of the vanishing gradient problem,"},{"from":356.87,"to":359.34,"location":2,"content":"is that when these gradients are small,"},{"from":359.34,"to":362.52,"location":2,"content":"then our overall gradient is gonna get smaller and smaller,"},{"from":362.52,"to":364.38,"location":2,"content":"as it back propagates further."},{"from":364.38,"to":369.15,"location":2,"content":"Because the accumulated gradient is the product of all of these intermediate gradients."},{"from":369.15,"to":371.27,"location":2,"content":"And when you multiply something by something 
small,"},{"from":371.27,"to":372.97,"location":2,"content":"then the whole thing gets smaller."},{"from":372.97,"to":375.22,"location":2,"content":"So, that's what I'm representing here with these, uh,"},{"from":375.22,"to":378.29,"location":2,"content":"smaller and smaller blue arrows going backwards."},{"from":378.29,"to":381.56,"location":2,"content":"So, that's the general idea of the vanishing gradient problem."},{"from":381.56,"to":383.74,"location":2,"content":"Here's a slightly more formal definition."},{"from":383.74,"to":385.9,"location":2,"content":"So, if you remember from last time,"},{"from":385.9,"to":388.1,"location":2,"content":"uh, if we have a null RNN,"},{"from":388.1,"to":390.49,"location":2,"content":"then the hidden state ht is"},{"from":390.49,"to":393.49,"location":2,"content":"computed as a function of the previous hidden state ht minus 1,"},{"from":393.49,"to":394.81,"location":2,"content":"and the current input xt."},{"from":394.81,"to":397.88,"location":2,"content":"Uh, so you might remember in the previous lecture we"},{"from":397.88,"to":401.07,"location":2,"content":"said that xt were one-hot vectors representing words,"},{"from":401.07,"to":402.62,"location":2,"content":"and then ET is the embedding."},{"from":402.62,"to":404.06,"location":2,"content":"Uh, this lecture we're going to be,"},{"from":404.06,"to":405.11,"location":2,"content":"uh, getting rid of that detail,"},{"from":405.11,"to":407.06,"location":2,"content":"and we're just gonna be thinking very abstractly about"},{"from":407.06,"to":409.45,"location":2,"content":"an RNN that has some kind of input xt,"},{"from":409.45,"to":411.21,"location":2,"content":"and xt is just any kind of vector."},{"from":411.21,"to":412.26,"location":2,"content":"Probably a dense vector,"},{"from":412.26,"to":413.93,"location":2,"content":"but you know, it could be words or not."},{"from":413.93,"to":415.5,"location":2,"content":"It could be one-hot or dense."},{"from":415.5,"to":417.81,"location":2,"content":"Uh, but that's just the input."},{"from":417.81,"to":420.21,"location":2,"content":"So, that's the, uh,"},{"from":420.21,"to":423.37,"location":2,"content":"the definition that we learned last time for Vanilla RNNs."},{"from":423.37,"to":425.93,"location":2,"content":"So, this means that the derivative of ht,"},{"from":425.93,"to":428.48,"location":2,"content":"hidden state on step t with respect to the previous hidden state,"},{"from":428.48,"to":430.07,"location":2,"content":"uh, is this expression here."},{"from":430.07,"to":433.88,"location":2,"content":"Uh, so this is just an application of the chain rule, and, uh,"},{"from":433.88,"to":436.01,"location":2,"content":"if you looked long enough or refer back to"},{"from":436.01,"to":438.59,"location":2,"content":"the backprop lecture you'll see, uh, that that make sense."},{"from":438.59,"to":440.58,"location":2,"content":"So, in particular, we're, um,"},{"from":440.58,"to":443.21,"location":2,"content":"multiplying by Wh at the end, uh,"},{"from":443.21,"to":447.4,"location":2,"content":"because we have the multiplication of Wh and ht minus 1 on the inside."},{"from":447.4,"to":450.59,"location":2,"content":"Okay. 
So, if you remember, on the previous slide,"},{"from":450.59,"to":453.65,"location":2,"content":"we were thinking about what's the gradient of the loss on some step,"},{"from":453.65,"to":454.91,"location":2,"content":"step i I'd say,"},{"from":454.91,"to":456.98,"location":2,"content":"with respect to a hidden state hj,"},{"from":456.98,"to":458.8,"location":2,"content":"on some previous step j."},{"from":458.8,"to":461.95,"location":2,"content":"And maybe J is quite a few steps before i."},{"from":461.95,"to":464.24,"location":2,"content":"So, we can now write this,"},{"from":464.24,"to":466.13,"location":2,"content":"uh, in the following way."},{"from":466.13,"to":468.45,"location":2,"content":"So just by applying the chain rule,"},{"from":468.45,"to":471.37,"location":2,"content":"now on the first line we're saying that this derivative that we're interested in"},{"from":471.37,"to":474.92,"location":2,"content":"can be decomposed into the derivative with respect to step i,"},{"from":474.92,"to":476.27,"location":2,"content":"which is kind of the last step,"},{"from":476.27,"to":480.26,"location":2,"content":"and then do all of those intermediate gradients of the adjacent hidden states as well."},{"from":480.26,"to":484.1,"location":2,"content":"So, that- that first slide is just exactly the same thing as we were looking at on the,"},{"from":484.1,"to":488.3,"location":2,"content":"uh, the picture, uh, the diagram on the previous slide."},{"from":488.3,"to":491.44,"location":2,"content":"Okay. And then, given that we figured out what is, uh,"},{"from":491.44,"to":494.03,"location":2,"content":"dht by dht minus one,"},{"from":494.03,"to":495.05,"location":2,"content":"ah, further on the slide,"},{"from":495.05,"to":496.77,"location":2,"content":"then we can just substitute that in."},{"from":496.77,"to":499.37,"location":2,"content":"So, what we're finding is that this overall gradient that we're"},{"from":499.37,"to":501.8,"location":2,"content":"interested in, in particular,"},{"from":501.8,"to":503.44,"location":2,"content":"has this term, uh,"},{"from":503.44,"to":506.79,"location":2,"content":"Wh, the weight matrix, and it's, uh,"},{"from":506.79,"to":509.02,"location":2,"content":"multiplied by itself, i minus j times,"},{"from":509.02,"to":512.53,"location":2,"content":"because there's i minus j many steps between, uh,"},{"from":512.53,"to":513.83,"location":2,"content":"step j and step i,"},{"from":513.83,"to":517.52,"location":2,"content":"which is the- the distance that we're traveling with this gradient."},{"from":517.52,"to":519.75,"location":2,"content":"So, the big problem here is,"},{"from":519.75,"to":522.66,"location":2,"content":"if this weight matrix Wh is small,"},{"from":522.66,"to":525.45,"location":2,"content":"then this term is gonna get vanishingly small,"},{"from":525.45,"to":529.96,"location":2,"content":"exponentially small, as i and j get further apart."},{"from":529.96,"to":533.74,"location":2,"content":"So, to give this a little more detail, uh,"},{"from":533.74,"to":535.15,"location":2,"content":"we can think about the, uh,"},{"from":535.15,"to":538.36,"location":2,"content":"L2 matrix norms of all of these matrices, right?"},{"from":538.36,"to":543.84,"location":2,"content":"And, uh, as a- as a- uh, as a- sorry."},{"from":543.84,"to":546.4,"location":2,"content":"I'm- it's a known fact of,"},{"from":546.4,"to":548.34,"location":2,"content":"uh, L2 norms that you have this, um,"},{"from":548.34,"to":550.61,"location":2,"content":"inequality that's the, 
uh,"},{"from":550.61,"to":552.45,"location":2,"content":"norm of the products of"},{"from":552.45,"to":555.74,"location":2,"content":"some matrices is less and equal to the product of the norms of the matrices."},{"from":555.74,"to":559.71,"location":2,"content":"So, in particular, we're seeing that the norm of this gradient that we're interested in,"},{"from":559.71,"to":561.9,"location":2,"content":"is less than or equal to, uh,"},{"from":561.9,"to":566.69,"location":2,"content":"the product i minus j many times of the norm of the weight matrix Wh."},{"from":566.69,"to":570.41,"location":2,"content":"So, this is what we mean when we say we're concerned about Wh being small,"},{"from":570.41,"to":574.18,"location":2,"content":"because if it's small, then the thing on the left has to be exponentially small."},{"from":574.18,"to":576,"location":2,"content":"So in particular in this,"},{"from":576,"to":577.07,"location":2,"content":"uh, paper that, uh,"},{"from":577.07,"to":580.08,"location":2,"content":"you can take a look at the bottom if you're interested, um, uh,"},{"from":580.08,"to":581.96,"location":2,"content":"Pascanu et al showed that if"},{"from":581.96,"to":586.26,"location":2,"content":"the largest eigenvalue of the weight matrix Wh is less than one,"},{"from":586.26,"to":589.91,"location":2,"content":"then this gradient on the left is going to shrink exponentially."},{"from":589.91,"to":592.38,"location":2,"content":"And you can probably see intuitively why this is true."},{"from":592.38,"to":593.96,"location":2,"content":"So if, you know, as a simplifying assumption,"},{"from":593.96,"to":596.09,"location":2,"content":"we suppose that Wh was not a matrix,"},{"from":596.09,"to":598.76,"location":2,"content":"but simply a scalar that was just a single number,"},{"from":598.76,"to":601.76,"location":2,"content":"then you can see why if that number was greater than one,"},{"from":601.76,"to":603.35,"location":2,"content":"then the whole thing is gonna explode."},{"from":603.35,"to":604.96,"location":2,"content":"And if that number is less than one,"},{"from":604.96,"to":606.3,"location":2,"content":"then it is going to shrink"},{"from":606.3,"to":609.07,"location":2,"content":"exponentially as you multiply by the same number again and again."},{"from":609.07,"to":612.71,"location":2,"content":"Uh, so you can check out the paper for more details,"},{"from":612.71,"to":614.96,"location":2,"content":"but here, uh, the bound is one,"},{"from":614.96,"to":617.73,"location":2,"content":"partially because we have the sigmoid nonlinearity."},{"from":617.73,"to":621.14,"location":2,"content":"And that's, uh, based on the bounds of what we know as the,"},{"from":621.14,"to":624.49,"location":2,"content":"uh, norm of the sigmoid function to be."},{"from":624.49,"to":629.4,"location":2,"content":"So, uh, this shows you why if the, uh,"},{"from":629.4,"to":631.02,"location":2,"content":"Wh matrix is small,"},{"from":631.02,"to":632.6,"location":2,"content":"or if its largest eigenvalue was small,"},{"from":632.6,"to":634.28,"location":2,"content":"then we're going to have vanishing gradients."},{"from":634.28,"to":635.93,"location":2,"content":"And similarly, if you check out the paper,"},{"from":635.93,"to":638.14,"location":2,"content":"you can see that there's a similar proof, uh,"},{"from":638.14,"to":640.98,"location":2,"content":"relating if the largest eigenvalue is greater than one,"},{"from":640.98,"to":643,"location":2,"content":"to having exploding 
gradients."},{"from":643,"to":645.14,"location":2,"content":"So that's when the gradients get bigger and bigger,"},{"from":645.14,"to":648.27,"location":2,"content":"as you backprop further."},{"from":648.27,"to":652.08,"location":2,"content":"Okay. So hopefully I've convinced you that"},{"from":652.08,"to":655.24,"location":2,"content":"vanishing gradients is a phenomenon that happens in our norms."},{"from":655.24,"to":657.48,"location":2,"content":"But I haven't yet said why this is a problem."},{"from":657.48,"to":660.34,"location":2,"content":"So, why should we view this as a bad thing,"},{"from":660.34,"to":662.51,"location":2,"content":"if the gradients are getting larger and larger,"},{"from":662.51,"to":664.57,"location":2,"content":"or smaller and smaller as you backprop?"},{"from":664.57,"to":668.79,"location":2,"content":"So here's, uh, here's a picture that might illustrate why it's a bad thing."},{"from":668.79,"to":670.5,"location":2,"content":"So, uh, as before,"},{"from":670.5,"to":671.79,"location":2,"content":"suppose that we're thinking about,"},{"from":671.79,"to":674.13,"location":2,"content":"what's the derivative of the loss on"},{"from":674.13,"to":676.74,"location":2,"content":"the fourth step with respect to the first hidden state?"},{"from":676.74,"to":678.27,"location":2,"content":"And we have this situation where"},{"from":678.27,"to":681.56,"location":2,"content":"the gradient is getting smaller and smaller as it goes backwards."},{"from":681.56,"to":684.82,"location":2,"content":"But then, think about what is the gradient of let's say"},{"from":684.82,"to":688.27,"location":2,"content":"the loss in the second step also with respect to the first hidden state."},{"from":688.27,"to":690.75,"location":2,"content":"So I'm representing that with the orange arrows."},{"from":690.75,"to":692.63,"location":2,"content":"And what my point is here,"},{"from":692.63,"to":697.76,"location":2,"content":"is that the magnitude of the gradient signal from close by,"},{"from":697.76,"to":702.17,"location":2,"content":"is a lot bigger than the magnitude of the gradient signal from far away."},{"from":702.17,"to":705.79,"location":2,"content":"And this means that when you update your model weights,"},{"from":705.79,"to":708.52,"location":2,"content":"the signal that you're getting from close by is gonna"},{"from":708.52,"to":710.77,"location":2,"content":"be so much bigger than the signal from far away,"},{"from":710.77,"to":712.59,"location":2,"content":"that essentially you're only going to learn,"},{"from":712.59,"to":714.61,"location":2,"content":"you're only going to optimize with respect to"},{"from":714.61,"to":717.53,"location":2,"content":"these nearby effects and not the long-term effects."},{"from":717.53,"to":722.2,"location":2,"content":"So you're gonna, you're gonna lose the long-term effects, er, inside the,"},{"from":722.2,"to":727.38,"location":2,"content":"the nearby effects. 
Any questions about this, yeah?"},{"from":727.38,"to":731.49,"location":2,"content":"So, uh, where they say there that you do actual updates."},{"from":731.49,"to":735.24,"location":2,"content":"You know, there are actually some that are multiple chains, not just one chain."},{"from":735.24,"to":737.82,"location":2,"content":"So the nearer term should cover it."},{"from":737.82,"to":739.33,"location":2,"content":"Sorry, what's the last part?"},{"from":739.33,"to":742.4,"location":2,"content":"The nearer term should have a larger effect considering you're"},{"from":742.4,"to":745.87,"location":2,"content":"updating the sum of the weights over different chains."},{"from":745.87,"to":749.03,"location":2,"content":"Okay. So I think, ah, the observation was that,"},{"from":749.03,"to":750.25,"location":2,"content":"given that, for example,"},{"from":750.25,"to":752.8,"location":2,"content":"in Language Modeling you might be summing over multiple losses."},{"from":752.8,"to":755.86,"location":2,"content":"There is a loss in every step and you sum all of them and that's your overall loss."},{"from":755.86,"to":760.6,"location":2,"content":"Then you do want to update more with respect to the nearby losses than the far losses."},{"from":760.6,"to":761.95,"location":2,"content":"So I think, uh, yeah,"},{"from":761.95,"to":763.75,"location":2,"content":"so if the design of your objective function"},{"from":763.75,"to":765.46,"location":2,"content":"is that it's the sum of the loss in every step,"},{"from":765.46,"to":766.76,"location":2,"content":"then you do want to, uh,"},{"from":766.76,"to":768.33,"location":2,"content":"weight all of them equally."},{"from":768.33,"to":770.41,"location":2,"content":"I think, uh, my point was more about,"},{"from":770.41,"to":772.67,"location":2,"content":"what is the influence of, uh,"},{"from":772.67,"to":775.28,"location":2,"content":"the action of the weight matrix at this early stage."},{"from":775.28,"to":777.6,"location":2,"content":"What is its influence on a loss that's nearby?"},{"from":777.6,"to":780.25,"location":2,"content":"And what is its influence on a loss that's far away?"},{"from":780.25,"to":782.88,"location":2,"content":"Um, and due to, uh,"},{"from":782.88,"to":785.29,"location":2,"content":"the dynamics of how the vanishing gradient, uh,"},{"from":785.29,"to":787.42,"location":2,"content":"problem works, then, uh,"},{"from":787.42,"to":789.25,"location":2,"content":"the influence on the loss that's far away"},{"from":789.25,"to":791.18,"location":2,"content":"is gonna be much less than the influence nearby."},{"from":791.18,"to":795.04,"location":2,"content":"And I'm gonna give some more linguistics examples later of why you might want to learn,"},{"from":795.04,"to":797.07,"location":2,"content":"uh, the connections that are farther away."},{"from":797.07,"to":798.19,"location":2,"content":"So essentially the problem is,"},{"from":798.19,"to":800.26,"location":2,"content":"in situations where you do want to learn the connection"},{"from":800.26,"to":803.13,"location":2,"content":"between something that happens early and something that happens later,"},{"from":803.13,"to":805.09,"location":2,"content":"then you're going to be unable to learn that connection."},{"from":805.09,"to":808.33,"location":2,"content":"Uh, so we'll see some motivating examples in a minute."},{"from":808.33,"to":816.13,"location":2,"content":"Any other questions on this? 
Yeah?"},{"from":816.13,"to":819.97,"location":2,"content":"Um, I'm getting confused like, why are you talking about like dh, dj dh."},{"from":819.97,"to":826.51,"location":2,"content":"Uh, it's like H parameter, like, are we going-"},{"from":826.51,"to":826.84,"location":2,"content":"Yeah."},{"from":826.84,"to":827.02,"location":2,"content":"from-"},{"from":827.02,"to":828.68,"location":2,"content":"Okay. That's a great question."},{"from":828.68,"to":832.51,"location":2,"content":"So you're asking why are we interested in some kind of dj by"},{"from":832.51,"to":836.93,"location":2,"content":"dh given that we're not updating H. H is an activation not a weight."},{"from":836.93,"to":840.52,"location":2,"content":"Um, so the reason why we're thinking about that,"},{"from":840.52,"to":844.33,"location":2,"content":"is because when you think about what is dj by dw,"},{"from":844.33,"to":845.8,"location":2,"content":"which is a thing that we're going to update."},{"from":845.8,"to":850.3,"location":2,"content":"That's always gonna be in terms of dj by dh at some point, right?"},{"from":850.3,"to":852.04,"location":2,"content":"So if we're thinking about W, you know,"},{"from":852.04,"to":853.62,"location":2,"content":"and how it acts on, uh,"},{"from":853.62,"to":855.55,"location":2,"content":"the transmission from h_1 to h_2,"},{"from":855.55,"to":861.36,"location":2,"content":"then dj4 by W in that position is going to have to go through dj4 by dh_2."},{"from":861.36,"to":863.62,"location":2,"content":"So if we're getting vanishing gradients,"},{"from":863.62,"to":865.79,"location":2,"content":"uh, as we back propagate further,"},{"from":865.79,"to":867.19,"location":2,"content":"then it's kind of like a bottleneck."},{"from":867.19,"to":869.74,"location":2,"content":"Then you're certainly going to have vanishing gradients as they affect, uh,"},{"from":869.74,"to":871.54,"location":2,"content":"the recurrence matrix there,"},{"from":871.54,"to":878.88,"location":2,"content":"and indeed the matrix that's applied to the inputs."},{"from":878.88,"to":882.15,"location":2,"content":"Okay. 
I'm gonna move off now."},{"from":882.15,"to":886.3,"location":2,"content":"Uh, so another way to explain why vanishing gradients is a problem,"},{"from":886.3,"to":889.43,"location":2,"content":"is you can think of it as, uh, a gradient."},{"from":889.43,"to":893.18,"location":2,"content":"You can think of it as a measure of the effect of the past on the future."},{"from":893.18,"to":894.76,"location":2,"content":"So we've already talked about this little bit."},{"from":894.76,"to":897.46,"location":2,"content":"Uh, gradient is like saying, if I change, uh,"},{"from":897.46,"to":899.58,"location":2,"content":"this weight or this activation a little bit,"},{"from":899.58,"to":903.01,"location":2,"content":"then how much and how does it affect this thing in the future."},{"from":903.01,"to":908.65,"location":2,"content":"So in particular, if our gradient is becoming vanishingly small over longer distances,"},{"from":908.65,"to":912.13,"location":2,"content":"let say from step T, step T to step T plus N,"},{"from":912.13,"to":915.79,"location":2,"content":"then we can't tell whether in one of two situations."},{"from":915.79,"to":918.97,"location":2,"content":"So the first situation is maybe there's no dependency between"},{"from":918.97,"to":922.16,"location":2,"content":"step T and step T plus N in the data."},{"from":922.16,"to":924.72,"location":2,"content":"So perhaps we're learning on a task where,"},{"from":924.72,"to":927.19,"location":2,"content":"in the task there truly is no collect, uh,"},{"from":927.19,"to":929.08,"location":2,"content":"connection or relationship to be"},{"from":929.08,"to":931.51,"location":2,"content":"learned between what happens on step T and what happens on"},{"from":931.51,"to":933.58,"location":2,"content":"step T plus N. So there truly is nothing to be"},{"from":933.58,"to":935.98,"location":2,"content":"learned and it's actually correct that there should be,"},{"from":935.98,"to":938.8,"location":2,"content":"you know, small gradients with respect to those two things."},{"from":938.8,"to":941.62,"location":2,"content":"But the second possibility is that, yes,"},{"from":941.62,"to":944.97,"location":2,"content":"that is a true connection between those two things in the data and in the task."},{"from":944.97,"to":947.65,"location":2,"content":"And really ideally we should be learning that connection."},{"from":947.65,"to":952.48,"location":2,"content":"Um, but we have the wrong parameters in our model to capture this thing,"},{"from":952.48,"to":954.12,"location":2,"content":"and therefore that is why the,"},{"from":954.12,"to":955.16,"location":2,"content":"the gradients are small."},{"from":955.16,"to":957.34,"location":2,"content":"Because the model doesn't see them as connected."},{"from":957.34,"to":960.94,"location":2,"content":"So we are not learning the true dependency between these two things."},{"from":960.94,"to":963.97,"location":2,"content":"And the problem with the vanishing gradient problem is that it's,"},{"from":963.97,"to":965.74,"location":2,"content":"we're unable to tell in this situation,"},{"from":965.74,"to":967.79,"location":2,"content":"which of these two situations we're in."},{"from":967.79,"to":969.82,"location":2,"content":"Okay. 
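One way to make this concrete is to just measure it. The sketch below (PyTorch autograd; the dimensions, the deliberately small recurrent matrix, and the stand-in loss are all assumptions) runs a vanilla RNN forward for 30 steps, backpropagates a loss from the last step, and prints the gradient norm at several hidden states; the norms shrink sharply for earlier steps.

```python
import torch

torch.manual_seed(0)
d, steps = 32, 30
W_h = (0.5 * torch.randn(d, d) / d**0.5).requires_grad_()  # small recurrent weights
W_x = torch.randn(d, d) / d**0.5
xs = torch.randn(steps, d)

h = torch.zeros(d)
hiddens = []
for t in range(steps):
    h = torch.sigmoid(W_h @ h + W_x @ xs[t])  # vanilla RNN step
    h.retain_grad()                           # keep grads on intermediate states
    hiddens.append(h)

loss = hiddens[-1].sum()                      # stand-in loss on the final step
loss.backward()
for t in (0, 9, 19, 29):
    print(t, hiddens[t].grad.norm().item())   # far-away steps get tiny gradients
```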
So this is all pretty theoretical."},{"from":969.82,"to":971.37,"location":2,"content":"I think this example should make it a little more,"},{"from":971.37,"to":974.59,"location":2,"content":"more clear why the vanishing gradient problem is bad."},{"from":974.59,"to":977.7,"location":2,"content":"So, uh, last week we learned about RNN-Language Models."},{"from":977.7,"to":980.68,"location":2,"content":"And if you remember Language Modeling is a task where you have some kind of"},{"from":980.68,"to":984.01,"location":2,"content":"text and then you're trying to predict what word should come next."},{"from":984.01,"to":985.63,"location":2,"content":"So, uh, here's a piece of text."},{"from":985.63,"to":988.42,"location":2,"content":"It says, um, ''When she tried to print her tickets,"},{"from":988.42,"to":990.31,"location":2,"content":"she found that the printer was out of toner."},{"from":990.31,"to":992.66,"location":2,"content":"She went to the stationery store to buy more toner."},{"from":992.66,"to":994.15,"location":2,"content":"It was very overpriced."},{"from":994.15,"to":996.12,"location":2,"content":"After installing the toner into the printer,"},{"from":996.12,"to":998.11,"location":2,"content":"she finally printed her,'' and"},{"from":998.11,"to":1000.52,"location":2,"content":"can someone shout out what word you think should come next?"},{"from":1000.52,"to":1001.39,"location":2,"content":"Tickets."},{"from":1001.39,"to":1002.79,"location":2,"content":"Tickets. Yes, exactly."},{"from":1002.79,"to":1004.92,"location":2,"content":"So that was easy for you to do because, uh,"},{"from":1004.92,"to":1007.27,"location":2,"content":"it makes sense logically that if that was the thing she was trying to do,"},{"from":1007.27,"to":1011.05,"location":2,"content":"that's the thing she's gonna do once she's gone the whole detour for the, for the toner."},{"from":1011.05,"to":1013.83,"location":2,"content":"Um, so the question is,"},{"from":1013.83,"to":1017.4,"location":2,"content":"can RNN-Language Models easily answer this question."},{"from":1017.4,"to":1020.43,"location":2,"content":"Would they do well at this particular Language Modeling example?"},{"from":1020.43,"to":1024.06,"location":2,"content":"So for an RNN-Language Model to do well at this kind of example,"},{"from":1024.06,"to":1027.36,"location":2,"content":"then they need to learn from this kind of example in the Training Data."},{"from":1027.36,"to":1029.74,"location":2,"content":"So if it solves the example in the Training Data,"},{"from":1029.74,"to":1032.47,"location":2,"content":"then the RNN-Language Model will need to model the dependency."},{"from":1032.47,"to":1034.56,"location":2,"content":"Learn the connection between the appearance of"},{"from":1034.56,"to":1037.07,"location":2,"content":"the word tickets early on on the 7th step,"},{"from":1037.07,"to":1040.2,"location":2,"content":"and the target word tickets at the end."},{"from":1040.2,"to":1042.94,"location":2,"content":"But if we have the vanishing gradient problem,"},{"from":1042.94,"to":1045.86,"location":2,"content":"then these gradients, uh, if they know the step,"},{"from":1045.86,"to":1047.95,"location":2,"content":"the, the last step with respect to the early step,"},{"from":1047.95,"to":1049.38,"location":2,"content":"it's gonna be very small because it's,"},{"from":1049.38,"to":1051.12,"location":2,"content":"it's a fairly long distance, right?"},{"from":1051.12,"to":1053.31,"location":2,"content":"And this means that the model is going to be unable 
to"},{"from":1053.31,"to":1056.04,"location":2,"content":"learn this dependency, easily or at all."},{"from":1056.04,"to":1059.34,"location":2,"content":"So if the model can't learn this kind of dependency during training,"},{"from":1059.34,"to":1061.59,"location":2,"content":"then the model is going to be unable to predict"},{"from":1061.59,"to":1064.85,"location":2,"content":"similar kinds of long distance dependencies at test-time."},{"from":1064.85,"to":1067.55,"location":2,"content":"Okay, here's another example."},{"from":1067.55,"to":1070.2,"location":2,"content":"Um, here's a piece of text."},{"from":1070.2,"to":1072.78,"location":2,"content":"Uh, the text says and this isn't a full sentence."},{"from":1072.78,"to":1074.16,"location":2,"content":"This is just a partial sentence."},{"from":1074.16,"to":1076.93,"location":2,"content":"It says, the writer of the books, blank."},{"from":1076.93,"to":1078.57,"location":2,"content":"And I'm gonna give you two options."},{"from":1078.57,"to":1083.33,"location":2,"content":"It's either, the writer of the books is or the writer of the books are."},{"from":1083.33,"to":1087.48,"location":2,"content":"So, uh, again shout out which one do you think it is, is or are?"},{"from":1087.48,"to":1088.2,"location":2,"content":"Is."},{"from":1088.2,"to":1090.93,"location":2,"content":"Is, that's right. So, uh, the correct answer,"},{"from":1090.93,"to":1093.43,"location":2,"content":"a correct possible continuation of the sentence would be,"},{"from":1093.43,"to":1095.7,"location":2,"content":"uh, the writer of the books is planning a sequel."},{"from":1095.7,"to":1098.91,"location":2,"content":"I can't think of a continuation that goes the writer of the books are,"},{"from":1098.91,"to":1101.91,"location":2,"content":"that would be, uh, grammatically correct."},{"from":1101.91,"to":1104.65,"location":2,"content":"So the reason why I'm bringing up this example,"},{"from":1104.65,"to":1107.49,"location":2,"content":"is because this shows a kind of tension between, uh,"},{"from":1107.49,"to":1108.6,"location":2,"content":"two things called, uh,"},{"from":1108.6,"to":1112.72,"location":2,"content":"syntactic recency and sem- uh, sequential recency."},{"from":1112.72,"to":1116.79,"location":2,"content":"So syntactic recency is the idea that in"},{"from":1116.79,"to":1120.78,"location":2,"content":"order to correctly predict the next word should be more is than are,"},{"from":1120.78,"to":1125.37,"location":2,"content":"is that the word writer is the kind of syntactically close word here."},{"from":1125.37,"to":1128.64,"location":2,"content":"So we say the writer of the books is because it's the writer is."},{"from":1128.64,"to":1131.31,"location":2,"content":"So you can see this as the word writer and is,"},{"from":1131.31,"to":1133.42,"location":2,"content":"are, uh, syntactically close."},{"from":1133.42,"to":1135.97,"location":2,"content":"Because if you looked at the dependency paths for example,"},{"from":1135.97,"to":1139.05,"location":2,"content":"then there would be a short path in that tree."},{"from":1139.05,"to":1145.15,"location":2,"content":"So by contrast, se- sequential recency is the,"},{"from":1145.15,"to":1150.96,"location":2,"content":"uh, simpler concepts of how close words are just in the sentence as a sequence of words."},{"from":1150.96,"to":1152.53,"location":2,"content":"So in this example,"},{"from":1152.53,"to":1155.87,"location":2,"content":"books and are, are very sequentially recent because they're right next to each 
other."},{"from":1155.87,"to":1158.47,"location":2,"content":"So the reason I'm bringing this up is because,"},{"from":1158.47,"to":1162.66,"location":2,"content":"the second one would be incorrect but it's kind of a tempting option."},{"from":1162.66,"to":1166.59,"location":2,"content":"Because if you're mostly only paying attention to things that happened recently,"},{"from":1166.59,"to":1169.15,"location":2,"content":"um, then you might get distracted and think,"},{"from":1169.15,"to":1171.09,"location":2,"content":"\"Oh, the books are, that sounds right.\""},{"from":1171.09,"to":1175.5,"location":2,"content":"So the problem here is that RNN-Language Models"},{"from":1175.5,"to":1181.17,"location":2,"content":"are better at learning from sequential recency than sicta- syntactic recency."},{"from":1181.17,"to":1182.65,"location":2,"content":"And this is partially due,"},{"from":1182.65,"to":1184.45,"location":2,"content":"due to the vanishing gradient problem."},{"from":1184.45,"to":1187.29,"location":2,"content":"Because especially perhaps, if your syntactically,"},{"from":1187.29,"to":1189.77,"location":2,"content":"uh, related word is actually kind of far away,"},{"from":1189.77,"to":1194.36,"location":2,"content":"then it might get really hard to use the information from the syntactically recent word,"},{"from":1194.36,"to":1198.39,"location":2,"content":"especially if there's a lot of strong signal from the sequentially recent word."},{"from":1198.39,"to":1203.52,"location":2,"content":"So, uh, there are some papers that show that RNN-Language Models make this kind of error,"},{"from":1203.52,"to":1205.2,"location":2,"content":"of saying are, rather than is."},{"from":1205.2,"to":1208.44,"location":2,"content":"Uh, they make this kind of error more often than you would like, uh,"},{"from":1208.44,"to":1211.86,"location":2,"content":"especially if you have multiple of these distracting words such as books, uh,"},{"from":1211.86,"to":1214.38,"location":2,"content":"in between, uh, the word you're trying to predict"},{"from":1214.38,"to":1219.47,"location":2,"content":"and the true word that you should be, uh, referring to."},{"from":1219.47,"to":1227.49,"location":2,"content":"Okay, any questions on this? 
All right, moving on."},{"from":1227.49,"to":1231.78,"location":2,"content":"So, we briefly mentioned that exploding gradients, uh, is a problem."},{"from":1231.78,"to":1234.96,"location":2,"content":"So, I'm briefly going to justify why exploding gradients are a problem,"},{"from":1234.96,"to":1236.97,"location":2,"content":"and why does it, uh, what does it look like?"},{"from":1236.97,"to":1240.02,"location":2,"content":"[NOISE] So, the reason why exploding gradients are a problem,"},{"from":1240.02,"to":1242.46,"location":2,"content":"is if you remember this is how SGD works."},{"from":1242.46,"to":1244.86,"location":2,"content":"Uh, we say that the new parameters of the model,"},{"from":1244.86,"to":1246.49,"location":2,"content":"which we represent by Theta,"},{"from":1246.49,"to":1248.43,"location":2,"content":"is equal to the old parameters,"},{"from":1248.43,"to":1250.62,"location":2,"content":"and then you take some step in the direction of"},{"from":1250.62,"to":1254.04,"location":2,"content":"negative gradients because you're trying to minimize the loss, J."},{"from":1254.04,"to":1258.05,"location":2,"content":"So, the problem is if your gradient gets really big, uh,"},{"from":1258.05,"to":1261.89,"location":2,"content":"then your SGD update step is going to become really big too."},{"from":1261.89,"to":1263.69,"location":2,"content":"So, you're going to be taking a very big step,"},{"from":1263.69,"to":1267.17,"location":2,"content":"and you're going to be drastically changing your model parameters, Theta."},{"from":1267.17,"to":1270.7,"location":2,"content":"And this means that you can end up with some bad updates."},{"from":1270.7,"to":1273.08,"location":2,"content":"We end up taking too large a step."},{"from":1273.08,"to":1275.58,"location":2,"content":"And we're changing the parameters too much."},{"from":1275.58,"to":1276.78,"location":2,"content":"And this means that, uh,"},{"from":1276.78,"to":1278.14,"location":2,"content":"we kind of take a big step,"},{"from":1278.14,"to":1279.84,"location":2,"content":"and we end up in some, uh,"},{"from":1279.84,"to":1281.94,"location":2,"content":"area where the parameters are actually very bad."},{"from":1281.94,"to":1285.45,"location":2,"content":"Uh, with example the- for example,"},{"from":1285.45,"to":1287.81,"location":2,"content":"they might have a much larger loss than they had before."},{"from":1287.81,"to":1289.86,"location":2,"content":"So, in the worst case,"},{"from":1289.86,"to":1292.76,"location":2,"content":"this can often manifest as seeing, uh,"},{"from":1292.76,"to":1297.92,"location":2,"content":"infinities or NaNs, not a number, in your network when you're training it in practice."},{"from":1297.92,"to":1301.48,"location":2,"content":"[NOISE] So, this can happen because if you take such a big step"},{"from":1301.48,"to":1305.44,"location":2,"content":"that maybe you update your parameters so much that now they're infinity,"},{"from":1305.44,"to":1307.29,"location":2,"content":"or minus infinity, something like that,"},{"from":1307.29,"to":1310.24,"location":2,"content":"then you're gonna have all of these infinities within your activations as well,"},{"from":1310.24,"to":1312.19,"location":2,"content":"and then all of your losses are going to be infinity,"},{"from":1312.19,"to":1314.38,"location":2,"content":"and the whole thing just isn't going to work, at all."},{"from":1314.38,"to":1316.17,"location":2,"content":"So, it's very annoying when this 
happens,"},{"from":1316.17,"to":1318.46,"location":2,"content":"and unfortunately it happens, uh, fairly often."},{"from":1318.46,"to":1320.36,"location":2,"content":"And if it does then you have to essentially"},{"from":1320.36,"to":1322.59,"location":2,"content":"restart training from some earlier checkpoint before you"},{"from":1322.59,"to":1324.48,"location":2,"content":"got the NaNs and the infinities because there's"},{"from":1324.48,"to":1326.58,"location":2,"content":"no kind of salvaging it from its new state."},{"from":1326.58,"to":1330.9,"location":2,"content":"[NOISE] So, what's the solution to this exploding gradient problem?"},{"from":1330.9,"to":1333.3,"location":2,"content":"[NOISE] Uh, the solution is actually pretty"},{"from":1333.3,"to":1336.32,"location":2,"content":"simple and it's this technique called gradient clipping."},{"from":1336.32,"to":1338.58,"location":2,"content":"So, the main idea of gradient clipping,"},{"from":1338.58,"to":1341.61,"location":2,"content":"[NOISE] is that if the norm of your gradient is"},{"from":1341.61,"to":1345.6,"location":2,"content":"greater than some threshold and the threshold is a hyperparameter that you choose."},{"from":1345.6,"to":1349.11,"location":2,"content":"uh, then you want to scale down that gradient,"},{"from":1349.11,"to":1352.04,"location":2,"content":"um, before you apply the SGD update."},{"from":1352.04,"to":1355.41,"location":2,"content":"So, the intuition is yo- you're still gonna take a step in the same direction."},{"from":1355.41,"to":1357.03,"location":2,"content":"But you're gonna make sure that it's a smaller step."},{"from":1357.03,"to":1358.95,"location":2,"content":"[NOISE] So, here, um,"},{"from":1358.95,"to":1361.99,"location":2,"content":"I've got a screenshot of some pseudocode from, uh,"},{"from":1361.99,"to":1363.38,"location":2,"content":"the related paper that, uh,"},{"from":1363.38,"to":1366.18,"location":2,"content":"proposed gradient clipping, or at least some version of gradient clipping."},{"from":1366.18,"to":1368.64,"location":2,"content":"[NOISE] And, um, it's pretty simple as you can see."},{"from":1368.64,"to":1371.31,"location":2,"content":"Uh, g hat is the vector which is the, uh,"},{"from":1371.31,"to":1374.47,"location":2,"content":"derivative of the error with respect to the premises,"},{"from":1374.47,"to":1376.77,"location":2,"content":"and it's saying that if the norm of"},{"from":1376.77,"to":1380.07,"location":2,"content":"this gradient is greater than the threshold's, then you just scale it down."},{"from":1380.07,"to":1383.43,"location":2,"content":"But the important thing to note is that it's still pointing in the same direction,"},{"from":1383.43,"to":1386.42,"location":2,"content":"it's just a smaller step."},{"from":1386.42,"to":1390.62,"location":2,"content":"So, here's a picture to show how that might work out in practice."},{"from":1390.62,"to":1393.11,"location":2,"content":"And, uh, this is a diagram from the, uh,"},{"from":1393.11,"to":1396.26,"location":2,"content":"deep learning textbook which is also linked on [NOISE] the website."},{"from":1396.26,"to":1399.62,"location":2,"content":"So, what's going on here, is that, uh,"},{"from":1399.62,"to":1403.2,"location":2,"content":"the picture here is the loss surface of a simple RNN."},{"from":1403.2,"to":1407.25,"location":2,"content":"So, they made a very simple RNN that instead of having, uh,"},{"from":1407.25,"to":1409.28,"location":2,"content":"a sequence of vectors as the hidden 
states,"},{"from":1409.28,"to":1412.55,"location":2,"content":"it just suppose that each hidden state is simply just a single scalar."},{"from":1412.55,"to":1415.02,"location":2,"content":"So, this means that instead of having a weight matrix, w,"},{"from":1415.02,"to":1416.49,"location":2,"content":"and the bias vector, b,"},{"from":1416.49,"to":1418.61,"location":2,"content":"you have a scalar w and a scalar b."},{"from":1418.61,"to":1422.55,"location":2,"content":"So, that's why in the picture, you just have this like two-dimensional parameter space."},{"from":1422.55,"to":1425.81,"location":2,"content":"And then the, the z-axis is your, is your loss."},{"from":1425.81,"to":1427.52,"location":2,"content":"So here, high loss is,"},{"from":1427.52,"to":1430.34,"location":2,"content":"is bad and low loss is good in what you're trying to get."},{"from":1430.34,"to":1432.83,"location":2,"content":"So, uh, here in this picture,"},{"from":1432.83,"to":1436.89,"location":2,"content":"you've got this kind of cliff, right, where you have this very steep cliff face,"},{"from":1436.89,"to":1439.29,"location":2,"content":"uh, where the loss changes very quickly."},{"from":1439.29,"to":1443.71,"location":2,"content":"[NOISE] And this cliff is really dangerous because it has steep, steep gradients."},{"from":1443.71,"to":1446.16,"location":2,"content":"And you might be in danger of taking a really big,"},{"from":1446.16,"to":1449.97,"location":2,"content":"[NOISE] uh, update step because you're on the area with a really steep gradient."},{"from":1449.97,"to":1452.51,"location":2,"content":"[NOISE] So, on the left,"},{"from":1452.51,"to":1457.41,"location":2,"content":"you've got a possible scenario of what might happen if you don't have gradient clipping."},{"from":1457.41,"to":1459.33,"location":2,"content":"[NOISE] So, on the left, uh,"},{"from":1459.33,"to":1462.62,"location":2,"content":"you can see that you start kind of at the bottom of the cliff,"},{"from":1462.62,"to":1465.46,"location":2,"content":"and you have a f- a si- a few small updates."},{"from":1465.46,"to":1468.15,"location":2,"content":"And then, in particular makes a bad update because you"},{"from":1468.15,"to":1470.76,"location":2,"content":"see there's a small kind of dip before it goes off the cliff."},{"from":1470.76,"to":1472.59,"location":2,"content":"So, th- the true local minimum,"},{"from":1472.59,"to":1476.14,"location":2,"content":"the optimal you're trying to get to is that the bottom of that small kind of ditch."},{"from":1476.14,"to":1480.38,"location":2,"content":"And, um, it starts off kind of near the edge of that ditch,"},{"from":1480.38,"to":1482.67,"location":2,"content":"and then there's a negative gradient going into it."},{"from":1482.67,"to":1485.78,"location":2,"content":"But unfortunately, the, the update kind of overshoots,"},{"from":1485.78,"to":1487.79,"location":2,"content":"and it ends up going a long way off the cliff."},{"from":1487.79,"to":1490.47,"location":2,"content":"So now, it's in this bad situation where it's taken a bad update,"},{"from":1490.47,"to":1492.93,"location":2,"content":"and now it's got a much bigger loss than it had [NOISE] before."},{"from":1492.93,"to":1494.65,"location":2,"content":"So now that it's on the cliff."},{"from":1494.65,"to":1496.48,"location":2,"content":"Again it, it measures the gradient,"},{"from":1496.48,"to":1498.18,"location":2,"content":"and the gradient is very steep, right?"},{"from":1498.18,"to":1499.42,"location":2,"content":"The gradient is very 
large."},{"from":1499.42,"to":1501.36,"location":2,"content":"So, when it takes a, uh,"},{"from":1501.36,"to":1503.12,"location":2,"content":"update with respect to that gradient,"},{"from":1503.12,"to":1504.3,"location":2,"content":"then because the gradient is so big,"},{"from":1504.3,"to":1505.9,"location":2,"content":"it takes a really huge step."},{"from":1505.9,"to":1508.02,"location":2,"content":"And that's, um, the, the one to the right."},{"from":1508.02,"to":1509.58,"location":2,"content":"You can see the step going to the right."},{"from":1509.58,"to":1512.19,"location":2,"content":"So, that's also a very bad update because it's just throwing"},{"from":1512.19,"to":1515.31,"location":2,"content":"it really far to some probably fairly random,"},{"from":1515.31,"to":1517.63,"location":2,"content":"uh, configuration of w and b."},{"from":1517.63,"to":1520.74,"location":2,"content":"So, on the left, you can see what can go wrong if you're taking"},{"from":1520.74,"to":1524.83,"location":2,"content":"these really big steps because you were in areas with a very steep gradient."},{"from":1524.83,"to":1526.52,"location":2,"content":"So, by contrast on the right,"},{"from":1526.52,"to":1529.61,"location":2,"content":"you can see what might happen if you do have a gradient clipping."},{"from":1529.61,"to":1532.55,"location":2,"content":"[NOISE] [NOISE] And, um, it's much less drastic, right?"},{"from":1532.55,"to":1535.78,"location":2,"content":"You've got a similar kind of pattern where it takes a few steps into the ditch,"},{"from":1535.78,"to":1537.74,"location":2,"content":"and then ends up going off the cliff a little bit,"},{"from":1537.74,"to":1539.84,"location":2,"content":"but not too much because the gradient was clipped."},{"from":1539.84,"to":1542.4,"location":2,"content":"And then, it's on the cliff and there's again a really steep gradient,"},{"from":1542.4,"to":1545.49,"location":2,"content":"but it doesn't take such a big step because again the gradient was clipped,"},{"from":1545.49,"to":1547.1,"location":2,"content":"so that it kind of comes back down."},{"from":1547.1,"to":1551.09,"location":2,"content":"So, you can see that plausibly by using this gradient clipping method,"},{"from":1551.09,"to":1553.21,"location":2,"content":"you've got a, a kind of safer update rule,"},{"from":1553.21,"to":1554.31,"location":2,"content":"where you're not gonna take any,"},{"from":1554.31,"to":1557.36,"location":2,"content":"any big crazy steps and you're more likely to kind of find the,"},{"from":1557.36,"to":1559.59,"location":2,"content":"the true minimum which is at the bottom of the ditch."},{"from":1559.59,"to":1562.24,"location":2,"content":"[NOISE] I think there was a question earlier."},{"from":1562.24,"to":1563.94,"location":2,"content":"Was there a question over here? [NOISE]"},{"from":1563.94,"to":1565.38,"location":2,"content":"I just want to see the value. [NOISE] [NOISE]"},{"from":1565.38,"to":1567.87,"location":2,"content":"Okay. 
Anyone else?"},{"from":1567.87,"to":1569.46,"location":2,"content":"[NOISE]"},{"from":1569.46,"to":1581.19,"location":2,"content":"Yeah?"},{"from":1581.19,"to":1581.46,"location":2,"content":"[NOISE] [inaudible]"},{"from":1581.46,"to":1583.01,"location":2,"content":"So, the question is, in assignment three,"},{"from":1583.01,"to":1586.32,"location":2,"content":"y- you saw the atom optimization algorithm which, uh,"},{"from":1586.32,"to":1587.92,"location":2,"content":"has this thing called momentum,"},{"from":1587.92,"to":1591.02,"location":2,"content":"which essentially says that kind of like physical momentum in,"},{"from":1591.02,"to":1595.76,"location":2,"content":"in the real world, that if you've been traveling in the same direction for a while,"},{"from":1595.76,"to":1599.19,"location":2,"content":"then you can take bigger steps,"},{"from":1599.19,"to":1601.32,"location":2,"content":"I think, and if you've recently kind of changed direction,"},{"from":1601.32,"to":1602.84,"location":2,"content":"then you should take smaller steps."},{"from":1602.84,"to":1607.62,"location":2,"content":"And I think there's another element as well, where you divide by some factor."},{"from":1607.62,"to":1609.6,"location":2,"content":"[NOISE] So, it is a similar kind of idea."},{"from":1609.6,"to":1611.19,"location":2,"content":"I suppose it's a different criterion, right?"},{"from":1611.19,"to":1614.43,"location":2,"content":"So, what they both have in common is it's a kind of criterion for when to"},{"from":1614.43,"to":1618.05,"location":2,"content":"scale up or scale down the size of your update step."},{"from":1618.05,"to":1620.19,"location":2,"content":"Um, and I think they're based on different notions"},{"from":1620.19,"to":1623.16,"location":2,"content":"of when should you take bigger steps and when should you take smaller steps."},{"from":1623.16,"to":1625.02,"location":2,"content":"When should you be cautious or less cautious?"},{"from":1625.02,"to":1627.51,"location":2,"content":"So, I guess here the criterion is different."},{"from":1627.51,"to":1629.88,"location":2,"content":"It's kind of a simple criterion saying, like if it's really steep,"},{"from":1629.88,"to":1646.73,"location":2,"content":"then be careful. Yeah. Another question?"},{"from":1646.73,"to":1651.88,"location":2,"content":"Uh, so the [inaudible]. [NOISE]"},{"from":1651.88,"to":1654.06,"location":2,"content":"Okay. 
So the question is,"},{"from":1654.06,"to":1657.15,"location":2,"content":"is this similar to regularization of some kind, right?"},{"from":1657.15,"to":1659.46,"location":2,"content":"So, I suppose, yeah, there is- there are some things in common."},{"from":1659.46,"to":1663.69,"location":2,"content":"Say, for example, L2 regularization says that you want, for example,"},{"from":1663.69,"to":1669.4,"location":2,"content":"your weight matrices to have a small L2 norm, right?"},{"from":1669.4,"to":1671.34,"location":2,"content":"And the idea is that you're trying to prevent"},{"from":1671.34,"to":1673.92,"location":2,"content":"your model from overfitting the data by, um,"},{"from":1673.92,"to":1677.19,"location":2,"content":"having some kind of constraint that says you have to keep your weights fairly simple,"},{"from":1677.19,"to":1679.14,"location":2,"content":"that is keep them, you know, small."},{"from":1679.14,"to":1681.39,"location":2,"content":"So, I suppose the relationship is that here we're"},{"from":1681.39,"to":1683.49,"location":2,"content":"saying that we don't want the norm of the gradients to be too big."},{"from":1683.49,"to":1687.5,"location":2,"content":"Ah, I don't know if this is related to overfitting."},{"from":1687.5,"to":1689.49,"location":2,"content":"Um, I guess I have to think more carefully about that,"},{"from":1689.49,"to":1694.17,"location":2,"content":"but I guess it's a similar kind of constraint that you're placing."},{"from":1694.17,"to":1696.44,"location":2,"content":"Okay. I'm gonna move on for now."},{"from":1696.44,"to":1699.66,"location":2,"content":"Uh, so we've talked"},{"from":1699.66,"to":1703.08,"location":2,"content":"about how you might fix the exploding gradient problem with gradient clipping,"},{"from":1703.08,"to":1706.74,"location":2,"content":"but we haven't talked about how we might fix the vanishing gradient problem."},{"from":1706.74,"to":1709.41,"location":2,"content":"So, um, to recap,"},{"from":1709.41,"to":1714.27,"location":2,"content":"I think one way to characterize the problem with the- the vanishing gradients in RNNs is"},{"from":1714.27,"to":1719.62,"location":2,"content":"that it's too difficult for the RNN to learn to preserve information over many timesteps."},{"from":1719.62,"to":1721.34,"location":2,"content":"So, in our example with printing"},{"from":1721.34,"to":1724.47,"location":2,"content":"the tickets and re- remembering that it's the tickets that she wants to print,"},{"from":1724.47,"to":1728.42,"location":2,"content":"you could think of it as it's hard for the RNN language model to correctly"},{"from":1728.42,"to":1732.26,"location":2,"content":"predict tickets because in a way, it's too hard for the RNN language model to,"},{"from":1732.26,"to":1736.35,"location":2,"content":"uh, learn to retain the tickets information and use it later."},{"from":1736.35,"to":1738.9,"location":2,"content":"So, um, if you look at the equation"},{"from":1738.9,"to":1741.63,"location":2,"content":"for vanilla RNNs and how we compute the hidden state, uh,"},{"from":1741.63,"to":1743.95,"location":2,"content":"based on the previous hidden state and- and the inputs,"},{"from":1743.95,"to":1747.49,"location":2,"content":"you can see that the hidden state is in a way constantly being rewritten."},{"from":1747.49,"to":1749.91,"location":2,"content":"It's always computed based on these, uh,"},{"from":1749.91,"to":1751.65,"location":2,"content":"linear transformations and the,"},{"from":1751.65,"to":1753.11,"location":2,"content":"you 
know, the non-linearity."},{"from":1753.11,"to":1755.01,"location":2,"content":"So, it's not all that easy to"},{"from":1755.01,"to":1757.96,"location":2,"content":"preserve the information from one hidden state to the other,"},{"from":1757.96,"to":1761.14,"location":2,"content":"in particular, because we are putting it through this non-linearity function."},{"from":1761.14,"to":1766.98,"location":2,"content":"So, this motivates us to ask what about an RNN with some kind of separate memory?"},{"from":1766.98,"to":1771.51,"location":2,"content":"If we have some kind of separate place to store information that we want to use later,"},{"from":1771.51,"to":1774.63,"location":2,"content":"then would this make it easier for our RNN"},{"from":1774.63,"to":1778.29,"location":2,"content":"to learn to preserve information over many timesteps?"},{"from":1778.29,"to":1785.2,"location":2,"content":"So, this is the motivating idea behind LSTMs or Long Short-Term Memory RNNs."},{"from":1785.2,"to":1791.55,"location":2,"content":"So, the idea here is that an LSTM is a type of RNN and it was proposed back in, uh, 1997."},{"from":1791.55,"to":1793.34,"location":2,"content":"And the idea is that this is, uh,"},{"from":1793.34,"to":1796.8,"location":2,"content":"this was proposed as an explicit solution to the vanishing gradients problem."},{"from":1796.8,"to":1800.28,"location":2,"content":"[NOISE] So, one of the main differences here is"},{"from":1800.28,"to":1803.88,"location":2,"content":"that on each step t, instead of just having a hidden state h_t,"},{"from":1803.88,"to":1808.32,"location":2,"content":"we have both the hidden state h_t and the cell state which we denote c_t."},{"from":1808.32,"to":1812.09,"location":2,"content":"And both of these are vectors of some same length,"},{"from":1812.09,"to":1815.43,"location":2,"content":"n, and the idea there is that the cell is meant to"},{"from":1815.43,"to":1820.11,"location":2,"content":"sto- store our long-term information, that, that's our memory unit."},{"from":1820.11,"to":1823.61,"location":2,"content":"Another super important thing is that the LSTM can"},{"from":1823.61,"to":1826.98,"location":2,"content":"erase and write [NOISE] and read information from the cell."},{"from":1826.98,"to":1829.99,"location":2,"content":"So, you can kind of think of this a bit like memory in a computer,"},{"from":1829.99,"to":1833.71,"location":2,"content":"in that you can do these operations, reading and writing and erasing,"},{"from":1833.71,"to":1837.09,"location":2,"content":"um, and that's how you're gonna keep your information."},{"from":1837.09,"to":1839.22,"location":2,"content":"[NOISE]."},{"from":1839.22,"to":1843.25,"location":2,"content":"Another super important thing is that the way the LSTM decides,"},{"from":1843.25,"to":1845.49,"location":2,"content":"whether it wants to erase, write, read,"},{"from":1845.49,"to":1848.57,"location":2,"content":"information and decide how much and which information,"},{"from":1848.57,"to":1851.43,"location":2,"content":"uh, that's all controlled by these [NOISE] gates."},{"from":1851.43,"to":1856.56,"location":2,"content":"So, the idea is [NOISE] that the gates are themselves also vectors of length n,"},{"from":1856.56,"to":1859.59,"location":2,"content":"and the idea there is that on each timestep,"},{"from":1859.59,"to":1864.96,"location":2,"content":"each element of these gates which are vectors are somewhere between zero and one."},{"from":1864.96,"to":1870.16,"location":2,"content":"So here, uh, one represents an open 
gate and zero represents a closed gate,"},{"from":1870.16,"to":1872.71,"location":2,"content":"and you can have values anywhere in between."},{"from":1872.71,"to":1875.47,"location":2,"content":"So, the overall idea, which we're gonna firm up on the next slide,"},{"from":1875.47,"to":1877.77,"location":2,"content":"but the overall idea is that if the gate is open,"},{"from":1877.77,"to":1880.59,"location":2,"content":"that represents some kind of information being passed through,"},{"from":1880.59,"to":1881.67,"location":2,"content":"and if the gate is closed,"},{"from":1881.67,"to":1884.36,"location":2,"content":"it [NOISE] means that information does not pass through."},{"from":1884.36,"to":1888.7,"location":2,"content":"Okay. So, the last really important thing is that the gates are dynamic."},{"from":1888.7,"to":1892.95,"location":2,"content":"They're not just set at some constant value for the whole sequence."},{"from":1892.95,"to":1894.33,"location":2,"content":"[NOISE] Um, they're dynamic,"},{"from":1894.33,"to":1896.79,"location":2,"content":"which means that they're different on each timestep t,"},{"from":1896.79,"to":1901.2,"location":2,"content":"and the value that is the decision of whether they're open or closed and in which ways,"},{"from":1901.2,"to":1905.1,"location":2,"content":"[NOISE] um, that is computed based on the current context."},{"from":1905.1,"to":1906.94,"location":2,"content":"Okay. So here's, um,"},{"from":1906.94,"to":1910.13,"location":2,"content":"here's the- the equations for the LSTM which might make it clearer."},{"from":1910.13,"to":1914.16,"location":2,"content":"So, uh, suppose we have some sequence of i- inputs x_t and we"},{"from":1914.16,"to":1918.43,"location":2,"content":"want to compute a sequence of hidden states h_t and cell states c_t."},{"from":1918.43,"to":1922.74,"location":2,"content":"So, this is what happens on timestep t. Uh,"},{"from":1922.74,"to":1927.21,"location":2,"content":"this first set of equations shows you the three gates that I talked about before."},{"from":1927.21,"to":1929.91,"location":2,"content":"So, the first one is called the Forget Gate."},{"from":1929.91,"to":1934.55,"location":2,"content":"And the idea is that this one is controlling what is kept versus what is forgotten,"},{"from":1934.55,"to":1938.13,"location":2,"content":"um, from the previous cell state, the previous memory."},{"from":1938.13,"to":1942.51,"location":2,"content":"And you can see that this forget gate is computed based on, uh,"},{"from":1942.51,"to":1946.46,"location":2,"content":"the previous hidden state h_t minus one and the current input x_t."},{"from":1946.46,"to":1949.11,"location":2,"content":"Um, so that's what I meant when I said that it's"},{"from":1949.11,"to":1951.93,"location":2,"content":"dynamic and it's computed based on the- the current context."},{"from":1951.93,"to":1956.19,"location":2,"content":"[NOISE] Um, you can also see that it's computed using,"},{"from":1956.19,"to":1957.39,"location":2,"content":"uh, the sigmoid function,"},{"from":1957.39,"to":1959.92,"location":2,"content":"which means that it is somewhere between zero and one."},{"from":1959.92,"to":1963.05,"location":2,"content":"Okay. 
The next gate is called the input gate,"},{"from":1963.05,"to":1968.64,"location":2,"content":"and this one controls what parts of the new cell contents are written to the cell."},{"from":1968.64,"to":1972.02,"location":2,"content":"So, the idea there is that you have this- this memory cell and this is kind of, um,"},{"from":1972.02,"to":1977.23,"location":2,"content":"controlling like ho- how and what you get to write to the memory cell."},{"from":1977.23,"to":1979.99,"location":2,"content":"Okay. And the last one is called the output gate."},{"from":1979.99,"to":1981.78,"location":2,"content":"So, this one is controlling, uh,"},{"from":1981.78,"to":1984.93,"location":2,"content":"what parts of the cell are output to the hidden state,"},{"from":1984.93,"to":1988.62,"location":2,"content":"[NOISE] so you could view this as kind of like the read function, right?"},{"from":1988.62,"to":1990.09,"location":2,"content":"We're going to read some information from"},{"from":1990.09,"to":1992.68,"location":2,"content":"our memory cell and that's gonna get put into our hidden states,"},{"from":1992.68,"to":1994.05,"location":2,"content":"and this gate is gonna control that."},{"from":1994.05,"to":1996.99,"location":2,"content":"[NOISE] Okay."},{"from":1996.99,"to":2002.06,"location":2,"content":"[NOISE] Uh, yeah, that's just the sigmoid function as we noted before."},{"from":2002.06,"to":2005.87,"location":2,"content":"All right. So, the next set of equations shows how we use these gates."},{"from":2005.87,"to":2008.12,"location":2,"content":"[NOISE] So, the first line, uh,"},{"from":2008.12,"to":2009.34,"location":2,"content":"you could regard this, uh,"},{"from":2009.34,"to":2012.1,"location":2,"content":"c_tilde as the new [NOISE] cell content."},{"from":2012.1,"to":2014.96,"location":2,"content":"So, uh, this is the new content that you want to write to the cell,"},{"from":2014.96,"to":2017.54,"location":2,"content":"[NOISE] and this is also computed based on, uh,"},{"from":2017.54,"to":2019.49,"location":2,"content":"your previous hidden state and your current inputs,"},{"from":2019.49,"to":2021.92,"location":2,"content":"and this goes through your tanh non-linearity."},{"from":2021.92,"to":2025.25,"location":2,"content":"So, uh, this is kind of the- the main contents that"},{"from":2025.25,"to":2029.57,"location":2,"content":"you are computing based on the context and you want to write this into memory."},{"from":2029.57,"to":2034.55,"location":2,"content":"So, on the next line what's happening is that we're going to use"},{"from":2034.55,"to":2040.07,"location":2,"content":"the forget gate to selectively forget some of the information from the previous,"},{"from":2040.07,"to":2041.93,"location":2,"content":"[NOISE] uh, memory cell."},{"from":2041.93,"to":2044.78,"location":2,"content":"And you can see that we're doing these element-wise products,"},{"from":2044.78,"to":2046.34,"location":2,"content":"that's what the little circle is."},{"from":2046.34,"to":2048.95,"location":2,"content":"So, the idea is that if you remember that f_t is"},{"from":2048.95,"to":2051.98,"location":2,"content":"a vector full of values between zero and one,"},{"from":2051.98,"to":2054.38,"location":2,"content":"when you do an element-wise product between f_t and"},{"from":2054.38,"to":2056.89,"location":2,"content":"the previous cell state c_t minus one,"},{"from":2056.89,"to":2059,"location":2,"content":"then what you're essentially doing is you're kind of 
masking"},{"from":2059,"to":2061.86,"location":2,"content":"out some of the information from the previous hidden state."},{"from":2061.86,"to":2063.68,"location":2,"content":"Sorry, no. Previous cell state."},{"from":2063.68,"to":2066.16,"location":2,"content":"So, when f is one,"},{"from":2066.16,"to":2067.8,"location":2,"content":"then you're copying over the information,"},{"from":2067.8,"to":2070.34,"location":2,"content":"but when f is zero, then you're getting rid of that information,"},{"from":2070.34,"to":2073.93,"location":2,"content":"you are erasing it or forgetting it."},{"from":2073.93,"to":2077.03,"location":2,"content":"Okay. And then the other half of this equation,"},{"from":2077.03,"to":2079.55,"location":2,"content":"um, i_t times c tilde t, uh,"},{"from":2079.55,"to":2081.5,"location":2,"content":"that's the input gate controlling"},{"from":2081.5,"to":2084.22,"location":2,"content":"which parts of the new cell contents are gonna get written,"},{"from":2084.22,"to":2087.13,"location":2,"content":"written to the, to the cell."},{"from":2087.13,"to":2090.09,"location":2,"content":"Okay. And then the last thing we do is we, uh,"},{"from":2090.09,"to":2092.9,"location":2,"content":"pass the cell through a tan h,"},{"from":2092.9,"to":2095.39,"location":2,"content":"that's just adding another non-linearity,"},{"from":2095.39,"to":2096.53,"location":2,"content":"and then you pass that through"},{"from":2096.53,"to":2099.38,"location":2,"content":"the output gates and that gives you [NOISE] the hidden state."},{"from":2099.38,"to":2102.95,"location":2,"content":"So, in LSTMs, we often think of the hidden states as being,"},{"from":2102.95,"to":2105.09,"location":2,"content":"uh, like the outputs of the RNN."},{"from":2105.09,"to":2107.57,"location":2,"content":"And the reason for this is that you kind of view"},{"from":2107.57,"to":2109.91,"location":2,"content":"the cell states as being this kind of"},{"from":2109.91,"to":2112.99,"location":2,"content":"internal memory that's not generally accessible to the outside,"},{"from":2112.99,"to":2115.28,"location":2,"content":"but the hidden states are the parts that you're"},{"from":2115.28,"to":2117.91,"location":2,"content":"gonna pa- pass on to the next part of the model."},{"from":2117.91,"to":2120.62,"location":2,"content":"So, that's why we view it as kind of like the output of the model."},{"from":2120.62,"to":2124.52,"location":2,"content":"[NOISE] Uh, and this is, yeah,"},{"from":2124.52,"to":2126.41,"location":2,"content":"x just to remind the- there is- circles are"},{"from":2126.41,"to":2129.09,"location":2,"content":"element-wise products and that's how we apply the gates."},{"from":2129.09,"to":2130.91,"location":2,"content":"Uh, did anyone have any questions about this?"},{"from":2130.91,"to":2140.99,"location":2,"content":"[NOISE]."},{"from":2140.99,"to":2143.96,"location":2,"content":"Okay. 
[NOISE] Um, so as a reminder,"},{"from":2143.96,"to":2146.42,"location":2,"content":"all of these are vectors of some same length n."},{"from":2146.42,"to":2149.51,"location":2,"content":"[NOISE] Okay."},{"from":2149.51,"to":2152.91,"location":2,"content":"So, some people learn better from diagrams than equations,"},{"from":2152.91,"to":2155.66,"location":2,"content":"and here's a diagram representation of the same idea."},{"from":2155.66,"to":2157.89,"location":2,"content":"So, this is a really nice diagram from a blog post,"},{"from":2157.89,"to":2159.66,"location":2,"content":"uh, by Chris Olah about LSTMs,"},{"from":2159.66,"to":2161.42,"location":2,"content":"and that's a good place to start if you want to"},{"from":2161.42,"to":2164.45,"location":2,"content":"get an intuitive understanding of what LSTMs are."},{"from":2164.45,"to":2166.51,"location":2,"content":"So, in this diagram, uh,"},{"from":2166.51,"to":2169.05,"location":2,"content":"the green boxes represent timesteps,"},{"from":2169.05,"to":2172.55,"location":2,"content":"um, and let's zoom in on the middle one and see what's happening here."},{"from":2172.55,"to":2174.86,"location":2,"content":"So, within one timestep,"},{"from":2174.86,"to":2177.65,"location":2,"content":"you can see that this diagram is showing exactly the same thing as"},{"from":2177.65,"to":2180.76,"location":2,"content":"those six equations showed on the previous slide."},{"from":2180.76,"to":2185.81,"location":2,"content":"So, uh, the first thing we do is we use the, uh, the current input x_t,"},{"from":2185.81,"to":2189.35,"location":2,"content":"which is at the bottom, and the previous hidden state h_t minus one, on the left,"},{"from":2189.35,"to":2191.42,"location":2,"content":"and we can use that to compute the forget gate."},{"from":2191.42,"to":2194.72,"location":2,"content":"[NOISE] And you can see f_t is on that arrow there."},{"from":2194.72,"to":2199.39,"location":2,"content":"And then you apply the forget gate to the previous, uh, cell,"},{"from":2199.39,"to":2202.97,"location":2,"content":"and that's the same thing as forgetting some of the- the cell content from last time."},{"from":2202.97,"to":2204.7,"location":2,"content":"[NOISE] Okay."},{"from":2204.7,"to":2207.29,"location":2,"content":"And then after that, you can compute the input gate, uh,"},{"from":2207.29,"to":2210.16,"location":2,"content":"and that's computed in much the same way as the forget gate."},{"from":2210.16,"to":2215.24,"location":2,"content":"And then you use the input gate to decide which parts of this,"},{"from":2215.24,"to":2218.76,"location":2,"content":"uh, new cell content get written to the cell,"},{"from":2218.76,"to":2220.57,"location":2,"content":"and that gives you the cell c_t."},{"from":2220.57,"to":2224.24,"location":2,"content":"So, here you can see that you computed the impu ga- input gate and"},{"from":2224.24,"to":2228.77,"location":2,"content":"the new content and then you use that to gate that and write it to the cell."},{"from":2228.77,"to":2230.6,"location":2,"content":"So, now we've got our new cell c_t,"},{"from":2230.6,"to":2235.37,"location":2,"content":"and then the last thing we need to do is to compute our new output gate, that's o_t."},{"from":2235.37,"to":2239.78,"location":2,"content":"And then lastly, use the output gate to select which parts of"},{"from":2239.78,"to":2245.09,"location":2,"content":"the cell contents you're gonna read and put in the new hidden state 
h_t."},{"from":2245.09,"to":2247.31,"location":2,"content":"So, that's, that's, uh, that's"},{"from":2247.31,"to":2252.4,"location":2,"content":"the same thing as the equations we saw on the previous slide."},{"from":2252.4,"to":2255.03,"location":2,"content":"Okay. So, that's LSTMs."},{"from":2255.03,"to":2257.51,"location":2,"content":"Um, is there a question?"},{"from":2257.51,"to":2270.11,"location":2,"content":"What's the importance [NOISE] [inaudible]"},{"from":2270.11,"to":2272.84,"location":2,"content":"The question is, why are we applying a tan h"},{"from":2272.84,"to":2275.72,"location":2,"content":"on the very last equation on this, on this slide?"},{"from":2275.72,"to":2281.12,"location":2,"content":"Why we're planning a tan h to the cell before applying the output gate?"},{"from":2281.12,"to":2285.8,"location":2,"content":"Let's see. Um."},{"from":2285.8,"to":2290.33,"location":2,"content":"Yeah. So, your question is, the- the cell,"},{"from":2290.33,"to":2299.33,"location":2,"content":"the new cell content already went through a tan h. Um, I'm not sure."},{"from":2299.33,"to":2301.58,"location":2,"content":"So, I suppose a- a- a general answer is that it must"},{"from":2301.58,"to":2303.98,"location":2,"content":"be giving some kind of more expressivity in some way,"},{"from":2303.98,"to":2306.41,"location":2,"content":"and that it's not just applying"},{"from":2306.41,"to":2311.42,"location":2,"content":"tan h's sequentially because you do have the gates in between."},{"from":2311.42,"to":2314.24,"location":2,"content":"Um, so I suppose there must be a reason,"},{"from":2314.24,"to":2316.45,"location":2,"content":"kind of similarly to when you apply- apply"},{"from":2316.45,"to":2319.64,"location":2,"content":"a linear layer you won't have a non-linearity before the next linear layer."},{"from":2319.64,"to":2323.07,"location":2,"content":"I suppose maybe we're viewing these cases as a kind of linear layer?"},{"from":2323.07,"to":2324.92,"location":2,"content":"I'm not sure. I'll look it up."},{"from":2324.92,"to":2329.45,"location":2,"content":"[NOISE] Okay."},{"from":2329.45,"to":2332.07,"location":2,"content":"So, uh, that's LSTMs."},{"from":2332.07,"to":2334.22,"location":2,"content":"And, um, re- if you recall,"},{"from":2334.22,"to":2335.8,"location":2,"content":"we were- oh, question?"},{"from":2335.8,"to":2338.61,"location":2,"content":"Yeah. 
Why is it that in the forget gate,"},{"from":2338.61,"to":2342.18,"location":2,"content":"you don't look at the previous cell state but you just look at the new hidden state?"},{"from":2342.18,"to":2344.18,"location":2,"content":"Like it seems like if you're this- instead of"},{"from":2344.18,"to":2347.09,"location":2,"content":"deciding what to forget from the cell state, you should look at it."},{"from":2347.09,"to":2349.67,"location":2,"content":"So the question is, why is the forget gate"},{"from":2349.67,"to":2352.61,"location":2,"content":"computed only from the previous hidden state and the current input,"},{"from":2352.61,"to":2356.84,"location":2,"content":"why is it not computed based on c_t minus one itself, right?"},{"from":2356.84,"to":2358.82,"location":2,"content":"Because surely you want to look at the thing to figure"},{"from":2358.82,"to":2361.88,"location":2,"content":"out whether you want to forget it or not?"},{"from":2361.88,"to":2364.3,"location":2,"content":"Um, that's a pretty good question."},{"from":2364.3,"to":2369.74,"location":2,"content":"Uh, so, I suppose one reason why you might think that this- this works fine is that"},{"from":2369.74,"to":2373.04,"location":2,"content":"the LSTM might be learning a general algorithm"},{"from":2373.04,"to":2376.73,"location":2,"content":"for where it stores different types of information in the cell, right?"},{"from":2376.73,"to":2379.37,"location":2,"content":"So, maybe it's learning that in this particular position in the cell,"},{"from":2379.37,"to":2384.23,"location":2,"content":"I learn information about this particular semantic thing and then in this situation,"},{"from":2384.23,"to":2388.3,"location":2,"content":"I want to use that or not use that, forget it or keep it."},{"from":2388.3,"to":2391.58,"location":2,"content":"But, yeah, I haven't entirely convinced myself why you don't want to"},{"from":2391.58,"to":2395.09,"location":2,"content":"look at the contents of the cell itself in order to decide."},{"from":2395.09,"to":2402.53,"location":2,"content":"I suppose another thing to notice is that h_t minus one was read from c_t minus one."},{"from":2402.53,"to":2406.01,"location":2,"content":"So, I suppose there is some information there but not necessarily all of the information."},{"from":2406.01,"to":2411.09,"location":2,"content":"Ah, yeah."},{"from":2411.09,"to":2413.42,"location":2,"content":"I'm not sure, that's another thing I need to look up I guess."},{"from":2413.42,"to":2421.06,"location":2,"content":"[NOISE] Any other questions?"},{"from":2421.06,"to":2427.78,"location":2,"content":"Okay. 
Ah, so, that's LSTMs and,"},{"from":2427.78,"to":2431.82,"location":2,"content":"um, LSTMs were introduced to try to solve the vanishing gradient problem."},{"from":2431.82,"to":2433.64,"location":2,"content":"So, the question is, ah,"},{"from":2433.64,"to":2437.84,"location":2,"content":"how exactly is this architecture making the vanishing gradient problem any better?"},{"from":2437.84,"to":2441.71,"location":2,"content":"So, you could, ah, see that the LSTM architecture"},{"from":2441.71,"to":2445.93,"location":2,"content":"actually makes it easier for RNNs to preserve information over many time steps."},{"from":2445.93,"to":2448.34,"location":2,"content":"So, while it was kind of difficult for"},{"from":2448.34,"to":2452.28,"location":2,"content":"the vanilla RNN to preserve the information over all of the hidden states,"},{"from":2452.28,"to":2454.73,"location":2,"content":"there's actually a fairly easy strategy that makes"},{"from":2454.73,"to":2457.26,"location":2,"content":"it simple for the LSTM to preserve the information."},{"from":2457.26,"to":2462.29,"location":2,"content":"So, namely, if the forget gate is set to remember everything on every step, um,"},{"from":2462.29,"to":2465.29,"location":2,"content":"that's a fairly simple strategy that will ensure that"},{"from":2465.29,"to":2469.82,"location":2,"content":"the information in the cell is going to be preserved indefinitely over many time steps."},{"from":2469.82,"to":2473.11,"location":2,"content":"So, I don't know if that's actually a good strategy for whatever task you're trying to do,"},{"from":2473.11,"to":2475.99,"location":2,"content":"but my point is that there is at least, um,"},{"from":2475.99,"to":2480.68,"location":2,"content":"a fairly straightforward way for the LSTM to keep the information over many steps."},{"from":2480.68,"to":2485.18,"location":2,"content":"And as we noted that's relatively harder for the vanilla RNN to do."},{"from":2485.18,"to":2489.64,"location":2,"content":"So, you can think of this as the key reason why LSTMs are more able,"},{"from":2489.64,"to":2491.86,"location":2,"content":"ah, to preserve the information"},{"from":2491.86,"to":2494.82,"location":2,"content":"and thus are more robust to the vanishing gradient problem."},{"from":2494.82,"to":2498.59,"location":2,"content":"Ah, however, I think you should still know that LSTMs don't"},{"from":2498.59,"to":2502.18,"location":2,"content":"necessarily guarantee that we don't have a vanishing or exploding gradient problem."},{"from":2502.18,"to":2503.61,"location":2,"content":"You could still have that problem,"},{"from":2503.61,"to":2508.08,"location":2,"content":"but the thing to remember is that it's easier to avoid it anyway."},{"from":2508.08,"to":2511.42,"location":2,"content":"Okay. So, um, LSTMs, ah,"},{"from":2511.42,"to":2514.82,"location":2,"content":"have been shown to be more robust to the vanishing gradient problem,"},{"from":2514.82,"to":2517.18,"location":2,"content":"ah but I'm going to tell you a little about how they've"},{"from":2517.18,"to":2520.11,"location":2,"content":"actually been more successful in real life. You have a question?"},{"from":2520.11,"to":2541.6,"location":2,"content":"Yeah, [inaudible]"},{"from":2541.6,"to":2546.11,"location":2,"content":"Okay. 
So it's a great question."},{"from":2546.11,"to":2548.48,"location":2,"content":"The question is, why is it that just because you"},{"from":2548.48,"to":2551.39,"location":2,"content":"have these LSTM forward equations defined,"},{"from":2551.39,"to":2553.39,"location":2,"content":"why do you not have the vanishing gradient problem?"},{"from":2553.39,"to":2556.28,"location":2,"content":"Why does the- the logic about, ah,"},{"from":2556.28,"to":2560.06,"location":2,"content":"the chain rule kind of getting smaller and smaller or bigger and bigger not apply?"},{"from":2560.06,"to":2563.8,"location":2,"content":"So, I think the key here is that, um,"},{"from":2563.8,"to":2565.72,"location":2,"content":"in the vanilla RNN,"},{"from":2565.72,"to":2567.92,"location":2,"content":"the hidden states are kind of like a bottleneck, right?"},{"from":2567.92,"to":2570.62,"location":2,"content":"Like all gradients must pass through them."},{"from":2570.62,"to":2572.36,"location":2,"content":"So, if that gradient is small then,"},{"from":2572.36,"to":2574.59,"location":2,"content":"all downstream gradients will be small,"},{"from":2574.59,"to":2578.41,"location":2,"content":"whereas here you could regard the cell as being kind of like"},{"from":2578.41,"to":2580.82,"location":2,"content":"a shortcut connection at least in"},{"from":2580.82,"to":2584.3,"location":2,"content":"the case where the forget gate is set to remember things,"},{"from":2584.3,"to":2587.33,"location":2,"content":"um, then that's kind of like a shortcut connection where"},{"from":2587.33,"to":2591.02,"location":2,"content":"the cell will stay the same if you have the forget gate set to remember things."},{"from":2591.02,"to":2594.08,"location":2,"content":"So, if the cell is staying mostly the same,"},{"from":2594.08,"to":2597.53,"location":2,"content":"then you are not going to be,"},{"from":2597.53,"to":2600.2,"location":2,"content":"ah, having the vanishing gradient via the cell."},{"from":2600.2,"to":2602.51,"location":2,"content":"So, that means that to get a connection from"},{"from":2602.51,"to":2605.47,"location":2,"content":"the gradient of something in the future with respect to something in the past,"},{"from":2605.47,"to":2607.76,"location":2,"content":"there is a potential route for the gradient to"},{"from":2607.76,"to":2610.86,"location":2,"content":"go via the cell that doesn't necessarily vanish."},{"from":2610.86,"to":2612.86,"location":2,"content":"So in that, I have one more question."},{"from":2612.86,"to":2613.12,"location":2,"content":"Uh-huh."},{"from":2613.12,"to":2629.83,"location":2,"content":"Since we have a shortcut [inaudible]"},{"from":2629.83,"to":2633.68,"location":2,"content":"So I think the question was how do you check that your gradients are correct given that"},{"from":2633.68,"to":2637.89,"location":2,"content":"there are now multiple routes for information to travel?"},{"from":2637.89,"to":2638.45,"location":2,"content":"Right."},{"from":2638.45,"to":2641.78,"location":2,"content":"So, I suppose this somewhat relates to what we talked about last time with"},{"from":2641.78,"to":2644.66,"location":2,"content":"the multivariable chain rule about what is"},{"from":2644.66,"to":2648.82,"location":2,"content":"the derivative of the loss with respect to a repeated weight matrix and we saw that,"},{"from":2648.82,"to":2650.68,"location":2,"content":"if there are multiple routes then"},{"from":2650.68,"to":2653.33,"location":2,"content":"the multivariable chain rule says that you add up the 
gradients."},{"from":2653.33,"to":2656.68,"location":2,"content":"So, if your question is how do you do the calculus correctly and make sure it's correct,"},{"from":2656.68,"to":2658.19,"location":2,"content":"I guess you just kind of apply"},{"from":2658.19,"to":2659.66,"location":2,"content":"the multi-variable chain rule and it's more"},{"from":2659.66,"to":2661.46,"location":2,"content":"complicated than assessing with the LSTMs."},{"from":2661.46,"to":2664.09,"location":2,"content":"Ah if you're using PyTorch 14 you do not have to do that yourself,"},{"from":2664.09,"to":2665.78,"location":2,"content":"if you're going to implement it yourself then,"},{"from":2665.78,"to":2667.53,"location":2,"content":"you might have a more difficult time."},{"from":2667.53,"to":2670.78,"location":2,"content":"Um, yeah. So, I guess, yeah."},{"from":2670.78,"to":2678.56,"location":2,"content":"Okay. All right, so, what do we get to. All right."},{"from":2678.56,"to":2681.39,"location":2,"content":"So, let's talk about LSTMs and how they work in the- in the real world."},{"from":2681.39,"to":2684.95,"location":2,"content":"So, in the pretty recent past,"},{"from":2684.95,"to":2688.49,"location":2,"content":"2013-2015 um LSTM started achieving a lot of state of"},{"from":2688.49,"to":2692.32,"location":2,"content":"the art results on a variety of different tasks including for example,"},{"from":2692.32,"to":2694.37,"location":2,"content":"handwriting recognition, speech recognition,"},{"from":2694.37,"to":2697.8,"location":2,"content":"machine translation, parsing, image captioning."},{"from":2697.8,"to":2699.38,"location":2,"content":"So, over this period,"},{"from":2699.38,"to":2702.02,"location":2,"content":"LSTMs became the dominant approach in a lot of"},{"from":2702.02,"to":2708.23,"location":2,"content":"these application areas because they worked convincingly a lot better than vanilla RNNs."},{"from":2708.23,"to":2710.7,"location":2,"content":"However, today in 2019,"},{"from":2710.7,"to":2713.47,"location":2,"content":"things changed pretty fast in deep learning."},{"from":2713.47,"to":2716.24,"location":2,"content":"So, other approaches for example,"},{"from":2716.24,"to":2718.4,"location":2,"content":"transformers which you're going to learn about later in the class."},{"from":2718.4,"to":2720.41,"location":2,"content":"Ah, in some of these application areas,"},{"from":2720.41,"to":2721.72,"location":2,"content":"they seem to have become,"},{"from":2721.72,"to":2723.93,"location":2,"content":"ah, the dominant approach."},{"from":2723.93,"to":2725.48,"location":2,"content":"So, to look into this,"},{"from":2725.48,"to":2729.35,"location":2,"content":"I had a look at WMT which is a machine translation conference and"},{"from":2729.35,"to":2733.86,"location":2,"content":"also competition where people submit their MT systems to be evaluated."},{"from":2733.86,"to":2735.62,"location":2,"content":"And I looked at the report,"},{"from":2735.62,"to":2739.76,"location":2,"content":"the summary report for WMT 2016 and in this report,"},{"from":2739.76,"to":2741.13,"location":2,"content":"I did a quick Ctrl+F,"},{"from":2741.13,"to":2744.01,"location":2,"content":"and I found the word RNN appeared 44 times."},{"from":2744.01,"to":2746.99,"location":2,"content":"So, it seems that most people entering this competition were building"},{"from":2746.99,"to":2750.8,"location":2,"content":"their MT systems based on RNNs and in particular LSTMs."},{"from":2750.8,"to":2752.95,"location":2,"content":"And then I 
looked at the report from 2018,"},{"from":2752.95,"to":2755.21,"location":2,"content":"just two years later and I found that the RNN,"},{"from":2755.21,"to":2759.78,"location":2,"content":"the word RNN only appeared nine times and the word transformer appeared 63 times,"},{"from":2759.78,"to":2762.16,"location":2,"content":"and in fact the organizers noted that everyone,"},{"from":2762.16,"to":2764.33,"location":2,"content":"well, most people seem to be using transformers now."},{"from":2764.33,"to":2767.93,"location":2,"content":"So um, this shows that things change pretty fast in deep learning."},{"from":2767.93,"to":2771.35,"location":2,"content":"The thing that was hot and new just a few years ago um,"},{"from":2771.35,"to":2775.66,"location":2,"content":"is- is now being passed by perhaps by other kinds of approaches."},{"from":2775.66,"to":2777.26,"location":2,"content":"So, you're going to learn more about transformers"},{"from":2777.26,"to":2779.32,"location":2,"content":"later but I guess that gives you a kind of"},{"from":2779.32,"to":2784.39,"location":2,"content":"idea of where LSTMs are currently in applications."},{"from":2784.39,"to":2789.84,"location":2,"content":"Okay. So, the second kind of RNN we're going to learn about is gated recurrent units."},{"from":2789.84,"to":2792.61,"location":2,"content":"So, these fortunately are simpler than LSTMs,"},{"from":2792.61,"to":2796.07,"location":2,"content":"in fact that was the motivation for them being proposed."},{"from":2796.07,"to":2800.15,"location":2,"content":"They were proposed in 2014 as a way to try to retain"},{"from":2800.15,"to":2805.05,"location":2,"content":"the strengths of LSTMs by getting rid of any unnecessary complexities."},{"from":2805.05,"to":2806.82,"location":2,"content":"So, in a GRU,"},{"from":2806.82,"to":2808.52,"location":2,"content":"we don't have a cell state."},{"from":2808.52,"to":2810.47,"location":2,"content":"We again just have a hidden state."},{"from":2810.47,"to":2814.07,"location":2,"content":"But the thing it has in ah in common with LSTMs is that we're going to be"},{"from":2814.07,"to":2817.46,"location":2,"content":"using gates to control the flow of information."},{"from":2817.46,"to":2820.41,"location":2,"content":"So, here are the equations for GRU."},{"from":2820.41,"to":2822.89,"location":2,"content":"We start off with two gates."},{"from":2822.89,"to":2826.43,"location":2,"content":"So the first gate is called the update gate and this"},{"from":2826.43,"to":2831.05,"location":2,"content":"controls what parts of the hidden states are going to be updated versus preserved."},{"from":2831.05,"to":2833.75,"location":2,"content":"So, you can kind of view this as playing"},{"from":2833.75,"to":2837.17,"location":2,"content":"the role of both the forget gate and the input gate in"},{"from":2837.17,"to":2844.58,"location":2,"content":"the LSTM and it's computed in much the same way as the gates in the LSTM were."},{"from":2844.58,"to":2847.57,"location":2,"content":"The second gate is called the reset gate rt,"},{"from":2847.57,"to":2850.55,"location":2,"content":"and this gate is controlling which parts of"},{"from":2850.55,"to":2854.91,"location":2,"content":"the previous hidden state are going to be used to compute new contents."},{"from":2854.91,"to":2857.74,"location":2,"content":"So, you can think of the- the reset gate as kind of selecting"},{"from":2857.74,"to":2861.3,"location":2,"content":"which parts of the previous hidden states are useful versus not 
useful."},{"from":2861.3,"to":2865.01,"location":2,"content":"So, it's going to discard some things and select some other things."},{"from":2865.01,"to":2868.05,"location":2,"content":"Okay. So, here's how those gates get used."},{"from":2868.05,"to":2869.69,"location":2,"content":"Um, h tilde here."},{"from":2869.69,"to":2874.4,"location":2,"content":"This is you can think of it as the new hidden state contents and what's"},{"from":2874.4,"to":2876.44,"location":2,"content":"going on in that equation is that we are applying"},{"from":2876.44,"to":2879.41,"location":2,"content":"the reset gate to the previous hidden state ht minus"},{"from":2879.41,"to":2884.15,"location":2,"content":"one um and then putting all of that through some linear transformations and"},{"from":2884.15,"to":2887.42,"location":2,"content":"a tan H and then this gives us the new content"},{"from":2887.42,"to":2891.3,"location":2,"content":"which we want to write to the hidden cell."},{"from":2891.3,"to":2895.66,"location":2,"content":"And then lastly our new hidden cell is going to be a combination"},{"from":2895.66,"to":2900.07,"location":2,"content":"of ah this new content and the previous hidden state."},{"from":2900.07,"to":2904.79,"location":2,"content":"So, the important thing to notice here is that we have this one minus u and u term."},{"from":2904.79,"to":2907.37,"location":2,"content":"So um, it's kind of like a balance right?"},{"from":2907.37,"to":2911.08,"location":2,"content":"U is ah is setting the balance between"},{"from":2911.08,"to":2915.09,"location":2,"content":"preserving things from the previous hidden state versus writing new stuff."},{"from":2915.09,"to":2916.36,"location":2,"content":"So, whereas in the LSTM,"},{"from":2916.36,"to":2919.2,"location":2,"content":"those were two completely separate gates that could be whatever value."},{"from":2919.2,"to":2922.24,"location":2,"content":"Here we have this constraint that U is being uh, balanced."},{"from":2922.24,"to":2924.76,"location":2,"content":"So, if you have more of one, you have to have less of the other."},{"from":2924.76,"to":2931.16,"location":2,"content":"So, this is one way in which the creators of the GRU sought to make LSTMs more simple."},{"from":2931.16,"to":2934.52,"location":2,"content":"Was by having a single gate play both of these roles."},{"from":2934.52,"to":2939.41,"location":2,"content":"Okay. 
So, that's GRUs and I think it's a little less obvious just looking at it"},{"from":2939.41,"to":2944.87,"location":2,"content":"why GRUs help the vanishing gradients problem because there is no explicit ah memory"},{"from":2944.87,"to":2946.52,"location":2,"content":"cell, like there is in LSTMs."},{"from":2946.52,"to":2950.36,"location":2,"content":"So, I think the way to look at this here is um GRUs,"},{"from":2950.36,"to":2952.16,"location":2,"content":"you can view this as also being a solution to"},{"from":2952.16,"to":2955.22,"location":2,"content":"the vanishing gradient problem because like LSTMs,"},{"from":2955.22,"to":2959.43,"location":2,"content":"GRUs make it easier to retain information ah long-term."},{"from":2959.43,"to":2961.24,"location":2,"content":"So, for example here,"},{"from":2961.24,"to":2965.09,"location":2,"content":"if the update gate ut is set to zero,"},{"from":2965.09,"to":2970.39,"location":2,"content":"then we're going to be ah keeping the hidden state the same on every step."},{"from":2970.39,"to":2973.64,"location":2,"content":"And again that's maybe not a good idea but at least that is a strategy you can easily"},{"from":2973.64,"to":2977.32,"location":2,"content":"do in order to retain information over long distances."},{"from":2977.32,"to":2980.42,"location":2,"content":"So that's kind of like- like the same explanation of how GRUs make it"},{"from":2980.42,"to":2986.41,"location":2,"content":"potentially easier for RNNs to retain information long-term."},{"from":2986.41,"to":2991.49,"location":2,"content":"Okay. So, we've learned about these two different types of RNNs. Yes."},{"from":2991.49,"to":3008.23,"location":2,"content":"[inaudible]"},{"from":3008.23,"to":3010.11,"location":2,"content":"I think the question was,"},{"from":3010.11,"to":3012.79,"location":2,"content":"if we view the two gates in the GRU, as being, uh,"},{"from":3012.79,"to":3020.44,"location":2,"content":"a precise, um, analogy to the gates in the LSTM or are they more of a fuzzy analogy?"},{"from":3020.44,"to":3022.79,"location":2,"content":"I'd say probably more of a fuzzy analogy"},{"from":3022.79,"to":3026.08,"location":2,"content":"because there are other changes going on in here, like,"},{"from":3026.08,"to":3028.51,"location":2,"content":"for example, the fact that there's no separate, um,"},{"from":3028.51,"to":3032.26,"location":2,"content":"memory cell, it means they're not performing exactly the same functions."},{"from":3032.26,"to":3039.89,"location":2,"content":"Yeah. Okay. 
So, we've learned about LSTMs and GRUs which are both,"},{"from":3039.89,"to":3041.65,"location":2,"content":"um, more complicated forms of RNNs,"},{"from":3041.65,"to":3043.7,"location":2,"content":"more complicated than Vanilla RNNs."},{"from":3043.7,"to":3045.61,"location":2,"content":"And they are both,"},{"from":3045.61,"to":3048.86,"location":2,"content":"uh, more robust to the vanishing gradient problem."},{"from":3048.86,"to":3053.95,"location":2,"content":"So, um, it would be useful to know which of these should we be using in practice?"},{"from":3053.95,"to":3055.12,"location":2,"content":"Which one is more successful,"},{"from":3055.12,"to":3056.77,"location":2,"content":"the LSTM or GRU?"},{"from":3056.77,"to":3059.77,"location":2,"content":"Uh, so, I- I did a little reading and it looks like researchers have"},{"from":3059.77,"to":3062.89,"location":2,"content":"proposed a lot of different types of gated RNNs."},{"from":3062.89,"to":3064.45,"location":2,"content":"So, it's not just GRUs and LSTMs,"},{"from":3064.45,"to":3067.59,"location":2,"content":"there's many other papers with lots of other different variants."},{"from":3067.59,"to":3071.84,"location":2,"content":"Uh, but these are definitely the two that are most widely used."},{"from":3071.84,"to":3075.34,"location":2,"content":"And, ah, you can probably say that the biggest difference between the two, um,"},{"from":3075.34,"to":3077.92,"location":2,"content":"for sure is the fact that GRUs are simpler"},{"from":3077.92,"to":3081.11,"location":2,"content":"and quicker to compute and they have fewer parameters."},{"from":3081.11,"to":3083.5,"location":2,"content":"So, this makes an actual practical difference to you as, uh,"},{"from":3083.5,"to":3087.97,"location":2,"content":"a deep learning practitioner because if you build your net based on GRUs,"},{"from":3087.97,"to":3089.97,"location":2,"content":"then it's gonna be faster to run forwards and,"},{"from":3089.97,"to":3092.17,"location":2,"content":"you know, faster to train and so on."},{"from":3092.17,"to":3094.54,"location":2,"content":"So, other than that, there appears to be"},{"from":3094.54,"to":3098.68,"location":2,"content":"no very conclusive evidence that one of these LSTM or GRUs,"},{"from":3098.68,"to":3102.79,"location":2,"content":"uh, is consistently outperforming the other on lots of different tasks."},{"from":3102.79,"to":3105.95,"location":2,"content":"Uh, it seems that often, uh,"},{"from":3105.95,"to":3108.25,"location":2,"content":"sometimes GRUs do perform as well as LSTMs,"},{"from":3108.25,"to":3111.52,"location":2,"content":"but there are cases where one of them performs better than the other."},{"from":3111.52,"to":3113.44,"location":2,"content":"So, as a rule of thumb,"},{"from":3113.44,"to":3117.19,"location":2,"content":"it seems like LSTM is often a good default choice to start with, uh,"},{"from":3117.19,"to":3118.84,"location":2,"content":"especially if your data has"},{"from":3118.84,"to":3121.15,"location":2,"content":"particularly long dependencies because there's evidence to think"},{"from":3121.15,"to":3125.5,"location":2,"content":"that LSTMs might be slightly better at keeping information over very long distances."},{"from":3125.5,"to":3127.57,"location":2,"content":"And also, if you have a lot of training data,"},{"from":3127.57,"to":3129.67,"location":2,"content":"you might think that LSTMs are a better choice because they"},{"from":3129.67,"to":3132.34,"location":2,"content":"have more parameters which means 
that,"},{"from":3132.34,"to":3137.94,"location":2,"content":"um, maybe you need more train data to learn them."},{"from":3137.94,"to":3141.7,"location":2,"content":"So, a rule of thumb is that maybe you want to start with LSTMs"},{"from":3141.7,"to":3143.5,"location":2,"content":"and if you're happy with their performance and you're"},{"from":3143.5,"to":3145.68,"location":2,"content":"happy with how long it takes to train, then you stick with that."},{"from":3145.68,"to":3147.61,"location":2,"content":"But if you feel like you need it to be more efficient,"},{"from":3147.61,"to":3150.85,"location":2,"content":"then maybe you should switch to GRUs and see how that goes with the performance"},{"from":3150.85,"to":3154.69,"location":2,"content":"and if it's faster. All right."},{"from":3154.69,"to":3156.97,"location":2,"content":"So, um, we've talked so far about how"},{"from":3156.97,"to":3160.72,"location":2,"content":"the vanishing/exploding gradients are a problem that occur a lot in RNNs."},{"from":3160.72,"to":3162.41,"location":2,"content":"But, um, the question is,"},{"from":3162.41,"to":3163.74,"location":2,"content":"is it only an RNN problem?"},{"from":3163.74,"to":3166.33,"location":2,"content":"Does this occur in other kinds of neural networks as well?"},{"from":3166.33,"to":3168.11,"location":2,"content":"And the answer is,"},{"from":3168.11,"to":3170.17,"location":2,"content":"uh, no, it's not just an RNN problem."},{"from":3170.17,"to":3172.39,"location":2,"content":"In fact, vanishing and exploding gradients are a"},{"from":3172.39,"to":3175.15,"location":2,"content":"pretty significant problem for"},{"from":3175.15,"to":3178.42,"location":2,"content":"most neural architecture such as feed-forward and convolutional,"},{"from":3178.42,"to":3179.74,"location":2,"content":"especially when they're deep."},{"from":3179.74,"to":3182.41,"location":2,"content":"And this is a really serious problem because there's no point having"},{"from":3182.41,"to":3186.66,"location":2,"content":"a really cool neural architecture if you can't learn it efficiently because of the,"},{"from":3186.66,"to":3188.5,"location":2,"content":"uh, vanishing gradient problem."},{"from":3188.5,"to":3193.03,"location":2,"content":"So, in particular, uh, in these feed-forward and convolutional networks, uh,"},{"from":3193.03,"to":3195.28,"location":2,"content":"you often have a gradient becoming vanishingly"},{"from":3195.28,"to":3198.52,"location":2,"content":"small over back-propagation, uh, because of the Chain Rule,"},{"from":3198.52,"to":3200.2,"location":2,"content":"because of this multiplying by"},{"from":3200.2,"to":3202.39,"location":2,"content":"all these different intermediate gradients or"},{"from":3202.39,"to":3205.12,"location":2,"content":"sometimes due to your choice of non-linearity function."},{"from":3205.12,"to":3209.26,"location":2,"content":"So, if this happens, this means that your- the lower layers of your, let's say,"},{"from":3209.26,"to":3211.28,"location":2,"content":"convolutional or feed-forward network,"},{"from":3211.28,"to":3212.95,"location":2,"content":"they have a much smaller,"},{"from":3212.95,"to":3215.93,"location":2,"content":"uh, gradient than the high levels."},{"from":3215.93,"to":3219.97,"location":2,"content":"And this means that they get changed very slowly during SGD."},{"from":3219.97,"to":3221.29,"location":2,"content":"So, this means that, overall,"},{"from":3221.29,"to":3223.99,"location":2,"content":"your network is very slow to train because when you take 
updates,"},{"from":3223.99,"to":3227.08,"location":2,"content":"then your lower layers are changing very slowly."},{"from":3227.08,"to":3229.3,"location":2,"content":"So, one solution, uh,"},{"from":3229.3,"to":3231.28,"location":2,"content":"the kind of like a family of solutions that we've seen in"},{"from":3231.28,"to":3233.62,"location":2,"content":"recent years is that there's been lots of"},{"from":3233.62,"to":3238.72,"location":2,"content":"proposals for new types of deep feed-forward or convolutional architectures."},{"from":3238.72,"to":3242.74,"location":2,"content":"And what they do is, they add more direct connections in the network."},{"from":3242.74,"to":3244.33,"location":2,"content":"And the- the idea,"},{"from":3244.33,"to":3245.76,"location":2,"content":"kind of as we talked about before,"},{"from":3245.76,"to":3248.53,"location":2,"content":"is that if you add all of these direct connections between layers,"},{"from":3248.53,"to":3252.09,"location":2,"content":"like maybe not just adjacent layers but further apart layers,"},{"from":3252.09,"to":3254.68,"location":2,"content":"then it makes it much easier for the gradients to flow,"},{"from":3254.68,"to":3257.64,"location":2,"content":"and you're going to find it easier to train your network overall."},{"from":3257.64,"to":3259.66,"location":2,"content":"So, I'm going to show you some examples of these in"},{"from":3259.66,"to":3261.67,"location":2,"content":"particular because it's fairly likely you're going to"},{"from":3261.67,"to":3265.7,"location":2,"content":"run into these kinds of architectures when you're doing your projects and reading papers."},{"from":3265.7,"to":3269.41,"location":2,"content":"So, one example is something called residual connections or,"},{"from":3269.41,"to":3272.51,"location":2,"content":"uh, the network itself is sometimes referred to as ResNet."},{"from":3272.51,"to":3276.35,"location":2,"content":"And here we've got a figure from the related paper."},{"from":3276.35,"to":3281.07,"location":2,"content":"So, what's going on in this diagram is that you have, uh,"},{"from":3281.07,"to":3282.93,"location":2,"content":"the usual kind of you've got weight layer and"},{"from":3282.93,"to":3285.48,"location":2,"content":"a non-linearity which is ReLU, and another weight layer."},{"from":3285.48,"to":3289.18,"location":2,"content":"So, if you regard that function as being f of x, ah,"},{"from":3289.18,"to":3290.7,"location":2,"content":"what they're doing is instead of just, ah,"},{"from":3290.7,"to":3292.54,"location":2,"content":"transforming x to f of x,"},{"from":3292.54,"to":3295.15,"location":2,"content":"the- they're taking f of x plus x."},{"from":3295.15,"to":3298.43,"location":2,"content":"So they're adding this identity skip connection where"},{"from":3298.43,"to":3301.99,"location":2,"content":"the input x is skipped over those two layers and then,"},{"from":3301.99,"to":3305.47,"location":2,"content":"um, added to the output of the two layers."},{"from":3305.47,"to":3307.51,"location":2,"content":"So, the reason why this is a good idea,"},{"from":3307.51,"to":3310.15,"location":2,"content":"uh, also known as skip connections,"},{"from":3310.15,"to":3315.28,"location":2,"content":"is that the identity connection is going to preserve information by default, right?"},{"from":3315.28,"to":3318.01,"location":2,"content":"So, if you imagine perhaps if you, um,"},{"from":3318.01,"to":3320.11,"location":2,"content":"initialize your network and 
you"},{"from":3320.11,"to":3322.6,"location":2,"content":"initialize your weight layers to have small random values,"},{"from":3322.6,"to":3324.84,"location":2,"content":"then if they're small and kind of close to zero,"},{"from":3324.84,"to":3328.89,"location":2,"content":"then you're going to have something like a noisy identity function, right?"},{"from":3328.89,"to":3332.24,"location":2,"content":"So you're going to be preserving information by default through all of your layers."},{"from":3332.24,"to":3333.52,"location":2,"content":"And if you have a very deep network,"},{"from":3333.52,"to":3335.11,"location":2,"content":"that means that even often many,"},{"from":3335.11,"to":3340.32,"location":2,"content":"um, many layers, you're still gonna have something like your original input."},{"from":3340.32,"to":3344.22,"location":2,"content":"So, uh, the- the people who wrote this paper, they show that, uh,"},{"from":3344.22,"to":3346.66,"location":2,"content":"if you don't have something like skip connections then"},{"from":3346.66,"to":3349.57,"location":2,"content":"actually you can find that deep layers- uh,"},{"from":3349.57,"to":3353.24,"location":2,"content":"deep networks perform worse on some tasks than shallow networks."},{"from":3353.24,"to":3354.95,"location":2,"content":"Not because they're not expressive enough,"},{"from":3354.95,"to":3356.66,"location":2,"content":"but because they're too difficult to learn."},{"from":3356.66,"to":3358.28,"location":2,"content":"So, when you attempt to learn deep networks,"},{"from":3358.28,"to":3359.95,"location":2,"content":"it just doesn't learn effectively and you end up"},{"from":3359.95,"to":3361.8,"location":2,"content":"getting worse performance in the shallow network."},{"from":3361.8,"to":3363.07,"location":2,"content":"So, the people who wrote this paper,"},{"from":3363.07,"to":3365.05,"location":2,"content":"they show that when they add these skip connections,"},{"from":3365.05,"to":3367.11,"location":2,"content":"then they made the deep networks, uh,"},{"from":3367.11,"to":3372,"location":2,"content":"much more effective and they managed to get good performance."},{"from":3372,"to":3375.28,"location":2,"content":"Uh, so another example which kinda take this- this idea"},{"from":3375.28,"to":3378.22,"location":2,"content":"further is something called dense connections or DenseNet."},{"from":3378.22,"to":3379.59,"location":2,"content":"And again, this was, uh,"},{"from":3379.59,"to":3383.97,"location":2,"content":"something proposed I think in a feed-forward or or convolutional setting."},{"from":3383.97,"to":3386.77,"location":2,"content":"And, ah, it's just kind of the same as skip connections but except ,"},{"from":3386.77,"to":3388.03,"location":2,"content":"um, connects everything to everything."},{"from":3388.03,"to":3390.07,"location":2,"content":"So, add more of these skip connections kind of"},{"from":3390.07,"to":3392.68,"location":2,"content":"from all layers to all layers and they showed that this,"},{"from":3392.68,"to":3394.48,"location":2,"content":"uh, performs even better."},{"from":3394.48,"to":3397.45,"location":2,"content":"And, uh, the last one I want to talk about which I don't have a picture"},{"from":3397.45,"to":3400.21,"location":2,"content":"for is something called highway connections."},{"from":3400.21,"to":3403.18,"location":2,"content":"So, this is similar to the residual or skip connections."},{"from":3403.18,"to":3406.59,"location":2,"content":"Ah, but the idea is that instead of just adding your 
x,"},{"from":3406.59,"to":3408.64,"location":2,"content":"adding your identity, uh, connection,"},{"from":3408.64,"to":3412.06,"location":2,"content":"the idea is that you're gonna have a gate that controls the balance between, um,"},{"from":3412.06,"to":3415.96,"location":2,"content":"adding the identity and computing, ah, the transformation."},{"from":3415.96,"to":3418.41,"location":2,"content":"So, instead of f of x plus x, you're gonna have, you know,"},{"from":3418.41,"to":3419.99,"location":2,"content":"gate times f of x plus, you know,"},{"from":3419.99,"to":3422.11,"location":2,"content":"one minus gate times x, something like that."},{"from":3422.11,"to":3425.56,"location":2,"content":"Um, so, this work was actually inspired by LSTMs,"},{"from":3425.56,"to":3427.51,"location":2,"content":"but instead of applying it to a recurrent setting,"},{"from":3427.51,"to":3433.86,"location":2,"content":"they were seeking to apply it to a feed-forward setting."},{"from":3433.86,"to":3436.12,"location":2,"content":"Okay. I'm gonna keep going for now."},{"from":3436.12,"to":3439.39,"location":2,"content":"Um. So, overall the question was,"},{"from":3439.39,"to":3440.62,"location":2,"content":"you know, how much uh,"},{"from":3440.62,"to":3443.65,"location":2,"content":"vanishing and exploding gradients a problem outside of the setting of RNNs?"},{"from":3443.65,"to":3446.71,"location":2,"content":"And I think uh, the important takeaway is that it is a big problem"},{"from":3446.71,"to":3450.59,"location":2,"content":"but you should notice that it is particularly a problem for RNNs."},{"from":3450.59,"to":3454,"location":2,"content":"So, um, RNNs are particularly unstable and"},{"from":3454,"to":3457.53,"location":2,"content":"this is essentially due to the repeated multiplication by the same weight matrix."},{"from":3457.53,"to":3459.18,"location":2,"content":"If you remember from last time, um,"},{"from":3459.18,"to":3461.89,"location":2,"content":"the characteristic thing about RNNs that makes them recurrent is"},{"from":3461.89,"to":3464.77,"location":2,"content":"the fact that you are applying the same weight matrix over and over again."},{"from":3464.77,"to":3467.09,"location":2,"content":"So, this is actually the core reason"},{"from":3467.09,"to":3469.86,"location":2,"content":"why they are so prone to the vanishing and exploding gradients,"},{"from":3469.86,"to":3473.67,"location":2,"content":"and ah, you can see some more information about that in the paper."},{"from":3473.67,"to":3477.73,"location":2,"content":"Okay. So, I know there's been a lot of dense information today,"},{"from":3477.73,"to":3479.82,"location":2,"content":"a lot of um, lot of notation."},{"from":3479.82,"to":3481.69,"location":2,"content":"So, here's a recap, if I've lost you at any point."},{"from":3481.69,"to":3483.28,"location":2,"content":"Now's a good time to jump back in because it's gonna"},{"from":3483.28,"to":3485.51,"location":2,"content":"get a little easier to understand perhaps."},{"from":3485.51,"to":3487.6,"location":2,"content":"So, okay, recap. 
What have we learned about today?"},{"from":3487.6,"to":3490.43,"location":2,"content":"Um, the first thing we learned about was the vanishing gradient problem."},{"from":3490.43,"to":3491.83,"location":2,"content":"We learned uh, what it is."},{"from":3491.83,"to":3495.58,"location":2,"content":"We learned why it happens and we saw why it's bad for RNNs,"},{"from":3495.58,"to":3498.09,"location":2,"content":"for example, RNN language models."},{"from":3498.09,"to":3501.64,"location":2,"content":"Ah, and we also learned about LSTMs and GRUs which are"},{"from":3501.64,"to":3505.48,"location":2,"content":"more complicated RNNs and they use gates to control the flow of information."},{"from":3505.48,"to":3509.39,"location":2,"content":"And by doing that, they are more resilient to the vanishing gradient problem."},{"from":3509.39,"to":3511.39,"location":2,"content":"Okay. So, for the remainder of this lecture,"},{"from":3511.39,"to":3512.74,"location":2,"content":"I think we've got about 20 minutes left,"},{"from":3512.74,"to":3516.45,"location":2,"content":"ah, we're going to be learning about two more advanced types of RNNs."},{"from":3516.45,"to":3519.04,"location":2,"content":"So, the first one is bidirectional RNNs and that's all"},{"from":3519.04,"to":3523.11,"location":2,"content":"about information flowing left to right and right to left."},{"from":3523.11,"to":3524.91,"location":2,"content":"And then we're also going to learn about"},{"from":3524.91,"to":3529.78,"location":2,"content":"multi-layer RNNs which is when you apply multiple RNNs on top of each other."},{"from":3529.78,"to":3534.1,"location":2,"content":"So, I'd say that both of these are pretty simple conceptually."},{"from":3534.1,"to":3536.91,"location":2,"content":"Um, so it shouldn't be too hard to understand."},{"from":3536.91,"to":3540.16,"location":2,"content":"All right, so let's start with bidirectional RNNs."},{"from":3540.16,"to":3544.22,"location":2,"content":"Um, this is a picture which you saw at the end of last lecture."},{"from":3544.22,"to":3545.3,"location":2,"content":"So, if you remember,"},{"from":3545.3,"to":3547.69,"location":2,"content":"sentiment classification is the task when you have"},{"from":3547.69,"to":3550.15,"location":2,"content":"some kind of input sentence such as the movie was"},{"from":3550.15,"to":3555.46,"location":2,"content":"terribly exciting and you want to classify this as a positive or negative sentiment."},{"from":3555.46,"to":3559.68,"location":2,"content":"So, in this example, it should be seen as positive sentiment."},{"from":3559.68,"to":3563.65,"location":2,"content":"So, um, this is an example of how you might try to"},{"from":3563.65,"to":3566.86,"location":2,"content":"solve sentiment classification using a fairly simple RNN model."},{"from":3566.86,"to":3569.83,"location":2,"content":"Ah, here we're using the RNN as a kind of encoder of"},{"from":3569.83,"to":3572.89,"location":2,"content":"the sentence and the hidden states represent the sentence."},{"from":3572.89,"to":3575.74,"location":2,"content":"And we'll do some kind of combination of the hidden states to compute uh,"},{"from":3575.74,"to":3577.76,"location":2,"content":"what we think the sentiment is."},{"from":3577.76,"to":3580.16,"location":2,"content":"So, my question is, if we look at let's say,"},{"from":3580.16,"to":3584.35,"location":2,"content":"the hidden state that corresponds to the word terribly and we're regarding"},{"from":3584.35,"to":3586.42,"location":2,"content":"this hidden state as a 
representation of the word"},{"from":3586.42,"to":3589.51,"location":2,"content":"terribly in the context of the sentence."},{"from":3589.51,"to":3593.45,"location":2,"content":"So, for this reason we- we sometimes call hidden states in this kind of situation"},{"from":3593.45,"to":3595.93,"location":2,"content":"a contextual representation because the idea is that it's"},{"from":3595.93,"to":3599.78,"location":2,"content":"a representation of the word terribly in the context of the sentence."},{"from":3599.78,"to":3604.15,"location":2,"content":"So, thing to think about here is that this contextual representation,"},{"from":3604.15,"to":3607.16,"location":2,"content":"it only contains information about the left context."},{"from":3607.16,"to":3610.15,"location":2,"content":"So, for terribly, the left context is the words um,"},{"from":3610.15,"to":3613.12,"location":2,"content":"the movie was and this hidden state the one that's got"},{"from":3613.12,"to":3616.43,"location":2,"content":"a blue box around it has only seen information to the left."},{"from":3616.43,"to":3620.49,"location":2,"content":"It hasn't seen the information of the words exciting or exclamation mark."},{"from":3620.49,"to":3624.72,"location":2,"content":"So, what we're asking is what about the right context?"},{"from":3624.72,"to":3628.69,"location":2,"content":"The right context of terribly is- is what exciting and the exclamation mark."},{"from":3628.69,"to":3633.04,"location":2,"content":"And do we think that the right context is useful here?"},{"from":3633.04,"to":3635.23,"location":2,"content":"Do we think that this is something we want to know about?"},{"from":3635.23,"to":3637.41,"location":2,"content":"And I would argue that in this example,"},{"from":3637.41,"to":3641.7,"location":2,"content":"it is actually kind of important because we've got the phrase terribly exciting."},{"from":3641.7,"to":3644.83,"location":2,"content":"And if you look at the word terribly in isolation,"},{"from":3644.83,"to":3647.01,"location":2,"content":"terrible or terribly usually means something bad, right?"},{"from":3647.01,"to":3650.65,"location":2,"content":"But terribly exciting, you can mean something good because it just means very exciting."},{"from":3650.65,"to":3653.23,"location":2,"content":"So, if you know about the right context,"},{"from":3653.23,"to":3656.68,"location":2,"content":"the word exciting then this might quite significantly"},{"from":3656.68,"to":3658.9,"location":2,"content":"modify your perception of the meaning of the word"},{"from":3658.9,"to":3661.21,"location":2,"content":"terribly in the context of the sentence."},{"from":3661.21,"to":3663.79,"location":2,"content":"And especially given that we're trying to do sentiment classification,"},{"from":3663.79,"to":3665.82,"location":2,"content":"this is- this is kind of important."},{"from":3665.82,"to":3670.15,"location":2,"content":"So this motivates why you might want to have information"},{"from":3670.15,"to":3673.91,"location":2,"content":"from both the left and the right when you're making your representations."},{"from":3673.91,"to":3676,"location":2,"content":"Ah, if when you were a kid,"},{"from":3676,"to":3678.53,"location":2,"content":"your parents told you to look both ways before you cross the street."},{"from":3678.53,"to":3680.62,"location":2,"content":"You might regard it as the same kind of idea that there's"},{"from":3680.62,"to":3682.51,"location":2,"content":"useful information to the left and the right 
that"},{"from":3682.51,"to":3684.97,"location":2,"content":"you'd like to know about ah, before you do anything."},{"from":3684.97,"to":3687.93,"location":2,"content":"Okay. So that's the motivation and um,"},{"from":3687.93,"to":3691.9,"location":2,"content":"here is how a bidirectional RNN might work in practice."},{"from":3691.9,"to":3695.07,"location":2,"content":"I have a kind of accidentally festive color scheme here."},{"from":3695.07,"to":3698.76,"location":2,"content":"And so the idea is that you have two RNNs going on."},{"from":3698.76,"to":3702.88,"location":2,"content":"You have the forward RNN as before that encodes the sentence left to right."},{"from":3702.88,"to":3706,"location":2,"content":"And then separately, you also have a backwards RNN."},{"from":3706,"to":3709.14,"location":2,"content":"And this has completely separate weights to the forward RNN."},{"from":3709.14,"to":3712.66,"location":2,"content":"So, the backward RNN is just doing the same thing"},{"from":3712.66,"to":3716.11,"location":2,"content":"except that it's encoding the sequence from right to left."},{"from":3716.11,"to":3719.98,"location":2,"content":"So, each of the hidden states is computed based on the one to the right."},{"from":3719.98,"to":3722.5,"location":2,"content":"And then finally, you just take the hidden states from"},{"from":3722.5,"to":3726.7,"location":2,"content":"the two RNNs and then you concatenate them together and you've got your uh,"},{"from":3726.7,"to":3729.39,"location":2,"content":"your final kind of representations."},{"from":3729.39,"to":3732.03,"location":2,"content":"So, in particular, if we now think about"},{"from":3732.03,"to":3736.33,"location":2,"content":"this contextual representation of the word terribly in the context,"},{"from":3736.33,"to":3742.18,"location":2,"content":"um, this- this vector has information from both the left and the right, right?"},{"from":3742.18,"to":3744.24,"location":2,"content":"Because you had the forwards and backwards RNNs that"},{"from":3744.24,"to":3747.3,"location":2,"content":"respectively had information from both left and right."},{"from":3747.3,"to":3750.19,"location":2,"content":"So the idea is that these concatenated hidden states,"},{"from":3750.19,"to":3754.74,"location":2,"content":"those can be regarded as kind of like the outputs of the bidirectional RNN."},{"from":3754.74,"to":3756.46,"location":2,"content":"Like if you're going to use these hidden states for"},{"from":3756.46,"to":3758.68,"location":2,"content":"any kind of further computation, then ah,"},{"from":3758.68,"to":3760.78,"location":2,"content":"it's these concatenated hidden states that you are going to be"},{"from":3760.78,"to":3764.25,"location":2,"content":"passing on to the next part of the network."},{"from":3764.25,"to":3768.2,"location":2,"content":"Um, here- here are the equations that just say the same thing."},{"from":3768.2,"to":3771.79,"location":2,"content":"So, you have your forward RNN and here we've got ah,"},{"from":3771.79,"to":3773.99,"location":2,"content":"a notation that you might not have seen before"},{"from":3773.99,"to":3777.01,"location":2,"content":"this kind of notation where it says RNN and then in brackets,"},{"from":3777.01,"to":3780.78,"location":2,"content":"the previous hidden state and the input that's simply saying that you know,"},{"from":3780.78,"to":3784.18,"location":2,"content":"HT is computed from the previous hidden state and the input."},{"from":3784.18,"to":3788.59,"location":2,"content":"And RNN forward could 
be a vanilla or a GRU or an LSTM."},{"from":3788.59,"to":3790.81,"location":2,"content":"It doesn't really matter, we're looking at it abstractly."},{"from":3790.81,"to":3796.1,"location":2,"content":"So, you have these two separate RNNs,"},{"from":3796.1,"to":3799.54,"location":2,"content":"RNN forwards and RNN backwards and generally, these have separate weights."},{"from":3799.54,"to":3801.91,"location":2,"content":"Although I have seen some papers where they have shared weights."},{"from":3801.91,"to":3803.88,"location":2,"content":"So, it seems that sometimes that does work better,"},{"from":3803.88,"to":3806.79,"location":2,"content":"perhaps maybe when you have enough training data."},{"from":3806.79,"to":3812.02,"location":2,"content":"And then finally, we regard these concatenated hidden states which you might just"},{"from":3812.02,"to":3818.55,"location":2,"content":"denote ht as being like the hidden state of the bidirectional RNN."},{"from":3818.55,"to":3822.55,"location":2,"content":"So, um, the previous diagram is pretty unwieldy."},{"from":3822.55,"to":3824.39,"location":2,"content":"So here's a simplified diagram."},{"from":3824.39,"to":3826.24,"location":2,"content":"And this is probably the only kind of diagram you're going to"},{"from":3826.24,"to":3828.7,"location":2,"content":"see from now on to denote bidirectional RNNs."},{"from":3828.7,"to":3830.77,"location":2,"content":"Um, so, what we've done here is you've just"},{"from":3830.77,"to":3833.57,"location":2,"content":"made all of the horizontal arrows go left and right ah,"},{"from":3833.57,"to":3836.26,"location":2,"content":"to represent that this is a bidirectional RNN."},{"from":3836.26,"to":3840.37,"location":2,"content":"So, the other thing you should assume is that the hidden states depicted here, you know,"},{"from":3840.37,"to":3844.24,"location":2,"content":"these red- red trying- red rectangles with the dots."},{"from":3844.24,"to":3846.58,"location":2,"content":"You can assume that those are the concatenated forwards,"},{"from":3846.58,"to":3848.59,"location":2,"content":"backwards hidden states from the bidirectional RNN."},{"from":3848.59,"to":3856,"location":2,"content":"[inaudible]"},{"from":3856,"to":3858.41,"location":2,"content":"Okay. So the question is, um,"},{"from":3858.41,"to":3862.06,"location":2,"content":"would you train your forwards and backwards RNNs kind of separately,"},{"from":3862.06,"to":3863.89,"location":2,"content":"um, on some kind of task and then"},{"from":3863.89,"to":3866.66,"location":2,"content":"maybe concatenate them together once they're separately trained networks,"},{"from":3866.66,"to":3868.28,"location":2,"content":"or would you train them all together?"},{"from":3868.28,"to":3872.2,"location":2,"content":"Um, it seems to me that it's much more common to train them together,"},{"from":3872.2,"to":3875.23,"location":2,"content":"but I don- I don't think I've heard of anyone training them separately."},{"from":3875.23,"to":3877.27,"location":2,"content":"Uh, so yeah, it seems like the standard practice is usually"},{"from":3877.27,"to":3879.16,"location":2,"content":"to train them together. 
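In PyTorch, for instance, this concatenation of forward and backward hidden states is what the `bidirectional=True` flag gives you; the sizes below are arbitrary, used only to show the shapes.

```python
import torch
import torch.nn as nn

# Hypothetical sizes: 100-dim word vectors, 64-dim hidden states per direction.
birnn = nn.LSTM(input_size=100, hidden_size=64,
                bidirectional=True, batch_first=True)

x = torch.randn(8, 12, 100)  # a batch of 8 sentences, 12 words each
outputs, _ = birnn(x)
# Each timestep's output is the forward and backward hidden states
# concatenated, hence 2 * 64 = 128 dimensions.
print(outputs.shape)  # torch.Size([8, 12, 128])
```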
Does that make sense?"},{"from":3879.16,"to":3893.29,"location":2,"content":"[inaudible]."},{"from":3893.29,"to":3895.69,"location":2,"content":"So, let's suppose that we were trying to build"},{"from":3895.69,"to":3899.44,"location":2,"content":"a sentiment classification system using the bidirectional RNN."},{"from":3899.44,"to":3903.46,"location":2,"content":"Then what you do, which maybe I should have pictured but I didn't have space, is uh,"},{"from":3903.46,"to":3907.42,"location":2,"content":"you would do the same thing that you were doing with the unidirectional RNN, uh,"},{"from":3907.42,"to":3910.05,"location":2,"content":"which was, let's say an element y is min or max,"},{"from":3910.05,"to":3911.66,"location":2,"content":"um, to get your sentence encoding."},{"from":3911.66,"to":3916.95,"location":2,"content":"Maybe you just do that but over the concatenated, um, n states."},{"from":3916.95,"to":3920.56,"location":2,"content":"Okay. So, an important thing to note is that, uh,"},{"from":3920.56,"to":3923.02,"location":2,"content":"when talking about applying bidirectional RNNs,"},{"from":3923.02,"to":3926.93,"location":2,"content":"we've assumed that we actually have access to the entire input sequence."},{"from":3926.93,"to":3928.78,"location":2,"content":"So, we assume that we have the full sentence,"},{"from":3928.78,"to":3931.57,"location":2,"content":"uh, the movie was very exciting, and,"},{"from":3931.57,"to":3934.74,"location":2,"content":"uh, that, that was a necessary assumption in order to"},{"from":3934.74,"to":3937.93,"location":2,"content":"be able to run the forwards and the backwards RNN, right?"},{"from":3937.93,"to":3940.93,"location":2,"content":"Um, so there are some situations where you can't assume this."},{"from":3940.93,"to":3943.18,"location":2,"content":"Like, for example, in Language Modeling,"},{"from":3943.18,"to":3947.36,"location":2,"content":"you only have access to the left context kind of by definition of the task."},{"from":3947.36,"to":3948.99,"location":2,"content":"You only know the words that have come so far."},{"from":3948.99,"to":3950.41,"location":2,"content":"You don't know what's coming next."},{"from":3950.41,"to":3954.07,"location":2,"content":"So, you can't use a bidirectional RNN, uh,"},{"from":3954.07,"to":3955.53,"location":2,"content":"to do Language Modeling, uh,"},{"from":3955.53,"to":3957.76,"location":2,"content":"in the way that we've depicted here because uh,"},{"from":3957.76,"to":3959.82,"location":2,"content":"you don't have the full sequence."},{"from":3959.82,"to":3963.11,"location":2,"content":"However, if you do have access to the entire sequence."},{"from":3963.11,"to":3965.23,"location":2,"content":"Uh, so, for example, if you're doing any kind of encoding"},{"from":3965.23,"to":3967.49,"location":2,"content":"similar to the sentiment example,"},{"from":3967.49,"to":3971.68,"location":2,"content":"uh, then bidirectionally- bidirectionality is pretty powerful."},{"from":3971.68,"to":3974.82,"location":2,"content":"And you should probably regard it as a good thing to do by default uh,"},{"from":3974.82,"to":3976.87,"location":2,"content":"because it turns out that getting this information from"},{"from":3976.87,"to":3978.8,"location":2,"content":"both the left and the right, uh,"},{"from":3978.8,"to":3983.72,"location":2,"content":"makes it a lot easier to learn these more useful contextual representations."},{"from":3983.72,"to":3985.87,"location":2,"content":"So, in particular, as a preview 
of"},{"from":3985.87,"to":3988.03,"location":2,"content":"something you're going to learn about later in the class, uh,"},{"from":3988.03,"to":3990.61,"location":2,"content":"there's a model called BERT, B-E-R-T,"},{"from":3990.61,"to":3994.33,"location":2,"content":"and that stands for Bidirectional Encoder Representations from Transformers."},{"from":3994.33,"to":3996.01,"location":2,"content":"And this is a pretty recently."},{"from":3996.01,"to":3999.07,"location":2,"content":"Like, a few months ago, uh, proposed system,"},{"from":3999.07,"to":4002.46,"location":2,"content":"and it's this pre-trained contextual representation system."},{"from":4002.46,"to":4006.45,"location":2,"content":"Um, and it's heavily reliant on the idea of bidirectionality."},{"from":4006.45,"to":4008.76,"location":2,"content":"It turns out that the bidirectional, uh,"},{"from":4008.76,"to":4011.57,"location":2,"content":"nature of BERT is pretty important to its success."},{"from":4011.57,"to":4013.29,"location":2,"content":"So, you're gonna learn more about that later,"},{"from":4013.29,"to":4015.99,"location":2,"content":"but that's just an example of how bidirectionality can give you much"},{"from":4015.99,"to":4019.88,"location":2,"content":"more uh, powerful contextual representations."},{"from":4019.88,"to":4024.39,"location":2,"content":"Okay. So the last thing we're going to talk about today is multi-layer RNNs."},{"from":4024.39,"to":4028.8,"location":2,"content":"Uh, so you could regard RNNs as already being deep"},{"from":4028.8,"to":4034.2,"location":2,"content":"in some sense because you've already unrolled them over potentially very many timesteps,"},{"from":4034.2,"to":4036.63,"location":2,"content":"and you could regard that as a kind of depth, right?"},{"from":4036.63,"to":4039.39,"location":2,"content":"But there's another way that RNNs could be deep."},{"from":4039.39,"to":4045.21,"location":2,"content":"So, for example, if you applied multiple RNNs kind of one after another,"},{"from":4045.21,"to":4048.55,"location":2,"content":"then this would be a different way to make your RNN deep,"},{"from":4048.55,"to":4050.49,"location":2,"content":"and this is the idea between, uh,"},{"from":4050.49,"to":4053.78,"location":2,"content":"behind a multi-layer RNN."},{"from":4053.78,"to":4057.32,"location":2,"content":"So, the reason why you would want to do this is because uh,"},{"from":4057.32,"to":4060.66,"location":2,"content":"this might allow the network to compute more complex representations."},{"from":4060.66,"to":4063.84,"location":2,"content":"So, this is the logic betwe- behind deep networks in general."},{"from":4063.84,"to":4065.28,"location":2,"content":"So, if you're familiar with the idea of why"},{"from":4065.28,"to":4067.62,"location":2,"content":"deeper is better for let's say convolutional networks,"},{"from":4067.62,"to":4069.2,"location":2,"content":"then this is kind of the same logic."},{"from":4069.2,"to":4074.76,"location":2,"content":"It's saying that, uh, your lower RNNs might be computing lower-level features like,"},{"from":4074.76,"to":4076.77,"location":2,"content":"let's suppose maybe it's keeping track of syntax,"},{"from":4076.77,"to":4082.1,"location":2,"content":"and your higher level RNN's gonna compute higher-level features like maybe semantics."},{"from":4082.1,"to":4086.78,"location":2,"content":"And a note on terminology, these are sometimes called stacked RNNs."},{"from":4086.78,"to":4089.64,"location":2,"content":"So, this works much as you'd 
imagine."},{"from":4089.64,"to":4093.63,"location":2,"content":"So here's an example of how a multi-layer RNN might work."},{"from":4093.63,"to":4095.61,"location":2,"content":"Uh, if it's three layers."},{"from":4095.61,"to":4098.11,"location":2,"content":"So this is a unidirectional RNN,"},{"from":4098.11,"to":4100.53,"location":2,"content":"but it could be bidirectional,"},{"from":4100.53,"to":4103.68,"location":2,"content":"um, If you have access to the entire input sequence."},{"from":4103.68,"to":4109.29,"location":2,"content":"So, I guess the, the main thing is that the hidden states from one RNN layer are going to"},{"from":4109.29,"to":4115.4,"location":2,"content":"be used as the inputs to the RNN layer that's coming next."},{"from":4115.4,"to":4118.8,"location":2,"content":"Um, any questions on this?"},{"from":4118.8,"to":4125.27,"location":2,"content":"Yeah."},{"from":4125.27,"to":4126.45,"location":2,"content":"[inaudible]."},{"from":4126.45,"to":4129.45,"location":2,"content":"That's a great question. So the question I think it's about the order of computation."},{"from":4129.45,"to":4132.39,"location":2,"content":"What order will you compute all of these hidden states in?"},{"from":4132.39,"to":4136.1,"location":2,"content":"I suppose there's some flexibility, right?"},{"from":4136.1,"to":4139.64,"location":2,"content":"But you could compute all of the step one ones,"},{"from":4139.64,"to":4142.3,"location":2,"content":"like all of the V ones and then all of the movie ones,"},{"from":4142.3,"to":4145.97,"location":2,"content":"or you could do all of RNN layer one and then all of RNN layer two."},{"from":4145.97,"to":4148.88,"location":2,"content":"So, it's- I think that, um, when you- you know,"},{"from":4148.88,"to":4151.06,"location":2,"content":"call the PyTorch function to do a multi-layer RNN,"},{"from":4151.06,"to":4153.38,"location":2,"content":"it will do all of RNN layer one, then two, then three."},{"from":4153.38,"to":4154.58,"location":2,"content":"That's what I think happens."},{"from":4154.58,"to":4156.27,"location":2,"content":"But it seems like logically,"},{"from":4156.27,"to":4159,"location":2,"content":"there's no reason why you couldn't do it the other way."},{"from":4159,"to":4170.19,"location":2,"content":"Yep? [inaudible]."},{"from":4170.19,"to":4172.11,"location":2,"content":"Yes, yes. That's a great point as well."},{"from":4172.11,"to":4175.95,"location":2,"content":"Um, so uh, someone pointed out that if they were bidirectional,"},{"from":4175.95,"to":4177.48,"location":2,"content":"then you no longer have that flexibility."},{"from":4177.48,"to":4179.95,"location":2,"content":"You would have to do all of layer one before layer two."},{"from":4179.95,"to":4187.04,"location":2,"content":"Yeah, good point. Anyone else?"},{"from":4187.04,"to":4192.48,"location":2,"content":"Okay. 
Uh, so multi-layer RNNs in practice,"},{"from":4192.48,"to":4196.06,"location":2,"content":"um, this tends to perform pretty well,"},{"from":4196.06,"to":4198.09,"location":2,"content":"uh, in that when I look at, um,"},{"from":4198.09,"to":4200.97,"location":2,"content":"RNN-based systems that are doing very well on some kind of task,"},{"from":4200.97,"to":4204.33,"location":2,"content":"they usually are some kind of multi-layer RNN, um,"},{"from":4204.33,"to":4206.4,"location":2,"content":"but they certainly aren't as deep as"},{"from":4206.4,"to":4209.43,"location":2,"content":"the deep convolutional or feed-forward networks you might have seen in,"},{"from":4209.43,"to":4210.8,"location":2,"content":"for example, image tasks."},{"from":4210.8,"to":4212.55,"location":2,"content":"So whereas, you know, very deep convolutional networks,"},{"from":4212.55,"to":4214.47,"location":2,"content":"I think hundreds of layers now, um,"},{"from":4214.47,"to":4216.8,"location":2,"content":"you certainly aren't getting RNNs that are that deep."},{"from":4216.8,"to":4218.79,"location":2,"content":"So, for example, um,"},{"from":4218.79,"to":4222.3,"location":2,"content":"in this paper from, uh, Google, uh,"},{"from":4222.3,"to":4225.16,"location":2,"content":"they're doing this kind of large hyperparameter search for"},{"from":4225.16,"to":4229.74,"location":2,"content":"neural machine translation to find which kinds of hyperparameters work well for NMT."},{"from":4229.74,"to":4231.72,"location":2,"content":"And in this paper, they found that um,"},{"from":4231.72,"to":4234.16,"location":2,"content":"two to four layers was best for the encoder RNN,"},{"from":4234.16,"to":4236.28,"location":2,"content":"and four layers was best for the decoder RNN."},{"from":4236.28,"to":4239.43,"location":2,"content":"Uh, you'll find out more about what encoder and decoder mean next time."},{"from":4239.43,"to":4241.27,"location":2,"content":"Um, but those are fairly small numbers."},{"from":4241.27,"to":4243.33,"location":2,"content":"Although they did find that if you add these skip"},{"from":4243.33,"to":4245.35,"location":2,"content":"connections or these dense connections, um,"},{"from":4245.35,"to":4249.75,"location":2,"content":"then it makes it much easier to learn some even deeper RNNs more effectively,"},{"from":4249.75,"to":4251.06,"location":2,"content":"like, maybe up to eight layers,"},{"from":4251.06,"to":4253.6,"location":2,"content":"but these certainly aren't hundreds of layers deep."},{"from":4253.6,"to":4255.75,"location":2,"content":"And one of the reasons why, uh,"},{"from":4255.75,"to":4259.1,"location":2,"content":"RNNs don't tend to be nearly as deep as these other kinds of networks,"},{"from":4259.1,"to":4261.68,"location":2,"content":"is that because as we commented before,"},{"from":4261.68,"to":4263.2,"location":2,"content":"RNNs have to be computed, uh,"},{"from":4263.2,"to":4265.38,"location":2,"content":"sequentially; they can't be computed in parallel."},{"from":4265.38,"to":4267.33,"location":2,"content":"This means that they're pretty expensive to compute."},{"from":4267.33,"to":4269.52,"location":2,"content":"If you have this depth in like, two-dimensions,"},{"from":4269.52,"to":4273.68,"location":2,"content":"you have the depth over the timesteps and then the depth over the RNN layers too,"},{"from":4273.68,"to":4275.16,"location":2,"content":"then it beco- it becomes very,"},{"from":4275.16,"to":4277.83,"location":2,"content":"very expensive to compute these, these 
RNNs."},{"from":4277.83,"to":4279.89,"location":2,"content":"So, that's another reason why they don't get very deep."},{"from":4279.89,"to":4283.29,"location":2,"content":"Uh, so again, we just mentioned transformers."},{"from":4283.29,"to":4285.16,"location":2,"content":"Uh, you gonna learn about transformers later."},{"from":4285.16,"to":4287.28,"location":2,"content":"But these, it seems, um,"},{"from":4287.28,"to":4290.4,"location":2,"content":"can be deeper fro- from what I can tell of,"},{"from":4290.4,"to":4291.9,"location":2,"content":"of what people are using these days."},{"from":4291.9,"to":4293.58,"location":2,"content":"Transformer-based networks can be pretty deep."},{"from":4293.58,"to":4295.53,"location":2,"content":"So, uh, but for example,"},{"from":4295.53,"to":4298.43,"location":2,"content":"there's a 24-layer version and a 12-layer version, um,"},{"from":4298.43,"to":4299.82,"location":2,"content":"and admittedly, that was trained by Google,"},{"from":4299.82,"to":4301.86,"location":2,"content":"and they have a lot of computational power."},{"from":4301.86,"to":4303.6,"location":2,"content":"Um, but I think part of the reason why"},{"from":4303.6,"to":4305.69,"location":2,"content":"these transformer-based networks can be quite deep,"},{"from":4305.69,"to":4308.22,"location":2,"content":"is that they have a lot of these skipping like connections."},{"from":4308.22,"to":4309.75,"location":2,"content":"In fact, the whole um,"},{"from":4309.75,"to":4312.57,"location":2,"content":"innovation of transformers is that they're built on a lot of, kind of,"},{"from":4312.57,"to":4317.52,"location":2,"content":"skip connections. Okay, any questions?"},{"from":4317.52,"to":4321.45,"location":2,"content":"We're almost done. Okay. All right."},{"from":4321.45,"to":4324.03,"location":2,"content":"So, uh, here's a summary of what we've learned today."},{"from":4324.03,"to":4325.97,"location":2,"content":"I know it's been a lot of information."},{"from":4325.97,"to":4331.53,"location":2,"content":"Um, but I think here are four practical takeaways from today that, uh,"},{"from":4331.53,"to":4333.8,"location":2,"content":"are probably useful to you in your projects,"},{"from":4333.8,"to":4334.92,"location":2,"content":"even if you, um,"},{"from":4334.92,"to":4337.8,"location":2,"content":"uh, even if you"},{"from":4337.8,"to":4341.13,"location":2,"content":"didn't find them very interesting in themselves they're probably pretty useful."},{"from":4341.13,"to":4344.19,"location":2,"content":"So, the first one is that LSTMs are very powerful."},{"from":4344.19,"to":4345.99,"location":2,"content":"They're certainly a lot powerful than,"},{"from":4345.99,"to":4347.91,"location":2,"content":"uh, more powerful than Vanila RNNs."},{"from":4347.91,"to":4351.45,"location":2,"content":"Um, GRUs are also more powerful than, uh, Vanila RNNs."},{"from":4351.45,"to":4354.21,"location":2,"content":"Uh, and the only difference that is consistently the"},{"from":4354.21,"to":4357.48,"location":2,"content":"same is that GRUs are faster than LSTMs."},{"from":4357.48,"to":4360.47,"location":2,"content":"The next one is that you should probably clip your gradients,"},{"from":4360.47,"to":4362.06,"location":2,"content":"because if you don't clip your gradients,"},{"from":4362.06,"to":4367.05,"location":2,"content":"you're in danger of walking off cliffs and then ending up with NaNs in your model."},{"from":4367.05,"to":4372.1,"location":2,"content":"Uh, the next tip is that bidirectionality is useful if you 
can apply it."},{"from":4372.1,"to":4376.13,"location":2,"content":"And, basically, anytime when you have access to the entire input sequence,"},{"from":4376.13,"to":4377.85,"location":2,"content":"you can apply bidirectionality,"},{"from":4377.85,"to":4379.79,"location":2,"content":"so you should probably do that by default."},{"from":4379.79,"to":4384.14,"location":2,"content":"And then the last tip is that multi-layer RNNs are pretty powerful."},{"from":4384.14,"to":4386.33,"location":2,"content":"And again, you should probably do that if you,"},{"from":4386.33,"to":4388.35,"location":2,"content":"uh, have enough computational power to do so."},{"from":4388.35,"to":4391.4,"location":2,"content":"But if you're going to make your multi-layer RNN pretty deep,"},{"from":4391.4,"to":4393.26,"location":2,"content":"then you might need skip connections."},{"from":4393.26,"to":4402.84,"location":2,"content":"All right. Thanks [NOISE]."}]} \ No newline at end of file diff --git a/bcc-en/8.bcc b/bcc-en/8.bcc new file mode 100644 index 0000000000000000000000000000000000000000..2634d155439cf227dd33566fcacf79f6b951aabf --- /dev/null +++ b/bcc-en/8.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":5.06,"to":8.97,"location":2,"content":"So welcome to, uh, the Machine Translation lecture"},{"from":8.97,"to":11.04,"location":2,"content":"which is kind of like a culmination"},{"from":11.04,"to":15.87,"location":2,"content":"of the sequence of three lectures on RNNs and related topics."},{"from":15.87,"to":18.69,"location":2,"content":"So let's have a few announcements first."},{"from":18.69,"to":20.91,"location":2,"content":"The first thing is that you probably noticed when he came in,"},{"from":20.91,"to":22.77,"location":2,"content":"we're taking attendance today."},{"from":22.77,"to":27.09,"location":2,"content":"Uh, so you need to sign in with the TAs who are outside the auditorium."},{"from":27.09,"to":28.96,"location":2,"content":"Uh, if you missed it,"},{"from":28.96,"to":30.54,"location":2,"content":"don't get up now, it's fine,"},{"from":30.54,"to":32.7,"location":2,"content":"there'll be time to sign in after the lecture."},{"from":32.7,"to":34.64,"location":2,"content":"Uh, and then if you have any kind of questions"},{"from":34.64,"to":36.65,"location":2,"content":"about special cases with the attendance policy,"},{"from":36.65,"to":38.87,"location":2,"content":"uh, you should check out the Piazza post that we put up"},{"from":38.87,"to":41.54,"location":2,"content":"last night with some clarifications."},{"from":41.54,"to":45.23,"location":2,"content":"Uh, the other reminder is assignment four content is going to be covered today."},{"from":45.23,"to":47.85,"location":2,"content":"So you're going to have everything you need to do assignment four at the end of today,"},{"from":47.85,"to":51.83,"location":2,"content":"and do get started early because the model takes four hours to train."},{"from":51.83,"to":54.71,"location":2,"content":"The other announcement is that we are going to be sending out"},{"from":54.71,"to":58.55,"location":2,"content":"a mid-quarter feedback survey sometime in the next few days probably."},{"from":58.55,"to":61.74,"location":2,"content":"Uh, so please do fill it out you'll get 0.5 percent credit,"},{"from":61.74,"to":66.92,"location":2,"content":"and you're also going to help us to make the course better, for the rest of the 
quarter."},{"from":66.92,"to":70.34,"location":2,"content":"Okay, so here's the overview of what we're going to do today."},{"from":70.34,"to":75.58,"location":2,"content":"Uh, today first we are going to introduce a new task in NLP which is machine translation."},{"from":75.58,"to":79.99,"location":2,"content":"And then we're going to introduce a new neural architecture called sequence-to-sequence."},{"from":79.99,"to":81.65,"location":2,"content":"And the connection here is that"},{"from":81.65,"to":85.53,"location":2,"content":"machine translation is a major use case of sequence-to-sequence."},{"from":85.53,"to":89.53,"location":2,"content":"After that, we're going to introduce a new neural technique called attention."},{"from":89.53,"to":93.39,"location":2,"content":"And this is something that improves sequence-to- sequence a lot."},{"from":93.39,"to":97.22,"location":2,"content":"Okay, so section one of this is gonna be about, uh,"},{"from":97.22,"to":102.34,"location":2,"content":"a bit of machine translation history, Pre-Neural Machine Translation."},{"from":102.34,"to":108.14,"location":2,"content":"So machine translation or MT is the task of translating a sentence X, uh,"},{"from":108.14,"to":110.06,"location":2,"content":"which we call the source language,"},{"from":110.06,"to":112.43,"location":2,"content":"whatever language you're translating from,"},{"from":112.43,"to":116.39,"location":2,"content":"into a sentence Y which is in another language which we call, the target language."},{"from":116.39,"to":117.65,"location":2,"content":"Uh, so here's an example,"},{"from":117.65,"to":119.77,"location":2,"content":"let's suppose X is this French sentence."},{"from":119.77,"to":122.06,"location":2,"content":"Um, could anyone in the audience,"},{"from":122.06,"to":124.22,"location":2,"content":"a French speaker translate to English for us."},{"from":124.22,"to":133.73,"location":2,"content":"[NOISE] Yeah."},{"from":133.73,"to":139.28,"location":2,"content":"The man is born free, and everywhere he is in irons."},{"from":139.28,"to":140.76,"location":2,"content":"Great. 
So that was something like,"},{"from":140.76,"to":142.77,"location":2,"content":"the man is born free, but everywhere he is in irons."},{"from":142.77,"to":144.38,"location":2,"content":"That was a fairly literal translation."},{"from":144.38,"to":147.77,"location":2,"content":"It's usually translated- this quote by Rousseau is usually translated as:"},{"from":147.77,"to":149.85,"location":2,"content":"Man is born free, but everywhere he is in chains."},{"from":149.85,"to":152.15,"location":2,"content":"But you know there's an ambiguity: should 'fers' be,"},{"from":152.15,"to":154.19,"location":2,"content":"um, literally irons or chains?"},{"from":154.19,"to":156.41,"location":2,"content":"Also you could choose to, uh, translate"},{"from":156.41,"to":158.66,"location":2,"content":"L'homme as man or maybe humankind."},{"from":158.66,"to":162.31,"location":2,"content":"Uh, so this is an example of machine translation and there's already,"},{"from":162.31,"to":164.31,"location":2,"content":"you know, quite a few choices you can make."},{"from":164.31,"to":170.5,"location":2,"content":"So machine translation as an AI task began in the early 1950s."},{"from":170.5,"to":172.34,"location":2,"content":"So, um, in particular,"},{"from":172.34,"to":175.01,"location":2,"content":"there was lots of work translating Russian to English, uh,"},{"from":175.01,"to":177.17,"location":2,"content":"because the West was very interested in listening"},{"from":177.17,"to":179.79,"location":2,"content":"to what the Russians were saying during the Cold War."},{"from":179.79,"to":185.3,"location":2,"content":"And we've got a fun video here which shows the state of machine translation in 1954."},{"from":185.3,"to":190.52,"location":2,"content":"[MUSIC] They hadn't reckoned with ambiguity when they set out"},{"from":190.52,"to":193.17,"location":2,"content":"to use computers to translate languages."},{"from":193.17,"to":200.84,"location":2,"content":"A $500,000 simple calculator, the most versatile electronic brain known, translates Russian into English."},{"from":200.84,"to":202.81,"location":2,"content":"Instead of mathematical wizardry,"},{"from":202.81,"to":204.62,"location":2,"content":"a sentence in Russian is to be fed in- [OVERLAPPING]"},{"from":204.62,"to":207.98,"location":2,"content":"One of the first non-numerical applications of computers,"},{"from":207.98,"to":209.48,"location":2,"content":"it was hyped as the solution to"},{"from":209.48,"to":213.37,"location":2,"content":"the Cold War obsession of keeping tabs on what the Russians were doing."},{"from":213.37,"to":217.25,"location":2,"content":"Claims were made that the computer would replace most human translators."},{"from":217.25,"to":220.24,"location":2,"content":"Professor, you're just in the experimental stage."},{"from":220.24,"to":223.24,"location":2,"content":"When you go in for full-scale production what will the capacity be?"},{"from":223.24,"to":228.24,"location":2,"content":"We should be able to do about, a little more than a conventional computer."},{"from":228.24,"to":231.24,"location":2,"content":"Uh, about 1 to 2 million words an hour."},{"from":231.24,"to":233.24,"location":2,"content":"And this will be quite an adequate speed to cope with the"},{"from":233.24,"to":236.24,"location":2,"content":"whole output of the Soviet Union in just a few hours"},{"from":236.24,"to":238.24,"location":2,"content":"of computer time a week."},{"from":238.24,"to":239.24,"location":2,"content":"When do you hope to be able to achieve this
speed?"},{"from":239.24,"to":246.24,"location":2,"content":"If our experiments go well, then perhaps within five years or so."},{"from":246.24,"to":248.59,"location":2,"content":"So in this video I think there's a number of interesting things."},{"from":248.59,"to":253.53,"location":2,"content":"Um, firstly we can see an example of about how AI hype is nothing new."},{"from":253.53,"to":255.85,"location":2,"content":"Even in 1954 they were talking"},{"from":255.85,"to":260.13,"location":2,"content":"this machine translation system as if it was an electronic brain which I think,"},{"from":260.13,"to":262.24,"location":2,"content":"uh, overstates maybe how general it is."},{"from":262.24,"to":265.42,"location":2,"content":"Um, they were also at least some of them fairly optimistic that"},{"from":265.42,"to":270.52,"location":2,"content":"this machine translation system was going to be replacing humans, uh, anytime soon."},{"from":270.52,"to":273.67,"location":2,"content":"Um, so yeah that's- that's pretty interesting."},{"from":273.67,"to":278.13,"location":2,"content":"And, um, the thing is that these systems actually were mostly rule-based, uh,"},{"from":278.13,"to":280.74,"location":2,"content":"by which I mean that they were mostly using"},{"from":280.74,"to":283.36,"location":2,"content":"a bilingual dictionary between Russian and English,"},{"from":283.36,"to":286.44,"location":2,"content":"and they were essentially mostly just looking up the Russian words, uh,"},{"from":286.44,"to":287.82,"location":2,"content":"looking at the English counterparts,"},{"from":287.82,"to":291.35,"location":2,"content":"and they were storing these big bilingual dictionaries on these large magnetic tapes."},{"from":291.35,"to":295.36,"location":2,"content":"Um, so certainly it was a huge technical feat at the time, uh,"},{"from":295.36,"to":297.11,"location":2,"content":"but they, uh, some people were probably too"},{"from":297.11,"to":300.06,"location":2,"content":"optimistic about how quickly it would replace humans."},{"from":300.06,"to":303.21,"location":2,"content":"So jumping forward several decades in time,"},{"from":303.21,"to":306.33,"location":2,"content":"uh, and I want to tell you about statistical machine translation."},{"from":306.33,"to":310.04,"location":2,"content":"So the core idea of statistical machine translation is that you're going to"},{"from":310.04,"to":314.3,"location":2,"content":"learn a probabilistic model from the data in order to do the translation."},{"from":314.3,"to":319.28,"location":2,"content":"So as an example as before suppose that we're translating from French to English."},{"from":319.28,"to":323.21,"location":2,"content":"The idea here is that you want to find the best English sentence Y,"},{"from":323.21,"to":325.06,"location":2,"content":"given the French sentence X."},{"from":325.06,"to":328.13,"location":2,"content":"And mathematically you can formulate this as finding argmax"},{"from":328.13,"to":331.81,"location":2,"content":"Y of this conditional probability of Y given X."},{"from":331.81,"to":334.94,"location":2,"content":"And the model that you're learning is this probability distribution"},{"from":334.94,"to":340.81,"location":2,"content":"P. 
So what we usually do is we break down this probability into,"},{"from":340.81,"to":343.49,"location":2,"content":"uh, its two components using Bayes' Rule."},{"from":343.49,"to":346.97,"location":2,"content":"So this means that finding the Y that maximizes,"},{"from":346.97,"to":348.57,"location":2,"content":"uh, probability of Y given X,"},{"from":348.57,"to":351.41,"location":2,"content":"is equivalent to finding the Y that maximizes"},{"from":351.41,"to":355.24,"location":2,"content":"the probability of X given Y times the probability of Y."},{"from":355.24,"to":357.26,"location":2,"content":"So the two components here,"},{"from":357.26,"to":359.81,"location":2,"content":"on the left we have a translation model."},{"from":359.81,"to":363.32,"location":2,"content":"And this is keeping track of how words and phrases should be translated."},{"from":363.32,"to":366.53,"location":2,"content":"Uh, so the idea is that it knows, uh, how, uh,"},{"from":366.53,"to":369.77,"location":2,"content":"French words and the English words might be translated to each other"},{"from":369.77,"to":373.6,"location":2,"content":"or maybe small- small phrases and chunks of words should be translated."},{"from":373.6,"to":375.98,"location":2,"content":"And this is learned from a lot of parallel data,"},{"from":375.98,"to":377.76,"location":2,"content":"and I'll be telling you later how we do that."},{"from":377.76,"to":380.72,"location":2,"content":"The second component P of Y."},{"from":380.72,"to":382.63,"location":2,"content":"This is just a language model."},{"from":382.63,"to":383.98,"location":2,"content":"We learned about this last week."},{"from":383.98,"to":386.93,"location":2,"content":"A language model is a system that can predict the next word,"},{"from":386.93,"to":389.36,"location":2,"content":"but it can also be thought of as a system that tells"},{"from":389.36,"to":391.99,"location":2,"content":"you the probability of a sequence of words."},{"from":391.99,"to":394.61,"location":2,"content":"So here, if we're translating from French to English,"},{"from":394.61,"to":396.94,"location":2,"content":"P of Y is an English language model."},{"from":396.94,"to":400.34,"location":2,"content":"So the idea here is that the reason why we want to break down"},{"from":400.34,"to":404.36,"location":2,"content":"this single condition- conditional probability distribution into"},{"from":404.36,"to":409.49,"location":2,"content":"the product of two different ones is that this is a kind of division of labor."},{"from":409.49,"to":411.56,"location":2,"content":"The idea here is that instead of, uh,"},{"from":411.56,"to":414.41,"location":2,"content":"a single conditional probability distribution needing to"},{"from":414.41,"to":417.53,"location":2,"content":"understand how to translate and how to write good English text,"},{"from":417.53,"to":420.05,"location":2,"content":"and understand sentence structure and everything at once."},{"from":420.05,"to":424.28,"location":2,"content":"The idea is that you separate it, so that the translation model on the left in blue"},{"from":424.28,"to":429.17,"location":2,"content":"mostly just knows about local translation of small chunks of words and phrases,"},{"from":429.17,"to":430.85,"location":2,"content":"whereas the language model on the right,"},{"from":430.85,"to":432.94,"location":2,"content":"takes care more of writing good English,"},{"from":432.94,"to":435.73,"location":2,"content":"good sentence structure, word order, and so
on."},{"from":435.73,"to":438.05,"location":2,"content":"So you already know how to learn"},{"from":438.05,"to":440.3,"location":2,"content":"a language model because we learned about that last time,"},{"from":440.3,"to":441.98,"location":2,"content":"you just need lots of monolingual data,"},{"from":441.98,"to":443.57,"location":2,"content":"in this case, English data."},{"from":443.57,"to":445.49,"location":2,"content":"So I'm going to tell you more about how we would learn"},{"from":445.49,"to":450.42,"location":2,"content":"this translation model that needs to be learned from parallel data."},{"from":450.42,"to":453.83,"location":2,"content":"So, we need a large amount of parallel data"},{"from":453.83,"to":456.54,"location":2,"content":"in order to learn this, uh, translation model."},{"from":456.54,"to":459.7,"location":2,"content":"And an early example of a parallel corpus,"},{"from":459.7,"to":461.26,"location":2,"content":"is the Rosetta Stone."},{"from":461.26,"to":466.03,"location":2,"content":"So, this is a stone that has the same text written in three different languages."},{"from":466.03,"to":469.84,"location":2,"content":"And, uh, this is a hugely important artifact for, um,"},{"from":469.84,"to":473.59,"location":2,"content":"the- the people who were trying to understand ancient Egyptian."},{"from":473.59,"to":475.06,"location":2,"content":"So, in the 19th century,"},{"from":475.06,"to":476.67,"location":2,"content":"uh, scholars discovered this stone,"},{"from":476.67,"to":478.86,"location":2,"content":"and it helped them to figure out ancient Egyptian"},{"from":478.86,"to":481.44,"location":2,"content":"because there was this parallel text that had,"},{"from":481.44,"to":484.31,"location":2,"content":"uh, the same- the same text in other languages that they did know."},{"from":484.31,"to":487.58,"location":2,"content":"So, this is a- this is a really important parallel corpus."},{"from":487.58,"to":489.64,"location":2,"content":"And, uh, if you're ever in London you can go to"},{"from":489.64,"to":492.31,"location":2,"content":"the British Museum and see this in person."},{"from":492.31,"to":495.22,"location":2,"content":"So, the idea is that you get your parallel data."},{"from":495.22,"to":497.17,"location":2,"content":"Obviously you need a larger amount that's on the stone"},{"from":497.17,"to":499.57,"location":2,"content":"and hopefully it shouldn't be written on a stone either."},{"from":499.57,"to":505.11,"location":2,"content":"Uh, but you can use this to learn your statistical machine translation model."},{"from":505.11,"to":507.43,"location":2,"content":"So, the idea is that you are trying to learn"},{"from":507.43,"to":510.52,"location":2,"content":"this conditional probability distribution of X given Y."},{"from":510.52,"to":513.52,"location":2,"content":"So, what we do is we actually break this down even further."},{"from":513.52,"to":519.65,"location":2,"content":"We actually want to consider the probability of X and A given Y where A is the alignment."},{"from":519.65,"to":521.98,"location":2,"content":"So, the idea of alignment is this is, uh,"},{"from":521.98,"to":527.11,"location":2,"content":"how the words in the English sentence and the French sentence correspond to each other."},{"from":527.11,"to":530.68,"location":2,"content":"So, I'm gonna, uh, demonstrate this by an example."},{"from":530.68,"to":534.13,"location":2,"content":"So, in this example where we're translating"},{"from":534.13,"to":537.31,"location":2,"content":"the sentence \"Japan 
shaken by two new quakes\" to French."},{"from":537.31,"to":541.06,"location":2,"content":"Then you can see there's a pretty simple one-to-one alignment here,"},{"from":541.06,"to":545.14,"location":2,"content":"uh, of English words to French words and also they appear in the exact same order."},{"from":545.14,"to":549.85,"location":2,"content":"The only thing, uh, that doesn't conform to that is the word 'Le' in, uh,"},{"from":549.85,"to":552.46,"location":2,"content":"French which we call a spurious word because it doesn't"},{"from":552.46,"to":555.19,"location":2,"content":"have a direct counterpart in the English sentence."},{"from":555.19,"to":559.68,"location":2,"content":"And that's because in English we just say Japan but in French we say 'Le Japon'."},{"from":559.68,"to":562.91,"location":2,"content":"So, alignment can be a bit more complicated than that,"},{"from":562.91,"to":565.47,"location":2,"content":"for example alignment can be many-to-one."},{"from":565.47,"to":568.15,"location":2,"content":"In this example, you have, uh,"},{"from":568.15,"to":572.73,"location":2,"content":"several French words that have multiple English words that correspond to them."},{"from":572.73,"to":576.15,"location":2,"content":"So, this is what we call many-to-one alignments."},{"from":576.15,"to":578.46,"location":2,"content":"It can go in the other direction too,"},{"from":578.46,"to":580.06,"location":2,"content":"alignment can be one-to-many."},{"from":580.06,"to":583.54,"location":2,"content":"So, here we have a single English word implemented which has"},{"from":583.54,"to":584.92,"location":2,"content":"a one-to-many alignment because there's"},{"from":584.92,"to":588.33,"location":2,"content":"a three-word French phase- phrase that corresponds to it."},{"from":588.33,"to":589.9,"location":2,"content":"So, on the left and the right,"},{"from":589.9,"to":592.39,"location":2,"content":"we have two ways of depicting the same alignment."},{"from":592.39,"to":594.05,"location":2,"content":"It's either, uh, kind of,"},{"from":594.05,"to":597.99,"location":2,"content":"uh, charts or it can be a- a graph."},{"from":597.99,"to":601.33,"location":2,"content":"So, here's another example, um,"},{"from":601.33,"to":604.4,"location":2,"content":"of a one-to-many, uh, sorry,"},{"from":604.4,"to":608.2,"location":2,"content":"right so we call this word implemented, that is one-to-many."},{"from":608.2,"to":609.85,"location":2,"content":"We call it a fertile word,"},{"from":609.85,"to":614.08,"location":2,"content":"because the idea is that it has many children in the- in the target sentence."},{"from":614.08,"to":616.75,"location":2,"content":"So, in fact, there are some words which are very fertile."},{"from":616.75,"to":621.01,"location":2,"content":"Uh, here's an example where the source sentence 'il a m'entarté',"},{"from":621.01,"to":623.46,"location":2,"content":"uh, means he hit me with a pie."},{"from":623.46,"to":625.24,"location":2,"content":"And here in French, uh,"},{"from":625.24,"to":628.81,"location":2,"content":"this verb 'entarter' means, uh, to hit someone with a pie."},{"from":628.81,"to":635.82,"location":2,"content":"And [LAUGHTER] this word has no single word equivalent in English."},{"from":635.82,"to":638.53,"location":2,"content":"We don't have a single verb that means to hit someone with a pie."},{"from":638.53,"to":641.7,"location":2,"content":"[NOISE] Um, which I think that's really fun that French has a word,"},{"from":641.7,"to":643.24,"location":2,"content":"you wonder maybe
they do it so"},{"from":643.24,"to":645.01,"location":2,"content":"often that they need a single word for that, I don't know, [LAUGHTER]."},{"from":645.01,"to":648.76,"location":2,"content":"Um, so, this is an example of a fertile word, right?"},{"from":648.76,"to":654,"location":2,"content":"Because it needs to have several corresponding English words to translate it."},{"from":654,"to":657.38,"location":2,"content":"So, we can have one-to-many and many-to-one,"},{"from":657.38,"to":659.65,"location":2,"content":"you can also have many-to-many alignments."},{"from":659.65,"to":663.07,"location":2,"content":"You could call that kind of phrase level translation or phrase to phrase."},{"from":663.07,"to":666.1,"location":2,"content":"So, here, uh, the English sentence says,"},{"from":666.1,"to":670.74,"location":2,"content":"\"The poor don't have any money,\" and here don't have any money corresponds to the French,"},{"from":670.74,"to":672.38,"location":2,"content":"uh, phrase 'sont demunis'."},{"from":672.38,"to":674.47,"location":2,"content":"And this is a many-to-many alignment."},{"from":674.47,"to":678.75,"location":2,"content":"Because there's no obvious way to break down this phrase to phrase alignment into,"},{"from":678.75,"to":682.86,"location":2,"content":"uh, smaller word-to-word alignments."},{"from":682.86,"to":686.56,"location":2,"content":"Okay. So, that's what alignment is, and if you remember,"},{"from":686.56,"to":688.3,"location":2,"content":"we were thinking about how would you learn"},{"from":688.3,"to":690.96,"location":2,"content":"this probability distribution of what the alignment is,"},{"from":690.96,"to":693.94,"location":2,"content":"uh, in order to do statistical machine translation."},{"from":693.94,"to":696.82,"location":2,"content":"So, the idea is that you learn probability of x and"},{"from":696.82,"to":701.22,"location":2,"content":"a given y as a combination of many factors or many features."},{"from":701.22,"to":703.48,"location":2,"content":"So, you can say that for example, uh,"},{"from":703.48,"to":707.14,"location":2,"content":"what's the probability of a particular word aligning to another particular word?"},{"from":707.14,"to":708.91,"location":2,"content":"Like, you know, this English word and this French word,"},{"from":708.91,"to":710.05,"location":2,"content":"how often do they align?"},{"from":710.05,"to":712.06,"location":2,"content":"But then it also depends on for example,"},{"from":712.06,"to":713.74,"location":2,"content":"what's that position in the sentence?"},{"from":713.74,"to":716.71,"location":2,"content":"Like, uh, if they both appear near the end of the sentences,"},{"from":716.71,"to":718.48,"location":2,"content":"then it's more likely that they align,"},{"from":718.48,"to":721.86,"location":2,"content":"whereas if one's at the beginning and one's at the end, that's less likely."},{"from":721.86,"to":724.03,"location":2,"content":"You would also consider things like, uh,"},{"from":724.03,"to":728.08,"location":2,"content":"what's the probability of this particular French word having this particular fertility?"},{"from":728.08,"to":729.17,"location":2,"content":"Like, what's the, uh,"},{"from":729.17,"to":732.57,"location":2,"content":"probability of this word having three corresponding English words?"},{"from":732.57,"to":737.04,"location":2,"content":"And so on. 
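Collecting the pieces so far in symbols: the Bayes' Rule split into a translation model and a language model, plus the alignment-augmented translation model. Writing the recovery of P(x|y) as an explicit sum over alignments a is an assumption in the spirit of the classic IBM models; the lecture introduces P(x, a|y) without spelling this step out.

```latex
\operatorname*{arg\,max}_{y} P(y \mid x)
  = \operatorname*{arg\,max}_{y} \,
    \underbrace{P(x \mid y)}_{\text{translation model}} \,
    \underbrace{P(y)}_{\text{language model}},
\qquad
P(x \mid y) = \sum_{a} P(x, a \mid y)
```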
So, all of these statistics are learned from your parallel data."},{"from":737.04,"to":740.24,"location":2,"content":"And there's many other things that you would take into consideration."},{"from":740.24,"to":743.86,"location":2,"content":"So, we're looking at a kind of overview of statistical machine translation today."},{"from":743.86,"to":746,"location":2,"content":"You're not going to understand it in full detail."},{"from":746,"to":748.12,"location":2,"content":"But we're understanding an overview of how it works,"},{"from":748.12,"to":749.7,"location":2,"content":"because we're going to be, uh,"},{"from":749.7,"to":753.51,"location":2,"content":"comparing it to neural machine translation."},{"from":753.51,"to":757.63,"location":2,"content":"Okay. So, we're learning this SMT system."},{"from":757.63,"to":760.83,"location":2,"content":"And so far we've broken it down into these two main components."},{"from":760.83,"to":763.9,"location":2,"content":"We've got the translation model and we've got the language model."},{"from":763.9,"to":766.93,"location":2,"content":"And, uh, we understand a little bit about how you might learn"},{"from":766.93,"to":770.34,"location":2,"content":"this translation model by breaking it down into alignments."},{"from":770.34,"to":774.02,"location":2,"content":"So, our question remains, how do you do the argmax over Y?"},{"from":774.02,"to":779.56,"location":2,"content":"How do you find your English sentence Y that maximizes this probability?"},{"from":779.56,"to":783.31,"location":2,"content":"So, one kind of brute force solution is you could say,"},{"from":783.31,"to":785.82,"location":2,"content":"uh, let's enumerate every possible Y,"},{"from":785.82,"to":789.46,"location":2,"content":"that's kind of every possible sequence of English words maybe up to some length,"},{"from":789.46,"to":792.43,"location":2,"content":"uh, and we'll calculate this probability for all of them."},{"from":792.43,"to":794.98,"location":2,"content":"And it should be pretty clear that that's just a no-go,"},{"from":794.98,"to":796.47,"location":2,"content":"that's way too expensive, uh,"},{"from":796.47,"to":799.52,"location":2,"content":"we're not going to be able to, uh, get anywhere with that."},{"from":799.52,"to":803.65,"location":2,"content":"So, the answer for how you actually do this in practice is you're going to use"},{"from":803.65,"to":808.13,"location":2,"content":"some kind of heuristic search algorithm to search for the best translation Y."},{"from":808.13,"to":810.67,"location":2,"content":"Uh, but along the way you're going to discard hypotheses"},{"from":810.67,"to":813.1,"location":2,"content":"that are too low proba- probability."},{"from":813.1,"to":816.85,"location":2,"content":"So, you're gonna search, but you're going to discard and prune the tree as you"},{"from":816.85,"to":820.51,"location":2,"content":"go to make sure that you're not keeping too many hypotheses, uh, on each step."},{"from":820.51,"to":827.68,"location":2,"content":"[NOISE] So, this process of finding your best sequence is also called decoding."},{"from":827.68,"to":830.5,"location":2,"content":"So, here's an overview of how that works for SMT."},{"from":830.5,"to":832.84,"location":2,"content":"Uh, this is an example, uh,"},{"from":832.84,"to":837.16,"location":2,"content":"where you have this German sentence that translates to \"he does not go home\","},{"from":837.16,"to":841.93,"location":2,"content":"uh, and you can see that there's some kind of phrase-to-phrase alignments
here."},{"from":841.93,"to":848,"location":2,"content":"So, uh, an overview of how this decoding would work in SMT is that you kind of"},{"from":848,"to":853.45,"location":2,"content":"consider lots of different hypotheses for how you might translate these individual words,"},{"from":853.45,"to":858.01,"location":2,"content":"uh, and then you build it up to consider how you might translate,"},{"from":858.01,"to":861.22,"location":2,"content":"uh, individual phrases and the phrases get bigger."},{"from":861.22,"to":864.19,"location":2,"content":"So, for example, you can see that on the top right if it's not too"},{"from":864.19,"to":867.19,"location":2,"content":"small you can see that the German word for house,"},{"from":867.19,"to":872.29,"location":2,"content":"uh, could be translated into the English word house or home or chamber and so on."},{"from":872.29,"to":876.34,"location":2,"content":"Uh, so we consider all of these different hypotheses and look into how we might"},{"from":876.34,"to":880.38,"location":2,"content":"put those together to translate the phrases but you don't keep all of them all the time."},{"from":880.38,"to":883.13,"location":2,"content":"You get rid of the ones that are too low probability."},{"from":883.13,"to":886.21,"location":2,"content":"So, this can also be depicted as a kind of a tree,"},{"from":886.21,"to":887.54,"location":2,"content":"where you are, uh,"},{"from":887.54,"to":889.81,"location":2,"content":"exploring different options, you're searching through"},{"from":889.81,"to":893.08,"location":2,"content":"the space of options but then you prune the tree as you go."},{"from":893.08,"to":894.79,"location":2,"content":"So, I know this is a very,"},{"from":894.79,"to":896.02,"location":2,"content":"very high level, uh,"},{"from":896.02,"to":897.57,"location":2,"content":"description of how decoding might work."},{"from":897.57,"to":899.08,"location":2,"content":"And in fact, later in this lecture,"},{"from":899.08,"to":900.77,"location":2,"content":"you're gonna see a detailed,"},{"from":900.77,"to":907.32,"location":2,"content":"um, explanation of how this kind of decoding works for neural machine translation."},{"from":907.32,"to":910.29,"location":2,"content":"Okay. 
So, what's our, um,"},{"from":910.29,"to":912.44,"location":2,"content":"overview of statistical machine translation?"},{"from":912.44,"to":913.73,"location":2,"content":"Uh, was it effective?"},{"from":913.73,"to":917.06,"location":2,"content":"Uh, so SMT was a huge research field,"},{"from":917.06,"to":920.4,"location":2,"content":"uh, from the 1990s to about maybe, uh, 2013."},{"from":920.4,"to":923.78,"location":2,"content":"And the best systems during this time were extremely complex,"},{"from":923.78,"to":926.71,"location":2,"content":"they were extremely sophisticated and impressive systems and"},{"from":926.71,"to":930.11,"location":2,"content":"SMT made the best machine translation systems in the world."},{"from":930.11,"to":931.82,"location":2,"content":"But they were very complex."},{"from":931.82,"to":932.98,"location":2,"content":"So, for example, you know,"},{"from":932.98,"to":936.58,"location":2,"content":"there were hundreds of important details that we haven't mentioned here at all."},{"from":936.58,"to":938.71,"location":2,"content":"There were many many techniques to make it, uh,"},{"from":938.71,"to":940.43,"location":2,"content":"more complex and more,"},{"from":940.43,"to":943.32,"location":2,"content":"um, sophisticated than what I've described today."},{"from":943.32,"to":948.54,"location":2,"content":"In particular, the systems had to have many separately designed, uh, subcomponents."},{"from":948.54,"to":952.88,"location":2,"content":"So, we already saw how you break down the translation model into two separate parts."},{"from":952.88,"to":954.05,"location":2,"content":"Uh, but there was, you know,"},{"from":954.05,"to":957.98,"location":2,"content":"many more sub-components than that and often they had to be learned separately."},{"from":957.98,"to":961.65,"location":2,"content":"This meant the engineers had to do a lot of feature engineering."},{"from":961.65,"to":963.71,"location":2,"content":"Uh, you have to design features to capture"},{"from":963.71,"to":967.2,"location":2,"content":"the particular language phenomena that you are interested in."},{"from":967.2,"to":970.22,"location":2,"content":"So, this meant that they had to require a lot"},{"from":970.22,"to":972.83,"location":2,"content":"of compiling and maintaining of extra resources."},{"from":972.83,"to":974.45,"location":2,"content":"And in fact, you had to have, uh,"},{"from":974.45,"to":976.43,"location":2,"content":"different resources for different languages."},{"from":976.43,"to":979.91,"location":2,"content":"So, the work kind of multiplied the more languages you had."},{"from":979.91,"to":981.23,"location":2,"content":"An example of this,"},{"from":981.23,"to":983.9,"location":2,"content":"is you had to have uh, tables of equivalent phrases."},{"from":983.9,"to":987.15,"location":2,"content":"So, for example, if you're doing French and English translation, then, uh,"},{"from":987.15,"to":990.32,"location":2,"content":"they would be collecting these phrases of, uh, sorry,"},{"from":990.32,"to":993.9,"location":2,"content":"these tables of phrases that they considered similar and those were learned from the data."},{"from":993.9,"to":997.36,"location":2,"content":"But this was a lot of information that had to be stored and maintained."},{"from":997.36,"to":1000.97,"location":2,"content":"So overall, this was just a lot of human effort to maintain."},{"from":1000.97,"to":1002.53,"location":2,"content":"Uh, and again yes,"},{"from":1002.53,"to":1005.13,"location":2,"content":"you had to put more 
human effort in if you wanted to"},{"from":1005.13,"to":1008.47,"location":2,"content":"learn an SMT system for a new language pair."},{"from":1008.47,"to":1018.11,"location":2,"content":"Okay, are there any questions here about, uh, SMT?"},{"from":1018.11,"to":1021.66,"location":2,"content":"Okay. So, moving on, that's SMT."},{"from":1021.66,"to":1024.64,"location":2,"content":"Now we're going to move on to, uh, Section 2 of this lecture."},{"from":1024.64,"to":1029.91,"location":2,"content":"So, I want to take you back to the year 2014"},{"from":1029.91,"to":1032.37,"location":2,"content":"for a dramatic reenactment of what happened"},{"from":1032.37,"to":1035.4,"location":2,"content":"in the world of Machine Translation Research."},{"from":1035.4,"to":1037.8,"location":2,"content":"So, in 2014, something very dramatic happened,"},{"from":1037.8,"to":1040.93,"location":2,"content":"and that thing that happened is called Neural Machine Translation."},{"from":1040.93,"to":1043.68,"location":2,"content":"And I think it looked a little bit like this,"},{"from":1043.68,"to":1045.87,"location":2,"content":"if I'm not being too dramatic."},{"from":1045.87,"to":1048.81,"location":2,"content":"So, what is Neural Machine Translation?"},{"from":1048.81,"to":1052.58,"location":2,"content":"The idea is that NMT is a way to do machine translation,"},{"from":1052.58,"to":1055.74,"location":2,"content":"but using just a single neural network."},{"from":1055.74,"to":1059.7,"location":2,"content":"The neural network architecture that they used is called sequence-to-sequence,"},{"from":1059.7,"to":1061.23,"location":2,"content":"or sometimes it's just called seq2seq,"},{"from":1061.23,"to":1064.21,"location":2,"content":"uh, and it involves two RNNs."},{"from":1064.21,"to":1066.93,"location":2,"content":"So, uh, it's called sequence-to-sequence because you're mapping"},{"from":1066.93,"to":1070.77,"location":2,"content":"one sequence to the other- the source sentence to the target sentence,"},{"from":1070.77,"to":1072.39,"location":2,"content":"and you need two RNNs, basically,"},{"from":1072.39,"to":1074.67,"location":2,"content":"to handle those two different sentences."},{"from":1074.67,"to":1079.1,"location":2,"content":"All right.
Let's look at the diagram to see what sequence-to-sequence is in detail."},{"from":1079.1,"to":1083.37,"location":2,"content":"So, we start off with our source sentence and we're going to use our example from before,"},{"from":1083.37,"to":1085.38,"location":2,"content":"ah, il a m'entarté,"},{"from":1085.38,"to":1087.27,"location":2,"content":"which means he hit me with a pie."},{"from":1087.27,"to":1090.81,"location":2,"content":"So, we, uh, feed this into our encoder RNN,"},{"from":1090.81,"to":1092.82,"location":2,"content":"and, ah, this is- as you've seen before,"},{"from":1092.82,"to":1095.7,"location":2,"content":"I've drawn a unidirectional RNN,"},{"from":1095.7,"to":1097.26,"location":2,"content":"but this could be bidirectional,"},{"from":1097.26,"to":1099.05,"location":2,"content":"it also could be multi-layer,"},{"from":1099.05,"to":1102.67,"location":2,"content":"it could be another kind of RNN or it could be an LSTM and so on."},{"from":1102.67,"to":1108.24,"location":2,"content":"Another thing to note is that we're passing word embeddings into this encoder RNN,"},{"from":1108.24,"to":1111,"location":2,"content":"but I'm just not explicitly depicting that step."},{"from":1111,"to":1113.25,"location":2,"content":"[NOISE] Okay."},{"from":1113.25,"to":1116.43,"location":2,"content":"So, the idea of the encoder RNN is that it's going to"},{"from":1116.43,"to":1119.79,"location":2,"content":"produce some kind of encoding of this source sentence."},{"from":1119.79,"to":1123.15,"location":2,"content":"So, for now, let's assume that the encoding of the source sentence is going"},{"from":1123.15,"to":1127.32,"location":2,"content":"to be the final hidden state of this encoder RNN."},{"from":1127.32,"to":1131.7,"location":2,"content":"So, what happens next is we pass this encoding of the source sentence."},{"from":1131.7,"to":1134.06,"location":2,"content":"We pass it over to the decoder RNN,"},{"from":1134.06,"to":1136.61,"location":2,"content":"which is going to translate into English."},{"from":1136.61,"to":1139.63,"location":2,"content":"So the decoder RNN is a language model,"},{"from":1139.63,"to":1141.78,"location":2,"content":"in particular, it's a conditional language model,"},{"from":1141.78,"to":1143.31,"location":2,"content":"like we talked about last time."},{"from":1143.31,"to":1145.44,"location":2,"content":"So, it's conditional because it's going to produce"},{"from":1145.44,"to":1148.45,"location":2,"content":"the target sentence but conditioned on this encoding."},{"from":1148.45,"to":1152.43,"location":2,"content":"And the encoding is that vector that has the orange box around it."},{"from":1152.43,"to":1156.72,"location":2,"content":"So, how does this work?
Uh, we start off by feeding the start token"},{"from":1156.72,"to":1160.49,"location":2,"content":"into the decoder, and then, uh,"},{"from":1160.49,"to":1162.72,"location":2,"content":"we can get the first state of the decoder,"},{"from":1162.72,"to":1165.05,"location":2,"content":"because we're using the encoding of"},{"from":1165.05,"to":1168.74,"location":2,"content":"the source sentence as the initial hidden state for the decoder."},{"from":1168.74,"to":1171.24,"location":2,"content":"So then, we get our first output from"},{"from":1171.24,"to":1173.49,"location":2,"content":"the decoder which is a probability distribution of what"},{"from":1173.49,"to":1174.8,"location":2,"content":"word might come next."},{"from":1174.8,"to":1177.13,"location":2,"content":"And then we're supposed to take the argmax over that,"},{"from":1177.13,"to":1178.89,"location":2,"content":"and then that gets us the word, uh,"},{"from":1178.89,"to":1183.36,"location":2,"content":"\"He\", which in this case is correct because that's probably the word you should start with."},{"from":1183.36,"to":1185.97,"location":2,"content":"Okay, so then we just take the word \"he\" and then we"},{"from":1185.97,"to":1188.79,"location":2,"content":"feed it back into the decoder on the next step."},{"from":1188.79,"to":1190.72,"location":2,"content":"And then we do the same thing again."},{"from":1190.72,"to":1193.96,"location":2,"content":"We take argmax and we get a new word, and we get, he hit."},{"from":1193.96,"to":1196.92,"location":2,"content":"The idea is that you can co- uh, continue doing this,"},{"from":1196.92,"to":1200.2,"location":2,"content":"ah, operation and in that way you're going to generate, uh,"},{"from":1200.2,"to":1202.34,"location":2,"content":"your target sentence, uh,"},{"from":1202.34,"to":1205.18,"location":2,"content":"which will be something like \"He hit me with a pie\","},{"from":1205.18,"to":1209.69,"location":2,"content":"and you stop once your decoder produces the end token."},{"from":1209.69,"to":1213.48,"location":2,"content":"So an important thing to note here is that"},{"from":1213.48,"to":1216.48,"location":2,"content":"this picture is showing you what happens at test time."},{"from":1216.48,"to":1218.64,"location":2,"content":"This shows you how to generate text."},{"from":1218.64,"to":1220.34,"location":2,"content":"This isn't what happens during training."},{"from":1220.34,"to":1222.32,"location":2,"content":"I'll show you what happens during training later."},{"from":1222.32,"to":1223.61,"location":2,"content":"Uh, but this thing with the,"},{"from":1223.61,"to":1225.99,"location":2,"content":"the pink dotted arrows where you feed the word back in,"},{"from":1225.99,"to":1229.24,"location":2,"content":"this is what you do to generate text at test-time."},{"from":1229.24,"to":1235.34,"location":2,"content":"Any questions on this? Uh, oh,"},{"from":1235.34,"to":1239.94,"location":2,"content":"another thing I should note is that you need two separate sets of word embeddings, right?"},{"from":1239.94,"to":1243.36,"location":2,"content":"You need word embeddings for French words and you need English word embeddings."},{"from":1243.36,"to":1244.65,"location":2,"content":"That's kind of two separate sets,"},{"from":1244.65,"to":1248.94,"location":2,"content":"two separate vocabularies, um, yeah."},{"from":1248.94,"to":1251.65,"location":2,"content":"Okay.
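A minimal sketch of the picture just walked through, with hypothetical toy sizes and token ids (this is not the assignment's model): a unidirectional GRU encoder whose final hidden state initializes a GRU decoder, plus the test-time argmax loop that feeds each generated word back in.

```python
import torch
import torch.nn as nn

V_SRC, V_TGT, E, H = 100, 100, 32, 64   # toy vocab/embedding/hidden sizes
START, END = 1, 2                        # assumed special token ids

enc_emb = nn.Embedding(V_SRC, E)         # two separate embedding sets,
dec_emb = nn.Embedding(V_TGT, E)         # one per vocabulary
encoder = nn.GRU(E, H, batch_first=True)
decoder = nn.GRU(E, H, batch_first=True)
out_proj = nn.Linear(H, V_TGT)           # hidden state -> next-word scores

def greedy_generate(src_ids, max_len=20):
    _, h = encoder(enc_emb(src_ids))     # final hidden state = the encoding
    y, out = torch.tensor([[START]]), []
    for _ in range(max_len):
        _, h = decoder(dec_emb(y), h)    # one decoder step
        y = out_proj(h[-1]).argmax(-1, keepdim=True)   # test-time argmax
        if y.item() == END:
            break
        out.append(y.item())             # this word is fed back in next loop
    return out

print(greedy_generate(torch.randint(3, V_SRC, (1, 4))))  # untrained: gibberish
```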
So, as a side note,"},{"from":1251.65,"to":1254.58,"location":2,"content":"this architecture called sequence-to-sequence is actually pretty versatile."},{"from":1254.58,"to":1256.77,"location":2,"content":"It's not just a machine translation architecture."},{"from":1256.77,"to":1258.73,"location":2,"content":"Uh, you can, ah,"},{"from":1258.73,"to":1263.52,"location":2,"content":"phrase quite a few NLP tasks as sequence-to-sequence tasks."},{"from":1263.52,"to":1265.59,"location":2,"content":"So, for example, summarization is"},{"from":1265.59,"to":1270.11,"location":2,"content":"a sequence-to-sequence task because in goes your long text and out comes your short text."},{"from":1270.11,"to":1272.61,"location":2,"content":"Uh, dialogue can be seq2seq because in"},{"from":1272.61,"to":1275.31,"location":2,"content":"goes the previous utterance and out comes your next utterance."},{"from":1275.31,"to":1279.9,"location":2,"content":"Uh, parsing can even be thought of as a sequence-to-sequence task because you could"},{"from":1279.9,"to":1284.71,"location":2,"content":"say in goes the input text and the output parse is going to be expressed as a sequence."},{"from":1284.71,"to":1286.35,"location":2,"content":"This might not be the best way to do parsing,"},{"from":1286.35,"to":1288.02,"location":2,"content":"but it is a way you can try."},{"from":1288.02,"to":1291.46,"location":2,"content":"Lastly, you could even do something like code generation."},{"from":1291.46,"to":1294.16,"location":2,"content":"So, suppose you want to build a system that takes some kind of"},{"from":1294.16,"to":1298.58,"location":2,"content":"natural language inputs such as: sum up the numbers from 1 to 10,"},{"from":1298.58,"to":1299.88,"location":2,"content":"and then outputs, let's say,"},{"from":1299.88,"to":1305.37,"location":2,"content":"some Python code that says sum(range(10)), or something like that."},{"from":1305.37,"to":1307.17,"location":2,"content":"So, if you wanted to train,"},{"from":1307.17,"to":1309.38,"location":2,"content":"um, a system to do this, you could,"},{"from":1309.38,"to":1313.72,"location":2,"content":"in a way, view that as a translation task where you're translating from English to Python."},{"from":1313.72,"to":1315.96,"location":2,"content":"It's a pretty challenging translation task,"},{"from":1315.96,"to":1318.49,"location":2,"content":"it probably requires a lot more logic than just, uh, you know,"},{"from":1318.49,"to":1321.35,"location":2,"content":"French to English, but you can try and people have tried."},{"from":1321.35,"to":1329.51,"location":2,"content":"There are research papers where people have used seq2seq to do this kind of task."},{"from":1329.51,"to":1332.23,"location":2,"content":"Okay.
So, to recap,"},{"from":1332.23,"to":1335.23,"location":2,"content":"seq2seq is an example of a conditional language model."},{"from":1335.23,"to":1337.95,"location":2,"content":"It's a language model because the decoder is"},{"from":1337.95,"to":1340.89,"location":2,"content":"a language model that's predicting the next target word."},{"from":1340.89,"to":1343.17,"location":2,"content":"But it's a conditional language model because it's"},{"from":1343.17,"to":1345.56,"location":2,"content":"also conditioning on your source sentence,"},{"from":1345.56,"to":1350.15,"location":2,"content":"which is represented by the encoding of the source sentence."},{"from":1350.15,"to":1352.39,"location":2,"content":"So, you could look- er,"},{"from":1352.39,"to":1353.7,"location":2,"content":"you could view it like this."},{"from":1353.7,"to":1356.7,"location":2,"content":"NMT is directly calculating the probability"},{"from":1356.7,"to":1359.82,"location":2,"content":"of the target sentence y given the source sentence x."},{"from":1359.82,"to":1362.41,"location":2,"content":"So, if you look at this, you see that this is just, uh,"},{"from":1362.41,"to":1365.09,"location":2,"content":"breaking down the probability of the sequence y,"},{"from":1365.09,"to":1366.62,"location":2,"content":"which we suppose is of length, uh,"},{"from":1366.62,"to":1370.08,"location":2,"content":"T. You can break it down into the probability of"},{"from":1370.08,"to":1374.19,"location":2,"content":"the first word of y given x and then the probability of the second word of y given,"},{"from":1374.19,"to":1376.9,"location":2,"content":"ah, the words that came before and x, and so on."},{"from":1376.9,"to":1380.34,"location":2,"content":"So, in fact, you can see that each of the terms in this product on the right,"},{"from":1380.34,"to":1382.68,"location":2,"content":"those are probabilities of the next target word,"},{"from":1382.68,"to":1385.37,"location":2,"content":"given all the ones so far and also the source sentence."},{"from":1385.37,"to":1389.65,"location":2,"content":"And that's exactly the conditional probability that your language model produces."},{"from":1389.65,"to":1391.82,"location":2,"content":"So, the reason I'm highlighting this,"},{"from":1391.82,"to":1392.88,"location":2,"content":"is because if you remember,"},{"from":1392.88,"to":1398.19,"location":2,"content":"in SMT, we didn't directly learn the translation model P of y given x."},{"from":1398.19,"to":1402.21,"location":2,"content":"We broke it down into smaller components."},{"from":1402.21,"to":1403.99,"location":2,"content":"Whereas here in NMT,"},{"from":1403.99,"to":1406.5,"location":2,"content":"we are directly learning this model."},{"from":1406.5,"to":1409.34,"location":2,"content":"And this is in some ways an advantage because it's simpler to do."},{"from":1409.34,"to":1412.55,"location":2,"content":"You don't have to learn all these different systems and optimize them separately."},{"from":1412.55,"to":1416.69,"location":2,"content":"It's, uh, kind of simpler and easier."},{"from":1416.69,"to":1419.85,"location":2,"content":"So, uh, this is- this is the model that we're learning."},{"from":1419.85,"to":1421.05,"location":2,"content":"Uh, the question is,"},{"from":1421.05,"to":1423.16,"location":2,"content":"'how do we train this NMT system?'"},{"from":1423.16,"to":1426.16,"location":2,"content":"So, hopefully you should already have a good idea of how this would work"},{"from":1426.16,"to":1429.05,"location":2,"content":"given that
we've already seen how you would train a language model."},{"from":1429.05,"to":1430.8,"location":2,"content":"But here are the details, just in case."},{"from":1430.8,"to":1433.56,"location":2,"content":"So, you get your big co- parallel corpus, uh,"},{"from":1433.56,"to":1438.87,"location":2,"content":"and then, uh, let's say you have your sentence pair from your parallel corpus."},{"from":1438.87,"to":1441.9,"location":2,"content":"Uh, so this is what happens during training."},{"from":1441.9,"to":1445.59,"location":2,"content":"You feed your source sentence into the encoder RNN, ah,"},{"from":1445.59,"to":1449.46,"location":2,"content":"and then you feed your target sentence into the decoder RNN,"},{"from":1449.46,"to":1450.72,"location":2,"content":"and you're going to pass over"},{"from":1450.72,"to":1454.01,"location":2,"content":"that final hidden state to be the initial hidden state of the decoder."},{"from":1454.01,"to":1458.43,"location":2,"content":"And then for every step of the decoder RNN,"},{"from":1458.43,"to":1459.99,"location":2,"content":"you're going to produce the, ah,"},{"from":1459.99,"to":1462.03,"location":2,"content":"probability distribution of what comes next,"},{"from":1462.03,"to":1463.77,"location":2,"content":"which is, ah, the y hats."},{"from":1463.77,"to":1465.38,"location":2,"content":"And then from those,"},{"from":1465.38,"to":1466.88,"location":2,"content":"you can compute your loss,"},{"from":1466.88,"to":1469.32,"location":2,"content":"and the loss is just the same as we saw for,"},{"from":1469.32,"to":1471.15,"location":2,"content":"um, unconditional language models."},{"from":1471.15,"to":1474.02,"location":2,"content":"It's, uh, the cross entropy or you could also say"},{"from":1474.02,"to":1477.33,"location":2,"content":"negative log-likelihood of the true next word."},{"from":1477.33,"to":1479.71,"location":2,"content":"So, for example, on the selected ones, uh,"},{"from":1479.71,"to":1485.31,"location":2,"content":"the loss is the negative log probability of the correct next word [NOISE]."},{"from":1485.31,"to":1487.68,"location":2,"content":"And then, as before, we're going to average all of"},{"from":1487.68,"to":1492.41,"location":2,"content":"these losses to get the total loss for the example."},{"from":1492.41,"to":1495.87,"location":2,"content":"So, I think you might notice people saying in, for example,"},{"from":1495.87,"to":1498.75,"location":2,"content":"research papers is this phrase end-to-end."},{"from":1498.75,"to":1502.41,"location":2,"content":"And, uh, this is an example of learning a system end-to-end."},{"from":1502.41,"to":1504.18,"location":2,"content":"And what we mean by this is that,"},{"from":1504.18,"to":1506.61,"location":2,"content":"the back propagation is happening end-to-end."},{"from":1506.61,"to":1508.99,"location":2,"content":"One end is- is losses, the loss functions,"},{"from":1508.99,"to":1512.92,"location":2,"content":"and the other end I guess is kind of like the- the beginning of the encoder RNN."},{"from":1512.92,"to":1514.21,"location":2,"content":"The point is that you,"},{"from":1514.21,"to":1515.92,"location":2,"content":"um, back propagation, uh,"},{"from":1515.92,"to":1519.19,"location":2,"content":"flows throughout the entire system and you learn"},{"from":1519.19,"to":1523.2,"location":2,"content":"the entire system with respect to this single loss. 
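In symbols, the factorization and per-example training loss just described, with T the target length and y_t the true next word:

```latex
P(y \mid x) = \prod_{t=1}^{T} P(y_t \mid y_1, \ldots, y_{t-1}, x),
\qquad
J = -\frac{1}{T} \sum_{t=1}^{T} \log P(y_t \mid y_1, \ldots, y_{t-1}, x)
```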
Yeah."},{"from":1523.2,"to":1529.71,"location":2,"content":"If the decoder outputs the [NOISE] how you can't handle the,"},{"from":1529.71,"to":1534.38,"location":2,"content":"the loss, would it get to accrue by [inaudible]"},{"from":1534.38,"to":1541.04,"location":2,"content":"The question is if the decoder RNN outputs the end token too early,"},{"from":1541.04,"to":1543.84,"location":2,"content":"then how can you measure the loss on,"},{"from":1543.84,"to":1545.7,"location":2,"content":"uh, the words that came after that?"},{"from":1545.7,"to":1548.85,"location":2,"content":"So this is the difference between training time and test time,"},{"from":1548.85,"to":1549.99,"location":2,"content":"which is pretty confusing."},{"from":1549.99,"to":1555.96,"location":2,"content":"So, uh, during training we have this picture where you feed the token back in."},{"from":1555.96,"to":1558.16,"location":2,"content":"So in this scenario once you produce emds,"},{"from":1558.16,"to":1562.08,"location":2,"content":"then you have to stop because you can't feed end in as the input on the next step."},{"from":1562.08,"to":1567.05,"location":2,"content":"But in training you don't feed the thing that you've produced into the next step."},{"from":1567.05,"to":1571.05,"location":2,"content":"During training you feed the target sentence from the corpus."},{"from":1571.05,"to":1574.42,"location":2,"content":"So like the goal target sentence into the model."},{"from":1574.42,"to":1576.72,"location":2,"content":"So, no matter what the, uh,"},{"from":1576.72,"to":1579.21,"location":2,"content":"the decoder predicts on a step,"},{"from":1579.21,"to":1584.15,"location":2,"content":"you kind of- you don't use that for anything other than computing loss."},{"from":1584.15,"to":1586.74,"location":2,"content":"Any other questions? 
Uh, yeah."},{"from":1586.74,"to":1589.98,"location":2,"content":"Is there a reason why you would- back propagate"},{"from":1589.98,"to":1593.89,"location":2,"content":"end-to-end instead of maybe training an encoder on the English model,"},{"from":1593.89,"to":1597.69,"location":2,"content":"and then, try maybe training a decoder separately and then putting them together?"},{"from":1597.69,"to":1601.53,"location":2,"content":"The question is, is there a reason why you would want to train end-to-end when,"},{"from":1601.53,"to":1605.2,"location":2,"content":"for example, you might want to train the encoder and the decoder separately?"},{"from":1605.2,"to":1608.58,"location":2,"content":"Uh, so I think, uh, people view training end-to-end is favorable,"},{"from":1608.58,"to":1612,"location":2,"content":"because the idea is that you can optimize the system as a whole."},{"from":1612,"to":1614.61,"location":2,"content":"You might think that if you optimize the parts separately,"},{"from":1614.61,"to":1618.08,"location":2,"content":"then when you put them together they will not be optimal together necessarily."},{"from":1618.08,"to":1621.96,"location":2,"content":"So, if possible, directly optimizing the thing that you care about- about,"},{"from":1621.96,"to":1623.46,"location":2,"content":"with respect to all of the parameters,"},{"from":1623.46,"to":1625.11,"location":2,"content":"is more likely to succeed."},{"from":1625.11,"to":1627.4,"location":2,"content":"However, there is a notion of pre-training,"},{"from":1627.4,"to":1629.84,"location":2,"content":"and as you said, maybe you'd want to learn your, um,"},{"from":1629.84,"to":1632.34,"location":2,"content":"decoder RNN as a kind of,"},{"from":1632.34,"to":1633.57,"location":2,"content":"uh, a language model,"},{"from":1633.57,"to":1635.3,"location":2,"content":"an unconditional language model by itself."},{"from":1635.3,"to":1636.81,"location":2,"content":"And that's something that people do."},{"from":1636.81,"to":1640.5,"location":2,"content":"You might, uh, learn a very strong language model and then use that to"},{"from":1640.5,"to":1644.6,"location":2,"content":"initialize your decoder RNN and then fine tune it on your task."},{"from":1644.6,"to":1648.83,"location":2,"content":"That's a- a valid thing you might try to do. 
Yep."},{"from":1648.83,"to":1651,"location":2,"content":"So, are these windows fixed?"},{"from":1651,"to":1656.43,"location":2,"content":"Like, are you always feeding the RNN [inaudible] to decode?"},{"from":1656.43,"to":1660.51,"location":2,"content":"The question is, is the length of the source sentence"},{"from":1660.51,"to":1662.28,"location":2,"content":"and the length of the target sentence fixed?"},{"from":1662.28,"to":1664.71,"location":2,"content":"So, for example, is this source sentence always length four?"},{"from":1664.71,"to":1667.32,"location":2,"content":"Um, no, that's definitely not true because in"},{"from":1667.32,"to":1669.9,"location":2,"content":"your parallel corpus you're gonna have sentences of all lengths."},{"from":1669.9,"to":1673.72,"location":2,"content":"Uh, so this is more kind of an implementation or a practicality question."},{"from":1673.72,"to":1676.8,"location":2,"content":"Uh, the idea is that, this is what you mathematically want to be"},{"from":1676.8,"to":1681.02,"location":2,"content":"computing during training for each example and you're going to have batches of examples."},{"from":1681.02,"to":1684.88,"location":2,"content":"But the question is, how do you actually implement that in, uh, in practice?"},{"from":1684.88,"to":1686.43,"location":2,"content":"So what you usually do,"},{"from":1686.43,"to":1689.07,"location":2,"content":"just because it's easier to assume that your batch is"},{"from":1689.07,"to":1692.54,"location":2,"content":"this kind of even sized tensor where everything is the same length,"},{"from":1692.54,"to":1695.2,"location":2,"content":"is you pad any short sentences,"},{"from":1695.2,"to":1697.52,"location":2,"content":"up to some predefined maximum length or"},{"from":1697.52,"to":1700.78,"location":2,"content":"maybe the length of the maximum example in your batch,"},{"from":1700.78,"to":1705.47,"location":2,"content":"and then you make sure that you don't use any hidden states that"},{"from":1705.47,"to":1711.93,"location":2,"content":"came from the padding. 
Yep."},{"from":1711.93,"to":1723.6,"location":2,"content":"Wouldn't you have to train every two images together,"},{"from":1723.6,"to":1728.68,"location":2,"content":"that would be kind of universal between similar languages or something like that?"},{"from":1728.68,"to":1730.26,"location":2,"content":"Okay, so the question,"},{"from":1730.26,"to":1731.94,"location":2,"content":"I think, is, um,"},{"from":1731.94,"to":1734.07,"location":2,"content":"it seems like sometimes you wouldn't want to train"},{"from":1734.07,"to":1736.17,"location":2,"content":"things end-to-end and there are circumstances in which you"},{"from":1736.17,"to":1740.2,"location":2,"content":"might want to train things separately and you mentioned for example having,"},{"from":1740.2,"to":1741.81,"location":2,"content":"uh, different languages mapped to each other."},{"from":1741.81,"to":1744.16,"location":2,"content":"So this is a totally valid point and in fact,"},{"from":1744.16,"to":1746.79,"location":2,"content":"so far we've kind of assumed that you want to learn"},{"from":1746.79,"to":1749.41,"location":2,"content":"language A to language B as a pair, right?"},{"from":1749.41,"to":1753.08,"location":2,"content":"And that's different to language A to language C or even language B to language A."},{"from":1753.08,"to":1757.55,"location":2,"content":"And, um, that does mean you have kind of n-squared many systems in the number of,"},{"from":1757.55,"to":1758.91,"location":2,"content":"uh, languages you're considering."},{"from":1758.91,"to":1762.49,"location":2,"content":"So, yeah, that's actually a valid idea and this is something that people have researched."},{"from":1762.49,"to":1764.64,"location":2,"content":"The idea that maybe you could have a kind of mix"},{"from":1764.64,"to":1766.8,"location":2,"content":"and match with your encoders and decoders,"},{"from":1766.8,"to":1768.29,"location":2,"content":"and you could try to, uh,"},{"from":1768.29,"to":1770.43,"location":2,"content":"train a kind of general purpose, let's say,"},{"from":1770.43,"to":1774.35,"location":2,"content":"English decoder and then match it up with your different encoders."},{"from":1774.35,"to":1775.77,"location":2,"content":"Uh, but this is, I think,"},{"from":1775.77,"to":1778.93,"location":2,"content":"fairly complex to train and to- to make sure that they all work together."},{"from":1778.93,"to":1781.28,"location":2,"content":"But that- that is certainly something that people have done."},{"from":1781.28,"to":1783.86,"location":2,"content":"Let me just check on the time."},{"from":1783.86,"to":1787.34,"location":2,"content":"Oh, okay, let's take one more question. 
Yep."},{"from":1787.34,"to":1793.31,"location":2,"content":"So doesn't word embedding also come from the same corpus that we are training on?"},{"from":1793.31,"to":1795.99,"location":2,"content":"The question is, does the word embedding"},{"from":1795.99,"to":1797.92,"location":2,"content":"also come from the corpus that you're training on?"},{"from":1797.92,"to":1800.71,"location":2,"content":"So, I think, there's a few options just as we saw with language models."},{"from":1800.71,"to":1802.38,"location":2,"content":"You could download, uh,"},{"from":1802.38,"to":1806.27,"location":2,"content":"pre-trained word vectors like Word2Vec or GloVE and you could use those,"},{"from":1806.27,"to":1807.87,"location":2,"content":"and then you can either kind of freeze them,"},{"from":1807.87,"to":1810.81,"location":2,"content":"or you could fine tune them as part of the end-to-end training,"},{"from":1810.81,"to":1813.53,"location":2,"content":"or you could just initialize your word vectors as,"},{"from":1813.53,"to":1817.14,"location":2,"content":"uh, you know, close to zero random and then learn them from scratch."},{"from":1817.14,"to":1819.59,"location":2,"content":"All right, okay, moving on."},{"from":1819.59,"to":1822.39,"location":2,"content":"Uh, so now we understand how you would train"},{"from":1822.39,"to":1825.15,"location":2,"content":"a neural machine translation system and we talked"},{"from":1825.15,"to":1828.78,"location":2,"content":"briefly about how you might do decoding or generation."},{"from":1828.78,"to":1832.02,"location":2,"content":"So what I showed you before is something called, uh, greedy decoding,"},{"from":1832.02,"to":1835.11,"location":2,"content":"which is this idea that on each step you just choose the argmax,"},{"from":1835.11,"to":1836.4,"location":2,"content":"the top one best word,"},{"from":1836.4,"to":1838.75,"location":2,"content":"and then you feed that in on the next step."},{"from":1838.75,"to":1842.39,"location":2,"content":"So this is called greedy decoding because you're just taking the best- uh,"},{"from":1842.39,"to":1844.31,"location":2,"content":"the best option that you can see right"},{"from":1844.31,"to":1847.17,"location":2,"content":"now and then you don't really have a way to go back."},{"from":1847.17,"to":1850.84,"location":2,"content":"So can anyone see a problem with this method?"},{"from":1850.84,"to":1854.66,"location":2,"content":"Maybe I've kind of given it away, but, uh- Yeah."},{"from":1854.66,"to":1860.13,"location":2,"content":"Too expensive, too complex."},{"from":1860.13,"to":1861.63,"location":2,"content":"You said too expensive."},{"from":1861.63,"to":1865.08,"location":2,"content":"I guess, I mean, it is expensive in that you have to do a sequence,"},{"from":1865.08,"to":1867.43,"location":2,"content":"and a sequence is usually worse than something you can do in parallel,"},{"from":1867.43,"to":1868.68,"location":2,"content":"but I suppose, um, maybe,"},{"from":1868.68,"to":1870,"location":2,"content":"what's wrong with the greediness?"},{"from":1870,"to":1871.83,"location":2,"content":"Can anyone suggests what's wrong with the greediness? 
Yeah."},{"from":1871.83,"to":1876.1,"location":2,"content":"When we take argmax [inaudible]."},{"from":1876.1,"to":1878.97,"location":2,"content":"Yes, that's when you take an argmax on a token, it's"},{"from":1878.97,"to":1881.81,"location":2,"content":"not necessarily going to give you the argmax over the entire sentence."},{"from":1881.81,"to":1883.34,"location":2,"content":"That's exactly right, that's, uh,"},{"from":1883.34,"to":1886.13,"location":2,"content":"kind of, what, uh, what greediness means."},{"from":1886.13,"to":1889.08,"location":2,"content":"So in practice, this might give you something like this."},{"from":1889.08,"to":1893.16,"location":2,"content":"Uh, we're trying to translate our running example sentence and then, let's suppose,"},{"from":1893.16,"to":1895.32,"location":2,"content":"on the first step we say, he, and then we say,"},{"from":1895.32,"to":1896.98,"location":2,"content":"he hit, and then he- we say,"},{"from":1896.98,"to":1898.44,"location":2,"content":"he hit a, oh no."},{"from":1898.44,"to":1900.62,"location":2,"content":"That wasn't right, that wasn't the best thing to choose,"},{"from":1900.62,"to":1902.85,"location":2,"content":"but we kind of have no way to go back now, right?"},{"from":1902.85,"to":1906.27,"location":2,"content":"We just have to continue and try to make the best of it after saying he hit a,"},{"from":1906.27,"to":1908.73,"location":2,"content":"which, uh, isn't going to work out well."},{"from":1908.73,"to":1911.28,"location":2,"content":"So that's the main problem with greedy decoding; there's, kind of,"},{"from":1911.28,"to":1913.96,"location":2,"content":"no way to backtrack, no way to go back."},{"from":1913.96,"to":1916.45,"location":2,"content":"So, how can we fix this?"},{"from":1916.45,"to":1917.91,"location":2,"content":"And this relates back to, uh,"},{"from":1917.91,"to":1919.8,"location":2,"content":"what I told you earlier about how we might use,"},{"from":1919.8,"to":1924.32,"location":2,"content":"uh, a kind of searching algorithm to do decoding in SMT."},{"from":1924.32,"to":1927.8,"location":2,"content":"Uh, but first, you might,"},{"from":1927.8,"to":1929.98,"location":2,"content":"uh, think exhaustive search is a good idea."},{"from":1929.98,"to":1933.12,"location":2,"content":"Well, probably not because it's still a bad idea for the same reasons as before."},{"from":1933.12,"to":1935.76,"location":2,"content":"So, if you did want to do exhaustive search and search through"},{"from":1935.76,"to":1938.6,"location":2,"content":"the space of all possible French translations, uh,"},{"from":1938.6,"to":1939.9,"location":2,"content":"then you would be again,"},{"from":1939.9,"to":1943.02,"location":2,"content":"trying to consider which Y maximizes,"},{"from":1943.02,"to":1946.88,"location":2,"content":"uh, this product of all of these individual probability distributions."},{"from":1946.88,"to":1950.1,"location":2,"content":"So as before, if you tried to do this, uh,"},{"from":1950.1,"to":1954.16,"location":2,"content":"then on each step T of the decoder you're going to be having to track, uh,"},{"from":1954.16,"to":1957.72,"location":2,"content":"V to the power of t possible partial translations,"},{"from":1957.72,"to":1960.32,"location":2,"content":"uh, where V is your vocabulary size."},{"from":1960.32,"to":1963.06,"location":2,"content":"So here when I say partial translation, I just mean, uh,"},{"from":1963.06,"to":1966.91,"location":2,"content":"kind of, you know, like half of a sentence so far or something like 
that."},{"from":1966.91,"to":1968.85,"location":2,"content":"So of course this, uh,"},{"from":1968.85,"to":1972,"location":2,"content":"exponential in V complexity is just far too expensive."},{"from":1972,"to":1974.79,"location":2,"content":"So, yes, we're going to use some kind of search algorithm."},{"from":1974.79,"to":1976.1,"location":2,"content":"And in particular we're going to use,"},{"from":1976.1,"to":1977.95,"location":2,"content":"uh, Beam search decoding."},{"from":1977.95,"to":1982.47,"location":2,"content":"So the core idea of Beam search decoding is that on each step of"},{"from":1982.47,"to":1988.52,"location":2,"content":"the decoder you're going to be keeping track of the k most probable partial translations."},{"from":1988.52,"to":1990.97,"location":2,"content":"And we call partial translations hypotheses"},{"from":1990.97,"to":1993.08,"location":2,"content":"because we're kind of tracking multiple of them,"},{"from":1993.08,"to":1994.41,"location":2,"content":"and we're not sure which one is best,"},{"from":1994.41,"to":1996.56,"location":2,"content":"so we're thinking about several."},{"from":1996.56,"to":1998.54,"location":2,"content":"Here k is an"},{"from":1998.54,"to":2001.22,"location":2,"content":"integer and we call this the beam size."},{"from":2001.22,"to":2002.84,"location":2,"content":"And in practice, for an MT,"},{"from":2002.84,"to":2005.09,"location":2,"content":"this is usually maybe 5 to 10."},{"from":2005.09,"to":2007.84,"location":2,"content":"So, you can think of k kind of as,"},{"from":2007.84,"to":2010.34,"location":2,"content":"how big is your search space at any one time."},{"from":2010.34,"to":2013.19,"location":2,"content":"So if you increase k, then you're going to be considering, uh,"},{"from":2013.19,"to":2015.62,"location":2,"content":"more different options on each step and you might"},{"from":2015.62,"to":2018.41,"location":2,"content":"hope that this will mean that you get a better quality solution in the end,"},{"from":2018.41,"to":2021.01,"location":2,"content":"though of course it will be more expansive."},{"from":2021.01,"to":2024.02,"location":2,"content":"So, I said that we want to keep track of"},{"from":2024.02,"to":2027.85,"location":2,"content":"the k most probable partial translations, that is, hypotheses."},{"from":2027.85,"to":2030.21,"location":2,"content":"So this means that we need some kind of notion of, you know,"},{"from":2030.21,"to":2033.17,"location":2,"content":"how probable is this hypothesis or what's its score."},{"from":2033.17,"to":2036.05,"location":2,"content":"So, the score of the hypothesis, and uh,"},{"from":2036.05,"to":2039.88,"location":2,"content":"we're representing that as Y1 up to Yt,"},{"from":2039.88,"to":2042.98,"location":2,"content":"is just its log probability."},{"from":2042.98,"to":2046.91,"location":2,"content":"So, uh, the log probability of this partial translation,"},{"from":2046.91,"to":2048.49,"location":2,"content":"uh, according to the language model,"},{"from":2048.49,"to":2051.01,"location":2,"content":"can be broken down as we saw before into the sum"},{"from":2051.01,"to":2053.74,"location":2,"content":"of the individual log probabilities of the words,"},{"from":2053.74,"to":2056.92,"location":2,"content":"given everything that came before."},{"from":2056.92,"to":2059.38,"location":2,"content":"So it's- if it's not obvious,"},{"from":2059.38,"to":2061.03,"location":2,"content":"these scores are all negative,"},{"from":2061.03,"to":2062.09,"location":2,"content":"because we're 
taking log of,"},{"from":2062.09,"to":2064.07,"location":2,"content":"uh, of a number between zero and one."},{"from":2064.07,"to":2070.16,"location":2,"content":"Uh, and a higher score is better, yes,"},{"from":2070.16,"to":2073.46,"location":2,"content":"because you want a higher probability of, uh,"},{"from":2073.46,"to":2077.45,"location":2,"content":"of the hypothesis according to the language model."},{"from":2077.45,"to":2080.66,"location":2,"content":"So, the idea is that we're going to use this score, uh,"},{"from":2080.66,"to":2082.19,"location":2,"content":"and the search algorithm to search for"},{"from":2082.19,"to":2086.26,"location":2,"content":"high-scoring hypotheses and we're going to track the top k on each step."},{"from":2086.26,"to":2089.27,"location":2,"content":"So, I'll show- I'm going to show you a detailed example in a moment."},{"from":2089.27,"to":2091.49,"location":2,"content":"But the, uh, important things to note are"},{"from":2091.49,"to":2094.91,"location":2,"content":"that Beam search is not guaranteed to find an optimal solution."},{"from":2094.91,"to":2097.28,"location":2,"content":"Uh, exhaustive search, the one where you numate-"},{"from":2097.28,"to":2099.8,"location":2,"content":"enumerate all V to the T possible translations,"},{"from":2099.8,"to":2102.11,"location":2,"content":"that is guaranteed to find the optimal solution but it's just"},{"from":2102.11,"to":2104.96,"location":2,"content":"completely infeasible because it's so expe- expensive."},{"from":2104.96,"to":2108.05,"location":2,"content":"So Beam search is not guaranteed to find the optimal solution"},{"from":2108.05,"to":2111.86,"location":2,"content":"but it is a much more efficient and exhaustive search, of course."},{"from":2111.86,"to":2118.05,"location":2,"content":"All right, um, will- is the question- would the question be solved by seeing an example?"},{"from":2118.05,"to":2120.17,"location":2,"content":"Uh, I was just wondering, um,"},{"from":2120.17,"to":2122.36,"location":2,"content":"if you guys [NOISE] also find partial filtering or it gets-"},{"from":2122.36,"to":2125.8,"location":2,"content":"[inaudible] ."},{"from":2125.8,"to":2133.2,"location":2,"content":"Um, I'm not entirely sure about that."},{"from":2133.2,"to":2135.7,"location":2,"content":"I mean you certainly do some kind of sampling sometimes um,"},{"from":2135.7,"to":2137.5,"location":2,"content":"and that's something I'm gonna talk more about later in"},{"from":2137.5,"to":2139.39,"location":2,"content":"the natural language generation lecture."},{"from":2139.39,"to":2143.67,"location":2,"content":"Um, so I haven't heard of those being applied here."},{"from":2143.67,"to":2145.39,"location":2,"content":"Okay."},{"from":2145.39,"to":2149.54,"location":2,"content":"Okay. 
Um, so here's an example of beam search decoding in action."},{"from":2149.54,"to":2153.24,"location":2,"content":"Um, so let's suppose that beam size k, um, is 2."},{"from":2153.24,"to":2154.73,"location":2,"content":"And then as a reminder,"},{"from":2154.73,"to":2159.64,"location":2,"content":"we have- this is the score that you apply to a partial, um, hypothesis."},{"from":2159.64,"to":2162.59,"location":2,"content":"Um, partial translation which is a hypothesis."},{"from":2162.59,"to":2165.23,"location":2,"content":"So we start off with our starting token."},{"from":2165.23,"to":2167.24,"location":2,"content":"And the idea is that we're going to compute"},{"from":2167.24,"to":2170.42,"location":2,"content":"the probability distribution of what word might come next."},{"from":2170.42,"to":2174.53,"location":2,"content":"So, having computed that probability distribution using our seq2seq model,"},{"from":2174.53,"to":2176.09,"location":2,"content":"then we just take the top k,"},{"from":2176.09,"to":2178.1,"location":2,"content":"that is top two possible options."},{"from":2178.1,"to":2181.36,"location":2,"content":"So let's suppose that the top two are the words he and I."},{"from":2181.36,"to":2185.57,"location":2,"content":"So the idea is that we can compute the score of these two hypotheses,"},{"from":2185.57,"to":2188.21,"location":2,"content":"uh, by using the formula above."},{"from":2188.21,"to":2192.49,"location":2,"content":"It's just the log probability of this word given the context so far."},{"from":2192.49,"to":2197.8,"location":2,"content":"So here let's say that he has a score of minus 0.7 and I has a score of minus 0.9."},{"from":2197.8,"to":2200.39,"location":2,"content":"So this means that he is currently the better one."},{"from":2200.39,"to":2202.2,"location":2,"content":"Okay. 
So what we do is,"},{"from":2202.2,"to":2205.32,"location":2,"content":"we have our two, our k hypotheses."},{"from":2205.32,"to":2207.36,"location":2,"content":"And then for each of those,"},{"from":2207.36,"to":2210.8,"location":2,"content":"we find the top k words that could come next."},{"from":2210.8,"to":2212.66,"location":2,"content":"And we calculate their scores."},{"from":2212.66,"to":2214.47,"location":2,"content":"So this means that for both he and I,"},{"from":2214.47,"to":2216.71,"location":2,"content":"we find the top two words that could come next."},{"from":2216.71,"to":2218.84,"location":2,"content":"And for each of these four possibilities,"},{"from":2218.84,"to":2221.75,"location":2,"content":"um, the school of the hypothesis is equal to,"},{"from":2221.75,"to":2225.14,"location":2,"content":"um, the log probability of this new word given the context so far,"},{"from":2225.14,"to":2226.76,"location":2,"content":"plus the score so far."},{"from":2226.76,"to":2229.34,"location":2,"content":"Because you can accumulate this sum of log probability."},{"from":2229.34,"to":2232.75,"location":2,"content":"You didn't have to compute it from scratch each time."},{"from":2232.75,"to":2236.99,"location":2,"content":"So here you can see that we have these four possibilities and that"},{"from":2236.99,"to":2241.26,"location":2,"content":"the top two schools are minus 1.6 and minus 1.7."},{"from":2241.26,"to":2243.44,"location":2,"content":"So this means that hit and was,"},{"from":2243.44,"to":2244.84,"location":2,"content":"are the two best ones."},{"from":2244.84,"to":2248.22,"location":2,"content":"So the idea is that all of these k squared equals four hypotheses."},{"from":2248.22,"to":2251.45,"location":2,"content":"We're just gonna keep the k equals to top ones."},{"from":2251.45,"to":2253.49,"location":2,"content":"And then we just keep doing the same thing."},{"from":2253.49,"to":2256.63,"location":2,"content":"For these two, we expand to get the two next ones."},{"from":2256.63,"to":2258.95,"location":2,"content":"And then of those, we compute the scores,"},{"from":2258.95,"to":2262.74,"location":2,"content":"and then we keep the two best ones and discard the others."},{"from":2262.74,"to":2264.57,"location":2,"content":"And then of those, we expand."},{"from":2264.57,"to":2266.33,"location":2,"content":"So we keep doing this again and again."},{"from":2266.33,"to":2269.68,"location":2,"content":"Expanding and then just keeping the top k and expanding,"},{"from":2269.68,"to":2272.08,"location":2,"content":"like this until, um,"},{"from":2272.08,"to":2274.7,"location":2,"content":"you get some kind of, um, finished translation."},{"from":2274.7,"to":2278.22,"location":2,"content":"I'm gonna tell you more in a moment about what exactly the stopping criterion is."},{"from":2278.22,"to":2280.23,"location":2,"content":"But let's suppose that we stop here."},{"from":2280.23,"to":2283.78,"location":2,"content":"Looking at the four hypotheses that we have on the far right,"},{"from":2283.78,"to":2285.83,"location":2,"content":"the one with the top score is, um,"},{"from":2285.83,"to":2288.91,"location":2,"content":"the top pie one with minus 4.3."},{"from":2288.91,"to":2290.84,"location":2,"content":"So let's suppose that we are going to stop now and we"},{"from":2290.84,"to":2292.61,"location":2,"content":"decide that this is the top hypothesis,"},{"from":2292.61,"to":2295.01,"location":2,"content":"then all we need to do is just 
backtrack"},{"from":2295.01,"to":2297.71,"location":2,"content":"through this tree in order to find the full translation,"},{"from":2297.71,"to":2300.43,"location":2,"content":"which is, he hit me with a pie."},{"from":2300.43,"to":2303.38,"location":2,"content":"All right. So, um, let me tell you more detail"},{"from":2303.38,"to":2305.82,"location":2,"content":"about how exactly we decide when to stop."},{"from":2305.82,"to":2308.2,"location":2,"content":"So if you remember in greedy decoding,"},{"from":2308.2,"to":2312.01,"location":2,"content":"usually we just keep decoding until the model produces the end token."},{"from":2312.01,"to":2315.71,"location":2,"content":"So for example, this means that your model is actually producing the sequence."},{"from":2315.71,"to":2317.93,"location":2,"content":"Um, I guess it doesn't produce START and you give it START."},{"from":2317.93,"to":2319.32,"location":2,"content":"But then it produces the sequence,"},{"from":2319.32,"to":2321.92,"location":2,"content":"he hit me with a pie, end."},{"from":2321.92,"to":2324.95,"location":2,"content":"So the problem in Beam search decoding is"},{"from":2324.95,"to":2327.59,"location":2,"content":"that you're considering all these different hypotheses,"},{"from":2327.59,"to":2329.45,"location":2,"content":"k different hypotheses at once."},{"from":2329.45,"to":2333.91,"location":2,"content":"And the thing is, those hypotheses might produce n tokens at different times."},{"from":2333.91,"to":2336.76,"location":2,"content":"So there's no one obvious place to stop."},{"from":2336.76,"to":2338.96,"location":2,"content":"So what we do in practice is,"},{"from":2338.96,"to":2341.16,"location":2,"content":"when a hypothesis produces the END token,"},{"from":2341.16,"to":2345.57,"location":2,"content":"then we regard this hypothesis as complete and we kind of place it aside."},{"from":2345.57,"to":2347.84,"location":2,"content":"We have a collection of competed hypotheses."},{"from":2347.84,"to":2349.64,"location":2,"content":"Um, so we kind of take it out of Beam search."},{"from":2349.64,"to":2352.22,"location":2,"content":"We no longer keep exploring it, because it's finished."},{"from":2352.22,"to":2354.45,"location":2,"content":"Um, and we, yeah, place it aside."},{"from":2354.45,"to":2358.36,"location":2,"content":"And you continue exploring other hypotheses with Beam search."},{"from":2358.36,"to":2360.02,"location":2,"content":"So the remaining question is,"},{"from":2360.02,"to":2361.76,"location":2,"content":"when do you stop doing Beam search?"},{"from":2361.76,"to":2364.19,"location":2,"content":"When do you stop iterating through this algorithm?"},{"from":2364.19,"to":2367.74,"location":2,"content":"So there's, um, multiple possible stopping criterions,"},{"from":2367.74,"to":2370.53,"location":2,"content":"but two common ones are, you might say, um,"},{"from":2370.53,"to":2373.79,"location":2,"content":"\"We're gonna stop doing Beam search once we reach time step t,"},{"from":2373.79,"to":2375.18,"location":2,"content":"where t is some, um,"},{"from":2375.18,"to":2376.79,"location":2,"content":"predefined threshold that you choose.\""},{"from":2376.79,"to":2380.27,"location":2,"content":"So you might say, um, \"We're gonna stop Beam search after 30 steps,"},{"from":2380.27,"to":2384.4,"location":2,"content":"because we don't want any output sentences that are longer than 30 words, for example."},{"from":2384.4,"to":2385.74,"location":2,"content":"Or you might say, 
um,"},{"from":2385.74,"to":2387.55,"location":2,"content":"\"We're gonna stop doing Beam search once we've"},{"from":2387.55,"to":2390.04,"location":2,"content":"collected at least n completed hypotheses.\""},{"from":2390.04,"to":2391.15,"location":2,"content":"So you might say, um,"},{"from":2391.15,"to":2397.18,"location":2,"content":"\"I want at least 10 complete translations before I stop doing Beam search.\""},{"from":2397.18,"to":2400.43,"location":2,"content":"Okay. So what's the final thing you have to do?"},{"from":2400.43,"to":2405.23,"location":2,"content":"Uh, we finished doing Beam search and we have this collection of completed hypotheses."},{"from":2405.23,"to":2407.16,"location":2,"content":"Um, we want to choose the top one."},{"from":2407.16,"to":2409.47,"location":2,"content":"Um, the one that we're going to use as our translation."},{"from":2409.47,"to":2413.99,"location":2,"content":"So, um, how do we select the top one that has the highest score?"},{"from":2413.99,"to":2416,"location":2,"content":"Um, you might think this is simple given that all of"},{"from":2416,"to":2418.52,"location":2,"content":"these hypotheses already have scores attached."},{"from":2418.52,"to":2420.07,"location":2,"content":"But if we just look at this, um,"},{"from":2420.07,"to":2424.79,"location":2,"content":"formula again, um, for what the score is of each hypothesis."},{"from":2424.79,"to":2427.79,"location":2,"content":"Um, can anyone see a problem with this?"},{"from":2427.79,"to":2430.59,"location":2,"content":"If we have our set of hypotheses and then"},{"from":2430.59,"to":2433.51,"location":2,"content":"we're choosing the top one based on the one that has the best score,"},{"from":2433.51,"to":2437.09,"location":2,"content":"can anyone see a problem? Yep?"},{"from":2437.09,"to":2438.83,"location":2,"content":"You choose the shortest one?"},{"from":2438.83,"to":2440.05,"location":2,"content":"Yes. 
So the answer was,"},{"from":2440.05,"to":2441.77,"location":2,"content":"you're gonna end up choosing the shortest one."},{"from":2441.77,"to":2446.84,"location":2,"content":"Um, the problem here is that longer hypotheses have lower scores in general."},{"from":2446.84,"to":2450.84,"location":2,"content":"Because you're, um, multiplying more probabilities and getting a smaller,"},{"from":2450.84,"to":2452.16,"location":2,"content":"a smaller overall value."},{"from":2452.16,"to":2453.83,"location":2,"content":"Or I guess if we're adding more to the t,"},{"from":2453.83,"to":2455.53,"location":2,"content":"we're gonna get more negative values."},{"from":2455.53,"to":2459.5,"location":2,"content":"So it's not quite that you'll definitely choose the shortest hypothesis,"},{"from":2459.5,"to":2463.09,"location":2,"content":"because you could overall have a lower score."},{"from":2463.09,"to":2466.19,"location":2,"content":"But there's definitely going to be a bias towards shorter translations."},{"from":2466.19,"to":2468.99,"location":2,"content":"Because they'll in general have lower scores."},{"from":2468.99,"to":2471.29,"location":2,"content":"So the way you can fix this is pretty simple,"},{"from":2471.29,"to":2472.91,"location":2,"content":"you just normalize by length."},{"from":2472.91,"to":2474.91,"location":2,"content":"So instead of using the score that we have above,"},{"from":2474.91,"to":2478.1,"location":2,"content":"you're going to use the score divided by [inaudible]"},{"from":2478.1,"to":2480.59,"location":2,"content":"and the length of the hypothesis."},{"from":2480.59,"to":2483.65,"location":2,"content":"And then you use this, just like the top one."},{"from":2483.65,"to":2493.59,"location":2,"content":"Any questions on this? Yeah."},{"from":2493.59,"to":2501.14,"location":2,"content":"Do we train with the end token, so that it's possible to [inaudible] [NOISE]."},{"from":2501.14,"to":2502.13,"location":2,"content":"[NOISE] I didn't quite hear, do we train with the end token?"},{"from":2502.13,"to":2506.36,"location":2,"content":"Yeah. Like we added a token [inaudible]."},{"from":2506.36,"to":2509.11,"location":2,"content":"Yeah. So you train with the end token, if that's your question."},{"from":2509.11,"to":2512.6,"location":2,"content":"Um, because the whole point is you're relying on your language model,"},{"from":2512.6,"to":2516.8,"location":2,"content":"your decoder to produce the end token in order to know when to stop."},{"from":2516.8,"to":2518,"location":2,"content":"So you need to train,"},{"from":2518,"to":2521.03,"location":2,"content":"it's produce the end token by giving it examples of training sentences"},{"from":2521.03,"to":2526.14,"location":2,"content":"with end tokens. Yeah."},{"from":2526.14,"to":2527.84,"location":2,"content":"Why don't we use this score,"},{"from":2527.84,"to":2529.68,"location":2,"content":"the one at the bottom of the screen during Beam search."},{"from":2529.68,"to":2531.32,"location":2,"content":"Great question. 
The question is,"},{"from":2531.32,"to":2532.93,"location":2,"content":"why don't we use this normalized score,"},{"from":2532.93,"to":2534.18,"location":2,"content":"the one at the bottom of the screen,"},{"from":2534.18,"to":2535.99,"location":2,"content":"during Beam search in the first place."},{"from":2535.99,"to":2537.84,"location":2,"content":"So the reason why that's not necessary,"},{"from":2537.84,"to":2539.48,"location":2,"content":"you could, but it's not necessary."},{"from":2539.48,"to":2541.46,"location":2,"content":"Is because during Beam search,"},{"from":2541.46,"to":2546.76,"location":2,"content":"we only ever compare the scores of hypotheses that have the same length, right?"},{"from":2546.76,"to":2548,"location":2,"content":"So on each of these steps."},{"from":2548,"to":2551.81,"location":2,"content":"So when we look at, let's say the top k squared and we want to choose which ones are"},{"from":2551.81,"to":2553.91,"location":2,"content":"the top k. We're comparing"},{"from":2553.91,"to":2556.91,"location":2,"content":"the scores of four different hypotheses that are of length one,"},{"from":2556.91,"to":2558.49,"location":2,"content":"two, three, four, five."},{"from":2558.49,"to":2562.79,"location":2,"content":"So, um, it's true that these scores are getting lower and lower."},{"from":2562.79,"to":2570.7,"location":2,"content":"But, in the same way, because they're all length five right now."},{"from":2570.7,"to":2574.21,"location":2,"content":"Okay. So, we now understand how you would train"},{"from":2574.21,"to":2575.95,"location":2,"content":"an NMT system and how would you,"},{"from":2575.95,"to":2578.83,"location":2,"content":"you would use your trained NMT system to"},{"from":2578.83,"to":2582.25,"location":2,"content":"generate your translations using let's say, Beam search."},{"from":2582.25,"to":2585.4,"location":2,"content":"So, let's, uh, take a step back and think about what are"},{"from":2585.4,"to":2588.79,"location":2,"content":"the overall advantages of NMT in comparison to SMT."},{"from":2588.79,"to":2593.65,"location":2,"content":"[NOISE] Uh, so, the first advantage is just better performance."},{"from":2593.65,"to":2598.33,"location":2,"content":"Uh, NMT systems tend to give better output than SMT systems in several ways."},{"from":2598.33,"to":2601.42,"location":2,"content":"One is that the output often tends to be more fluent."},{"from":2601.42,"to":2603.99,"location":2,"content":"Uh, this is probably because NMT, uh,"},{"from":2603.99,"to":2606.13,"location":2,"content":"this is probably because RNNs are particularly good at"},{"from":2606.13,"to":2608.65,"location":2,"content":"learning language models, as you learned last week."},{"from":2608.65,"to":2611.17,"location":2,"content":"Uh, another way that they're better is they're often use,"},{"from":2611.17,"to":2612.72,"location":2,"content":"uh, the context better."},{"from":2612.72,"to":2615.16,"location":2,"content":"That is, uh, they're better at conditioning on"},{"from":2615.16,"to":2618.58,"location":2,"content":"the source sentence and using that to change the output."},{"from":2618.58,"to":2621.7,"location":2,"content":"Another way they're better is they often, uh,"},{"from":2621.7,"to":2624.64,"location":2,"content":"are more able to generalize what they learn"},{"from":2624.64,"to":2626.45,"location":2,"content":"about phrases and how to translate them."},{"from":2626.45,"to":2629.59,"location":2,"content":"So, for example, if it sees an example of how to 
translate"},{"from":2629.59,"to":2631.66,"location":2,"content":"a certain source phrase and then later it"},{"from":2631.66,"to":2634.51,"location":2,"content":"sees a slightly different version of that source phrase,"},{"from":2634.51,"to":2638.39,"location":2,"content":"it's, uh, more able to generalize what it learned about the first phrase,"},{"from":2638.39,"to":2641.6,"location":2,"content":"than SMT system as well."},{"from":2641.6,"to":2644.6,"location":2,"content":"Another big advantage of NMT systems"},{"from":2644.6,"to":2646.23,"location":2,"content":"compared to SMT that we talked about"},{"from":2646.23,"to":2648.51,"location":2,"content":"before is that it's a single neural network"},{"from":2648.51,"to":2650.16,"location":2,"content":"that can be optimized end-to-end."},{"from":2650.16,"to":2652.66,"location":2,"content":"And the advantage here I suppose is primarily"},{"from":2652.66,"to":2655.26,"location":2,"content":"simplicity and convenience."},{"from":2655.26,"to":2659.91,"location":2,"content":"So, there's no sub-components that need to be individually optimized."},{"from":2659.91,"to":2664.69,"location":2,"content":"Another big advantage is that it requires much less human engineering efforts."},{"from":2664.69,"to":2666.64,"location":2,"content":"When I told you earlier about all the different things"},{"from":2666.64,"to":2668.64,"location":2,"content":"that people had to do to build, uh,"},{"from":2668.64,"to":2671.22,"location":2,"content":"big, uh, powerful SMT systems,"},{"from":2671.22,"to":2674.22,"location":2,"content":"uh, there's relatively less engineering effort for NMT."},{"from":2674.22,"to":2678.14,"location":2,"content":"NMT is certainly not easy but it's- it's less complicated than SMT."},{"from":2678.14,"to":2680.91,"location":2,"content":"In particular, there's no feature engineering."},{"from":2680.91,"to":2682.9,"location":2,"content":"You don't have to define what features"},{"from":2682.9,"to":2685.18,"location":2,"content":"of linguistic phenomena that you want to capture."},{"from":2685.18,"to":2688.28,"location":2,"content":"You can mostly just view it as a sequence of words although,"},{"from":2688.28,"to":2692.19,"location":2,"content":"uh, there are different views on that."},{"from":2692.19,"to":2695.59,"location":2,"content":"Uh, lastly, a great thing about NMT is that you can"},{"from":2695.59,"to":2698.66,"location":2,"content":"use pretty much the same method for all language pairs."},{"from":2698.66,"to":2700.03,"location":2,"content":"So, if you've, uh, you know,"},{"from":2700.03,"to":2701.95,"location":2,"content":"built your French to English translation system and"},{"from":2701.95,"to":2703.95,"location":2,"content":"now you want to build a Spanish to English one, uh,"},{"from":2703.95,"to":2707.03,"location":2,"content":"you can probably use basically the same architecture and the same method"},{"from":2707.03,"to":2711.54,"location":2,"content":"as long as you can go find a big enough parallel corpus of Spanish to English."},{"from":2711.54,"to":2715.72,"location":2,"content":"All right. 
So, what are the disadvantages of NMT, uh, remaining?"},{"from":2715.72,"to":2717.22,"location":2,"content":"So, compared to SMT,"},{"from":2717.22,"to":2718.75,"location":2,"content":"there are some disadvantages."},{"from":2718.75,"to":2721.63,"location":2,"content":"One is that NMT is less interpretable."},{"from":2721.63,"to":2724.66,"location":2,"content":"Uh, what I mean by this is you feed in"},{"from":2724.66,"to":2728.98,"location":2,"content":"your source sentence into the neural network and then it feeds out some target sentence,"},{"from":2728.98,"to":2732.82,"location":2,"content":"and you don't really have any way to figure out why that happened."},{"from":2732.82,"to":2736.38,"location":2,"content":"Right? So, in particular, if the target sentence contains some kind of error,"},{"from":2736.38,"to":2739.64,"location":2,"content":"um, you can't really look at the neurons and understand what happened."},{"from":2739.64,"to":2741.46,"location":2,"content":"It's pretty hard to attribute errors."},{"from":2741.46,"to":2742.87,"location":2,"content":"So, this means that, uh,"},{"from":2742.87,"to":2744.88,"location":2,"content":"NMT systems are pretty hard to debug."},{"from":2744.88,"to":2749.35,"location":2,"content":"So, by comparison, SMT systems were more interpretable,"},{"from":2749.35,"to":2752.23,"location":2,"content":"in that you had all of these different sub-components that were doing"},{"from":2752.23,"to":2755.26,"location":2,"content":"different jobs and you were more able to look at those."},{"from":2755.26,"to":2757.57,"location":2,"content":"They weren't, you know, neurons; often they would be, uh,"},{"from":2757.57,"to":2760.54,"location":2,"content":"you know, probabilities of certain words given other words and so on."},{"from":2760.54,"to":2762.73,"location":2,"content":"And, you know, that's by no means easy to"},{"from":2762.73,"to":2765.94,"location":2,"content":"interpret but it was at least more interpretable than NMT."},{"from":2765.94,"to":2770.99,"location":2,"content":"Uh, another disadvantage is NMT is pretty difficult to control."},{"from":2770.99,"to":2772.87,"location":2,"content":"So, uh, for example,"},{"from":2772.87,"to":2775.15,"location":2,"content":"if your NMT system is,"},{"from":2775.15,"to":2776.68,"location":2,"content":"uh, making a particular error,"},{"from":2776.68,"to":2779.92,"location":2,"content":"it's not very easy for you the programmer to"},{"from":2779.92,"to":2783.94,"location":2,"content":"specify some kind of rule or guideline that you want the NMT system to follow."},{"from":2783.94,"to":2785.57,"location":2,"content":"So, for example if you want to say,"},{"from":2785.57,"to":2789.03,"location":2,"content":"\"I want to always translate this word in this way,"},{"from":2789.03,"to":2791.95,"location":2,"content":"um, when- when this other thing is present.\""},{"from":2791.95,"to":2795.41,"location":2,"content":"Like that's not particularly easy to, uh,"},{"from":2795.41,"to":2798.28,"location":2,"content":"to impose as a rule on the NMT system, uh,"},{"from":2798.28,"to":2799.89,"location":2,"content":"because you can't, uh,"},{"from":2799.89,"to":2803.2,"location":2,"content":"easily control what it's doing on a step-by-step basis."},{"from":2803.2,"to":2804.76,"location":2,"content":"So, sometimes you have some kind of,"},{"from":2804.76,"to":2806.97,"location":2,"content":"uh, post-processing rules you might try to do."},{"from":2806.97,"to":2808.9,"location":2,"content":"But overall, you 
can't,"},{"from":2808.9,"to":2811.64,"location":2,"content":"it's- it's- it's harder than you'd expect to try to, um,"},{"from":2811.64,"to":2815.53,"location":2,"content":"impose a fairly simple rule."},{"from":2815.53,"to":2818.01,"location":2,"content":"[NOISE] So, this means that you have some kind of safety concerns in fact."},{"from":2818.01,"to":2819.25,"location":2,"content":"Because, uh, let's say, you know,"},{"from":2819.25,"to":2822.31,"location":2,"content":"you don't want your NMT system to say bad things, right?"},{"from":2822.31,"to":2824.41,"location":2,"content":"It's- it's pretty hard to actually put, um,"},{"from":2824.41,"to":2826.84,"location":2,"content":"these, uh, controls in"},{"from":2826.84,"to":2829.39,"location":2,"content":"place to stop it from saying these things you don't want it to say."},{"from":2829.39,"to":2832.3,"location":2,"content":"I mean, on the level of maybe just never saying particular bad words,"},{"from":2832.3,"to":2834.47,"location":2,"content":"then sure, you can remove them from the vocabulary."},{"from":2834.47,"to":2836.66,"location":2,"content":"But overall, they're pretty hard to control,"},{"from":2836.66,"to":2840.3,"location":2,"content":"and we're actually gonna see some examples of NMT systems being,"},{"from":2840.3,"to":2843.28,"location":2,"content":"you know, doing things that their designer certainly didn't intend."},{"from":2843.28,"to":2846.49,"location":2,"content":"[NOISE] Okay."},{"from":2846.49,"to":2849.88,"location":2,"content":"So, uh, how do we evaluate MT?"},{"from":2849.88,"to":2853.75,"location":2,"content":"Uh, every good NLP task needs to have an automatic metric so that we can,"},{"from":2853.75,"to":2855.05,"location":2,"content":"uh, measure our progress."},{"from":2855.05,"to":2859.42,"location":2,"content":"So, the, uh, most commonly used evaluation metric for MT is called BLEU,"},{"from":2859.42,"to":2862.9,"location":2,"content":"and that stands for Bilingual Evaluation Understudy."},{"from":2862.9,"to":2865.75,"location":2,"content":"So, the main idea is that BLEU is gonna"},{"from":2865.75,"to":2870.24,"location":2,"content":"compare the translation that's produced by your machine translation system."},{"from":2870.24,"to":2871.81,"location":2,"content":"It's going to compare that to"},{"from":2871.81,"to":2875.53,"location":2,"content":"one or maybe several human written translations of the same sentence,"},{"from":2875.53,"to":2880.34,"location":2,"content":"and then it's gonna compute a similarity score that's based on n-gram precision."},{"from":2880.34,"to":2882.19,"location":2,"content":"So, when I say n-gram precision,"},{"from":2882.19,"to":2884.41,"location":2,"content":"I mean you're gonna look at all the one, two, three,"},{"from":2884.41,"to":2886.81,"location":2,"content":"and four grams that appear in your, uh,"},{"from":2886.81,"to":2889.99,"location":2,"content":"machine written translation and your human written translation."},{"from":2889.99,"to":2892.51,"location":2,"content":"And then n-gram precision is basically saying,"},{"from":2892.51,"to":2895.55,"location":2,"content":"for all of the n-grams that appeared in the machine written translation,"},{"from":2895.55,"to":2897.58,"location":2,"content":"how many of those appeared in, you know,"},{"from":2897.58,"to":2901.11,"location":2,"content":"at least one of the human written translations?"},{"from":2901.11,"to":2905.83,"location":2,"content":"Another thing that you need to add to BLEU is a brevity 
penalty."},{"from":2905.83,"to":2908.71,"location":2,"content":"Uh, so, you're saying that you get a lower BLEU score if"},{"from":2908.71,"to":2911.05,"location":2,"content":"your system translation is significantly shorter"},{"from":2911.05,"to":2913.57,"location":2,"content":"than all of the human written translations."},{"from":2913.57,"to":2916.27,"location":2,"content":"And the reason why you need to add this is because"},{"from":2916.27,"to":2920.39,"location":2,"content":"n-gram precision alone doesn't really punish using, uh, fewer words."},{"from":2920.39,"to":2925.3,"location":2,"content":"So, you might try to maximize n-gram precision by being very conservative and writing,"},{"from":2925.3,"to":2928.47,"location":2,"content":"uh, short sentences that only contain words that you're really sure about,"},{"from":2928.47,"to":2930.07,"location":2,"content":"and then you get a good precision score."},{"from":2930.07,"to":2932.89,"location":2,"content":"But this doesn't make a good translation because you're probably missing a bunch of"},{"from":2932.89,"to":2935.78,"location":2,"content":"information that you needed to translate from the source sentence."},{"from":2935.78,"to":2938.97,"location":2,"content":"So, that's why you need to add a brevity, uh, penalty."},{"from":2938.97,"to":2943.45,"location":2,"content":"So, overall, um, BLEU is very useful because,"},{"from":2943.45,"to":2946.84,"location":2,"content":"uh, we need an automatic metric in order to, uh, measure progress."},{"from":2946.84,"to":2948.82,"location":2,"content":"You can't measure progress on human evaluation"},{"from":2948.82,"to":2951.34,"location":2,"content":"alone because it takes too long to compute."},{"from":2951.34,"to":2954.11,"location":2,"content":"Um, but of course, it's pretty imperfect."},{"from":2954.11,"to":2957.86,"location":2,"content":"So, for example, you can think about how there are many ways,"},{"from":2957.86,"to":2959.83,"location":2,"content":"many valid ways to translate a sentence."},{"from":2959.83,"to":2961.36,"location":2,"content":"At the very beginning of this lecture,"},{"from":2961.36,"to":2963.66,"location":2,"content":"I asked how do we translate that sentence, uh,"},{"from":2963.66,"to":2966.43,"location":2,"content":"by Rousseau, and there were at least a few different options that came up."},{"from":2966.43,"to":2970.41,"location":2,"content":"Uh, so, if there's many valid ways to translate a sentence,"},{"from":2970.41,"to":2972.24,"location":2,"content":"how does BLEU recognize that?"},{"from":2972.24,"to":2976.43,"location":2,"content":"BLEU is rewarding sentences that have a high n-gram overlap with,"},{"from":2976.43,"to":2979.95,"location":2,"content":"uh, one or some of the human written translations."},{"from":2979.95,"to":2983.74,"location":2,"content":"But if, uh, you write one- if your model writes one valid translation and"},{"from":2983.74,"to":2987.67,"location":2,"content":"the humans write a different valid translation and they don't have high n-gram overlap,"},{"from":2987.67,"to":2989.18,"location":2,"content":"then BLEU is going to,"},{"from":2989.18,"to":2991.2,"location":2,"content":"uh, give you a low score."},{"from":2991.2,"to":2996.09,"location":2,"content":"So, um, you're going to learn about BLEU in detail in assignment four,"},{"from":2996.09,"to":2998.77,"location":2,"content":"and in fact assignment four has a full description,"},{"from":2998.77,"to":3000.91,"location":2,"content":"mathematical description of what the BLEU score 
is."},{"from":3000.91,"to":3004.05,"location":2,"content":"So, I'm not gonna tell you about that now. Uh, yeah."},{"from":3004.05,"to":3010.8,"location":2,"content":"So, you're gonna think about BLEU and the ways in which it's imperfect but useful. Yeah."},{"from":3010.8,"to":3015.97,"location":2,"content":"So would one n-gram be a one-to-one equivalency?"},{"from":3015.97,"to":3016.99,"location":2,"content":"Sorry."},{"from":3016.99,"to":3020.09,"location":2,"content":"Wouldn't one n-gram be a one-to-one equivalency?"},{"from":3020.09,"to":3023.91,"location":2,"content":"The question is would a one n-gram be a one-to-one equivalency."},{"from":3023.91,"to":3027.01,"location":2,"content":"I'm not sure I answered the question. Are you asking about alignment or something else?"},{"from":3027.01,"to":3030.96,"location":2,"content":"Uh, just trying to get an idea of how they're doing n-gram checks."},{"from":3030.96,"to":3036.16,"location":2,"content":"Is it doing all n-gram permutations or is it doing like window size of one?"},{"from":3036.16,"to":3038.64,"location":2,"content":"Well, I guess one- one gram it doesn't"},{"from":3038.64,"to":3040.44,"location":2,"content":"make a difference because you can't permute a one gram."},{"from":3040.44,"to":3041.46,"location":2,"content":"Okay. So, you're asking, for example, for"},{"from":3041.46,"to":3043.65,"location":2,"content":"four grams are they checking, uh,"},{"from":3043.65,"to":3047.93,"location":2,"content":"whether this exact sequence of four appears, or any permutation of it, its exact sequences."},{"from":3047.93,"to":3052.32,"location":2,"content":"So, by definition, n-grams are sequences where the order matters."},{"from":3052.32,"to":3056.52,"location":2,"content":"Okay. All right."},{"from":3056.52,"to":3059.05,"location":2,"content":"So, uh, that's how you evaluate machine translation."},{"from":3059.05,"to":3061.11,"location":2,"content":"So, now you can understand this metric of how we"},{"from":3061.11,"to":3063.21,"location":2,"content":"evaluate our progress on machine translation."},{"from":3063.21,"to":3066.68,"location":2,"content":"Um, I can show you this graph and you might understand what it means."},{"from":3066.68,"to":3068.43,"location":2,"content":"So, this is a, uh,"},{"from":3068.43,"to":3074.52,"location":2,"content":"a bar plot which shows in a nutshell how NMT changed the machine translation,"},{"from":3074.52,"to":3076.76,"location":2,"content":"uh, landscape in just a few years."},{"from":3076.76,"to":3080.73,"location":2,"content":"So, in this plot we've got BLEU score is the y-axis."},{"from":3080.73,"to":3083.52,"location":2,"content":"Uh, and you have two different types of SMT,"},{"from":3083.52,"to":3086.2,"location":2,"content":"which is the red and the dark blue, uh, bar plots."},{"from":3086.2,"to":3087.82,"location":2,"content":"And what's happening is,"},{"from":3087.82,"to":3089.18,"location":2,"content":"uh, in 2015, uh,"},{"from":3089.18,"to":3094.29,"location":2,"content":"neural MT enters the scene for the first time and it isn't doi- doing as well as SMT,"},{"from":3094.29,"to":3097.39,"location":2,"content":"and then the next year it's suddenly outperforming SMT."},{"from":3097.39,"to":3100.88,"location":2,"content":"And here, these are BLEU scores on some particular fixed data set,"},{"from":3100.88,"to":3103.26,"location":2,"content":"like a shared task that many people were,"},{"from":3103.26,"to":3104.76,"location":2,"content":"uh, submitting systems 
for."},{"from":3104.76,"to":3108.03,"location":2,"content":"[NOISE]. So, the main thing to notice here is that"},{"from":3108.03,"to":3111.53,"location":2,"content":"the progress that was being made by SMT systems was, you know,"},{"from":3111.53,"to":3114.15,"location":2,"content":"a fairly gentle increase in BLEU year by year,"},{"from":3114.15,"to":3115.95,"location":2,"content":"and then in just one year,"},{"from":3115.95,"to":3119.7,"location":2,"content":"NMT arrives and is suddenly doing a much more rapid progress."},{"from":3119.7,"to":3123.45,"location":2,"content":"So, I think this justifies why the picture of the meteor maybe isn't too dramatic here."},{"from":3123.45,"to":3131.19,"location":2,"content":"[NOISE] So, you could in fact call NMT the biggest success story of NLP in deep learning."},{"from":3131.19,"to":3133.59,"location":2,"content":"Uh, because if you think about the history of this,"},{"from":3133.59,"to":3138.51,"location":2,"content":"NMT went from being a fringe research activity in 2014 to being actually"},{"from":3138.51,"to":3143.9,"location":2,"content":"the leading standard method for machine translation in the wild in 2016."},{"from":3143.9,"to":3147.53,"location":2,"content":"In particular, in 2014 the first seq2seq paper was published,"},{"from":3147.53,"to":3151.82,"location":2,"content":"and in 2016 Google Translate switches from SMT to NMT."},{"from":3151.82,"to":3155.83,"location":2,"content":"This is a pretty remarkable turn around for just two years."},{"from":3155.83,"to":3159.72,"location":2,"content":"So, this is amazing not just because it was a quick turnaround,"},{"from":3159.72,"to":3162.57,"location":2,"content":"but also if you think about the level of human effort involved."},{"from":3162.57,"to":3164.3,"location":2,"content":"Uh, these SMT systems,"},{"from":3164.3,"to":3166.84,"location":2,"content":"for example the Google Translate SMT system,"},{"from":3166.84,"to":3170.61,"location":2,"content":"was built by doubtless hundreds of engineers over many years."},{"from":3170.61,"to":3176.52,"location":2,"content":"And this, uh, this SMT system was outperformed by an NMT system that was trained by,"},{"from":3176.52,"to":3179.91,"location":2,"content":"uh, you know, relatively few, like a handful of engineers, in a few months."},{"from":3179.91,"to":3182.34,"location":2,"content":"So, I'm not- I'm not diminishing how difficult it is to,"},{"from":3182.34,"to":3183.82,"location":2,"content":"um, build NMT systems,"},{"from":3183.82,"to":3186.33,"location":2,"content":"and certainly I'm sure Google's NMT system"},{"from":3186.33,"to":3189.41,"location":2,"content":"today is built by more than a handful of engineers in a few months."},{"from":3189.41,"to":3191.07,"location":2,"content":"I'm sure it's a very big operation now."},{"from":3191.07,"to":3193.17,"location":2,"content":"Uh, but when NcMT,"},{"from":3193.17,"to":3195.01,"location":2,"content":"uh, began to outperform SMT,"},{"from":3195.01,"to":3197.86,"location":2,"content":"it was pretty remarkable how it was able to do that,"},{"from":3197.86,"to":3201.2,"location":2,"content":"uh, based on the amount of effort involved. 
Yeah."},{"from":3201.2,"to":3205.88,"location":2,"content":"Given the still the same cons of NMT,"},{"from":3205.88,"to":3210.2,"location":2,"content":"has there been research to combine the two and if there is, what does that look like?"},{"from":3210.2,"to":3214.54,"location":2,"content":"[NOISE] Yeah, great, the question is, given that we noted that there are some disadvantages of NMT,"},{"from":3214.54,"to":3216.53,"location":2,"content":"even in comparison to SMT,"},{"from":3216.53,"to":3218.43,"location":2,"content":"is there any work on combining the two?"},{"from":3218.43,"to":3219.82,"location":2,"content":"So yes, I think there is."},{"from":3219.82,"to":3223.03,"location":2,"content":"Ah, there's a lot of NMT research ongoing, and in particular,"},{"from":3223.03,"to":3226.54,"location":2,"content":"people sometimes focus on these particular shortcomings and, ah,"},{"from":3226.54,"to":3231.04,"location":2,"content":"there's a lot of work in kind of taking techniques and ideas and wisdom from"},{"from":3231.04,"to":3233.59,"location":2,"content":"the many decades of SMT research and then integrating"},{"from":3233.59,"to":3236.23,"location":2,"content":"them into the new NMT paradigm, so yes."},{"from":3236.23,"to":3243.78,"location":2,"content":"[NOISE] Okay."},{"from":3243.78,"to":3247.93,"location":2,"content":"So, is machine translation solved?"},{"from":3247.93,"to":3250.81,"location":2,"content":"Can we all go home? I think the answer is clearly, no."},{"from":3250.81,"to":3254.72,"location":2,"content":"Ah, NMT definitely is not doing machine translation perfectly."},{"from":3254.72,"to":3259.15,"location":2,"content":"So, um, just to highlight some of the difficulties that remain with NMT."},{"from":3259.15,"to":3261.89,"location":2,"content":"One, is out of vocabulary words, um,"},{"from":3261.89,"to":3264.78,"location":2,"content":"this is the kind of basic problem but it- it's- it's pretty tricky."},{"from":3264.78,"to":3267.1,"location":2,"content":"You know, what do you do if you're trying to translate"},{"from":3267.1,"to":3270.26,"location":2,"content":"a sentence that contains a word that is not in your source vocabulary?"},{"from":3270.26,"to":3273.58,"location":2,"content":"Or, what if you're trying to produce a word that's not in your target vocabulary?"},{"from":3273.58,"to":3276.3,"location":2,"content":"Um, there's certainly been lots of work on doing this,"},{"from":3276.3,"to":3280.15,"location":2,"content":"and you're going to hear later in the class how you might try to attack this with,"},{"from":3280.15,"to":3282.88,"location":2,"content":"for example, ah, sub-word modeling can make it easier."},{"from":3282.88,"to":3286.09,"location":2,"content":"Ah, but this is a rema- this is, ah, a significant problem."},{"from":3286.09,"to":3288.43,"location":2,"content":"Another one is domain mismatch."},{"from":3288.43,"to":3290.47,"location":2,"content":"So, let's suppose that you train your, uh,"},{"from":3290.47,"to":3292.81,"location":2,"content":"machine translation system on a bunch of fairly,"},{"from":3292.81,"to":3294.63,"location":2,"content":"ah, formal texts, like, let's say, ah,"},{"from":3294.63,"to":3296.47,"location":2,"content":"Wikipedia, or something like that,"},{"from":3296.47,"to":3300.04,"location":2,"content":"but then you try to deploy it to translate informal texts,"},{"from":3300.04,"to":3302.24,"location":2,"content":"like people chatting on Twitter or something."},{"from":3302.24,"to":3305.95,"location":2,"content":"Then often, 
you'll find that it doesn't perform very well on this different domain,"},{"from":3305.95,"to":3307.15,"location":2,"content":"because you've got a domain mismatch,"},{"from":3307.15,"to":3309.7,"location":2,"content":"ah, so that's quite a big problem."},{"from":3309.7,"to":3313.14,"location":2,"content":"Another one is maintaining context over longer text."},{"from":3313.14,"to":3315.79,"location":2,"content":"So, everything we've talked about so far has assumed that you were"},{"from":3315.79,"to":3318.45,"location":2,"content":"just translating a single sentence to a single sentence,"},{"from":3318.45,"to":3321.05,"location":2,"content":"and there's no other [NOISE] wider context."},{"from":3321.05,"to":3322.3,"location":2,"content":"Ah, but, you know,"},{"from":3322.3,"to":3324.73,"location":2,"content":"if you want to use a machine translation system to"},{"from":3324.73,"to":3327.79,"location":2,"content":"translate a whole news article or maybe even a book, then,"},{"from":3327.79,"to":3330.91,"location":2,"content":"ah, you're probably going to want to use the context that came in"},{"from":3330.91,"to":3336.11,"location":2,"content":"previous sentences in order to translate things correctly in the current sentence."},{"from":3336.11,"to":3338.71,"location":2,"content":"So, ah, this is an active area of research,"},{"from":3338.71,"to":3341.05,"location":2,"content":"how can you get an NMT system to condition on"},{"from":3341.05,"to":3345.84,"location":2,"content":"larger pieces of context without it becoming too expensive and so on."},{"from":3345.84,"to":3349.03,"location":2,"content":"Another difficulty is low resource language pairs."},{"from":3349.03,"to":3352.15,"location":2,"content":"Um, everything we've talked about so far has assumed that you"},{"from":3352.15,"to":3355.55,"location":2,"content":"have access to a very large parallel corpus, but what if you don't."},{"from":3355.55,"to":3357.73,"location":2,"content":"What if you're trying to translate to or from a language"},{"from":3357.73,"to":3360.07,"location":2,"content":"that has relatively little text available,"},{"from":3360.07,"to":3362.03,"location":2,"content":"um, online, for example."},{"from":3362.03,"to":3364.28,"location":2,"content":"So, that can be pretty difficult."},{"from":3364.28,"to":3367.84,"location":2,"content":"Here are a few examples of machine translation screwing up,"},{"from":3367.84,"to":3370.2,"location":2,"content":"ah, with, with specif- specific errors."},{"from":3370.2,"to":3375.85,"location":2,"content":"So, here's an example of how common sense is really difficult for NMT systems."},{"from":3375.85,"to":3378.28,"location":2,"content":"On the left, we have the English phrase, paper jam,"},{"from":3378.28,"to":3381.13,"location":2,"content":"which means when your printer gets jammed up with paper,"},{"from":3381.13,"to":3383.49,"location":2,"content":"and it's all, ah, tangled inside."},{"from":3383.49,"to":3384.91,"location":2,"content":"And then on the right,"},{"from":3384.91,"to":3387.61,"location":2,"content":"we have a very literal translation of that into Spanish,"},{"from":3387.61,"to":3389.05,"location":2,"content":"and it's essentially saying jam,"},{"from":3389.05,"to":3391.06,"location":2,"content":"edible jam made of paper,"},{"from":3391.06,"to":3393.86,"location":2,"content":"which clearly isn't the right interpretation."},{"from":3393.86,"to":3396.7,"location":2,"content":"So here, we have an NMT system that's just 
doing"},{"from":3396.7,"to":3399.78,"location":2,"content":"very literal translation and clearly doesn't have any notion of common sense."},{"from":3399.78,"to":3401.29,"location":2,"content":"You can't make jams in paper."},{"from":3401.29,"to":3404.18,"location":2,"content":"Ah, here's another example."},{"from":3404.18,"to":3406.99,"location":2,"content":"NMT can pick up biases in the training data."},{"from":3406.99,"to":3409,"location":2,"content":"We already talked about this at the,"},{"from":3409,"to":3410.98,"location":2,"content":"ah, the word embedding level,"},{"from":3410.98,"to":3412.54,"location":2,"content":"the representation of words, ah,"},{"from":3412.54,"to":3414.72,"location":2,"content":"but it can also be a problem at the,"},{"from":3414.72,"to":3416.82,"location":2,"content":"you know, the sentence level when you're translating things."},{"from":3416.82,"to":3418.81,"location":2,"content":"So, here in this example,"},{"from":3418.81,"to":3419.98,"location":2,"content":"ah, on the left,"},{"from":3419.98,"to":3423.16,"location":2,"content":"we have two sentences in Malay that roughly mean,"},{"from":3423.16,"to":3425.17,"location":2,"content":"ah, they work as a nurse,"},{"from":3425.17,"to":3426.74,"location":2,"content":"and they work as a programmer."},{"from":3426.74,"to":3428.2,"location":2,"content":"The point is, on the left there,"},{"from":3428.2,"to":3430.78,"location":2,"content":"is no information about gender in the pronouns,"},{"from":3430.78,"to":3431.93,"location":2,"content":"but when it gets, ah,"},{"from":3431.93,"to":3435.79,"location":2,"content":"translated to English, then we've suddenly got gender coming out of nowhere,"},{"from":3435.79,"to":3438.78,"location":2,"content":"she works as a nurse and he works as a programmer."},{"from":3438.78,"to":3441.1,"location":2,"content":"This is likely happening because in our training data,"},{"from":3441.1,"to":3444.74,"location":2,"content":"we had more examples of female nurses and male programmers."},{"from":3444.74,"to":3446.65,"location":2,"content":"So, you can understand why from,"},{"from":3446.65,"to":3448.03,"location":2,"content":"ah, a machine learning ah,"},{"from":3448.03,"to":3450.14,"location":2,"content":"maximizing the objective point of view,"},{"from":3450.14,"to":3452.65,"location":2,"content":"the, ah, English language model has learned to do that."},{"from":3452.65,"to":3455.17,"location":2,"content":"But the problem here is, this isn't good machine translation."},{"from":3455.17,"to":3458.02,"location":2,"content":"Ah, here, the system is making up"},{"from":3458.02,"to":3461.24,"location":2,"content":"information that was not present in the source sentence."},{"from":3461.24,"to":3463.42,"location":2,"content":"So, this is certainly an error that"},{"from":3463.42,"to":3466.45,"location":2,"content":"the machine translation shouldn't be doing because it's just simply inaccurate."},{"from":3466.45,"to":3470.24,"location":2,"content":"And even worse, it's propagating, ah, gender roles."},{"from":3470.24,"to":3473.49,"location":2,"content":"Here's another pretty weird example."},{"from":3473.49,"to":3482.31,"location":2,"content":"[LAUGHTER] What is happening here?"},{"from":3482.31,"to":3483.33,"location":2,"content":"Ah, on the left,"},{"from":3483.33,"to":3485.74,"location":2,"content":"[NOISE] we have a nonsense sentence,"},{"from":3485.74,"to":3488.51,"location":2,"content":"this is just kind of a syllable repeated,"},{"from":3488.51,"to":3490.9,"location":2,"content":"and 
we're supposedly tra- translating from Somali."},{"from":3490.9,"to":3493.74,"location":2,"content":"Ah, and then we're asking to translate this into English,"},{"from":3493.74,"to":3495.67,"location":2,"content":"and then we're getting this out of nowhere."},{"from":3495.67,"to":3499.34,"location":2,"content":"Um, as the name of the LORD was written in the Hebrew language,"},{"from":3499.34,"to":3501.36,"location":2,"content":"it was written in the language of the Hebrew nation."},{"from":3501.36,"to":3504.39,"location":2,"content":"And you might be thinking, \"Where on earth did that come from?\""},{"from":3504.39,"to":3506.99,"location":2,"content":"And, in fact, this got reported in the media as,"},{"from":3506.99,"to":3510.13,"location":2,"content":"you know, Google Translate wants to convert you to its religion or whatever."},{"from":3510.13,"to":3512.44,"location":2,"content":"[LAUGHTER]."},{"from":3512.44,"to":3513.61,"location":2,"content":"Um, so for sure,"},{"from":3513.61,"to":3514.86,"location":2,"content":"it is very startling,"},{"from":3514.86,"to":3518.53,"location":2,"content":"but the thing is there's actually quite a reasonable explanation."},{"from":3518.53,"to":3521.66,"location":2,"content":"So, what's going on here is that,"},{"from":3521.66,"to":3525.28,"location":2,"content":"um, often for low-resource languages such as,"},{"from":3525.28,"to":3532.07,"location":2,"content":"for example, Somali, one of the best resources of parallel text is the Bible."},{"from":3532.07,"to":3533.79,"location":2,"content":"So you train, for example,"},{"from":3533.79,"to":3538.26,"location":2,"content":"Somali to English using the Bible as a training text, maybe among other texts."},{"from":3538.26,"to":3540.19,"location":2,"content":"Okay, so that's the first puzzle piece,"},{"from":3540.19,"to":3543.1,"location":2,"content":"but the other- the other puzzle piece is the nonsensical input."},{"from":3543.1,"to":3547.69,"location":2,"content":"So, when the input isn't really Somali or any kind of text, right?"},{"from":3547.69,"to":3549.55,"location":2,"content":"It's just the same syllable over and over,"},{"from":3549.55,"to":3553.26,"location":2,"content":"then the NMT system doesn't really have anything sensible to condition on."},{"from":3553.26,"to":3555.25,"location":2,"content":"It's basically nonsense. It's just noise."},{"from":3555.25,"to":3557.22,"location":2,"content":"So what does the NMT system do?"},{"from":3557.22,"to":3558.67,"location":2,"content":"Right? 
It can't really use,"},{"from":3558.67,"to":3560.74,"location":2,"content":"it can't really condition on the, ah, source sentence."},{"from":3560.74,"to":3564.11,"location":2,"content":"So what it does, is it just uses the English language model, right?"},{"from":3564.11,"to":3567.1,"location":2,"content":"You can think of it as like the English language model of the decoder RNN,"},{"from":3567.1,"to":3570.49,"location":2,"content":"just kind of goes into autopilot and starts generating random text."},{"from":3570.49,"to":3573.05,"location":2,"content":"Kind of like we saw, ah, last week when we saw, ah,"},{"from":3573.05,"to":3575.02,"location":2,"content":"a language model trained on Obama speeches or"},{"from":3575.02,"to":3577.36,"location":2,"content":"Harry Potter would just generate text in that style."},{"from":3577.36,"to":3579.43,"location":2,"content":"That's kind of what's happening here with the Bible,"},{"from":3579.43,"to":3581.47,"location":2,"content":"because we don't have any useful information,"},{"from":3581.47,"to":3583.99,"location":2,"content":"um, from the sentence on the left."},{"from":3583.99,"to":3588.2,"location":2,"content":"Um, so this is an example why, ah,"},{"from":3588.2,"to":3591.39,"location":2,"content":"neural machine translation in particular makes these kinds of errors ah,"},{"from":3591.39,"to":3593.47,"location":2,"content":"because the system is uninterpretable."},{"from":3593.47,"to":3596.32,"location":2,"content":"So, you don't know that this is going to happen until it happens,"},{"from":3596.32,"to":3598.09,"location":2,"content":"and perhaps Google didn't know this was gonna"},{"from":3598.09,"to":3600.31,"location":2,"content":"happen until it happened, and it got reported."},{"from":3600.31,"to":3603.43,"location":2,"content":"Um, so this is one downside of uninterpretability,"},{"from":3603.43,"to":3605.62,"location":2,"content":"is that really weird effects can happen,"},{"from":3605.62,"to":3606.86,"location":2,"content":"and you don't see them coming,"},{"from":3606.86,"to":3609.79,"location":2,"content":"and it's not always even easy to explain why they happened. Yeah? 
[NOISE]."},{"from":3609.79,"to":3615.82,"location":2,"content":"Looks like it says Irish, [inaudible] can translate that back to Irish?"},{"from":3615.82,"to":3620.03,"location":2,"content":"Ah, the question is what happens if you did translate from Irish?"},{"from":3620.03,"to":3623.11,"location":2,"content":"I suppose that's the part where Google tries to auto-detect the language,"},{"from":3623.11,"to":3626.17,"location":2,"content":"maybe it thinks that ag, ag, ag is more like Irish than Somali [LAUGHTER]."},{"from":3626.17,"to":3629.17,"location":2,"content":"Um, I imagine if you did put Irish to English,"},{"from":3629.17,"to":3631.3,"location":2,"content":"there's probably more, ah,"},{"from":3631.3,"to":3632.8,"location":2,"content":"training data for Irish English,"},{"from":3632.8,"to":3635.38,"location":2,"content":"so maybe it wouldn't be so Bible focused."},{"from":3635.38,"to":3638.32,"location":2,"content":"Um, yeah, and there's a lot of examples of these online where"},{"from":3638.32,"to":3642.57,"location":2,"content":"you do different kinds of nonsense syllables in different languages."},{"from":3642.57,"to":3644.66,"location":2,"content":"So, there's a lot of, ah,"},{"from":3644.66,"to":3647.41,"location":2,"content":"challenges remaining in NMT and,"},{"from":3647.41,"to":3649.01,"location":2,"content":"ah, the research continues."},{"from":3649.01,"to":3653.95,"location":2,"content":"So, NMT, I think remains one of the flagship tasks for NLP Deep Learning."},{"from":3653.95,"to":3657.07,"location":2,"content":"And, in fact, NMT research has pioneered many of"},{"from":3657.07,"to":3659.8,"location":2,"content":"the successful innovations of NLP Deep Learning in general."},{"from":3659.8,"to":3662.53,"location":2,"content":"Ah, so today in 2019,"},{"from":3662.53,"to":3664.78,"location":2,"content":"ah, NMT research continues to thrive."},{"from":3664.78,"to":3666.55,"location":2,"content":"There are still many, many papers, ah,"},{"from":3666.55,"to":3669.1,"location":2,"content":"published all the time on NMT, and in fact,"},{"from":3669.1,"to":3671.56,"location":2,"content":"researchers have found lots of improvements to"},{"from":3671.56,"to":3674.35,"location":2,"content":"the fairly \"vanilla\" seq2seq models that I've shown you today."},{"from":3674.35,"to":3675.93,"location":2,"content":"Ah, but in fact,"},{"from":3675.93,"to":3677.8,"location":2,"content":"there's one improvement that is so"},{"from":3677.8,"to":3681.3,"location":2,"content":"integral to seq2seq that you could regard it as the new \"vanilla\"."},{"from":3681.3,"to":3683.43,"location":2,"content":"And that's the improvement we're going to learn about today,"},{"from":3683.43,"to":3685.7,"location":2,"content":"and it's called attention."},{"from":3685.7,"to":3690.89,"location":2,"content":"Okay. So, section three is on attention. 
What is attention?"},{"from":3690.89,"to":3694.54,"location":2,"content":"First, I'm going to motivate why we need this thing called attention."},{"from":3694.54,"to":3698.01,"location":2,"content":"So, let's look at this diagram that we saw before of sequence-to-sequence."},{"from":3698.01,"to":3700.36,"location":2,"content":"And remember, when we assumed that this, ah,"},{"from":3700.36,"to":3702.79,"location":2,"content":"encoding of the source sentence th- th- the one in"},{"from":3702.79,"to":3705.76,"location":2,"content":"the orange box is going to represent the whole sentence."},{"from":3705.76,"to":3711.87,"location":2,"content":"Ah, can anyone volunteer a problem you can see with this architecture?"},{"from":3711.87,"to":3715.24,"location":2,"content":"In particular, perhaps a problem with this idea that"},{"from":3715.24,"to":3717.7,"location":2,"content":"that single vector is the encoding of the source sentence."},{"from":3717.7,"to":3720.91,"location":2,"content":"[NOISE] Yep."},{"from":3720.91,"to":3723.1,"location":2,"content":"[inaudible] [NOISE]."},{"from":3723.1,"to":3725.14,"location":2,"content":"It doesn't- you are only looking at one word,"},{"from":3725.14,"to":3727.06,"location":2,"content":"and that word could have many meanings potentially."},{"from":3727.06,"to":3728.62,"location":2,"content":"You don't know what's going on,"},{"from":3728.62,"to":3729.67,"location":2,"content":"so you can't figure out which meaning."},{"from":3729.67,"to":3733.21,"location":2,"content":"Okay, so the answer is something like,"},{"from":3733.21,"to":3734.74,"location":2,"content":"um, you're only looking at one word."},{"from":3734.74,"to":3736.75,"location":2,"content":"You mean like the last word of the source sentence,"},{"from":3736.75,"to":3738.63,"location":2,"content":"and you're not seeing more information."},{"from":3738.63,"to":3740.35,"location":2,"content":"Yeah, som- it's, it's something like that."},{"from":3740.35,"to":3741.79,"location":2,"content":"Any other ideas? Yep."},{"from":3741.79,"to":3745.99,"location":2,"content":"You might have lost information from the beginning of the sentence by the time you get to the end."},{"from":3745.99,"to":3748.76,"location":2,"content":"Yeah. You might have lost information from the sentence"},{"from":3748.76,"to":3752.34,"location":2,"content":"by the time you get to the end, especially if it was longer than four words."},{"from":3752.34,"to":3754.38,"location":2,"content":"Right. 
I think these are different ways of saying, uh,"},{"from":3754.38,"to":3756.75,"location":2,"content":"a similar idea [NOISE],"},{"from":3756.75,"to":3759.39,"location":2,"content":"which is that we have a kind of informational bottleneck."},{"from":3759.39,"to":3763.26,"location":2,"content":"Uh, we're forcing all of the information about the source sentence to be captured"},{"from":3763.26,"to":3767.3,"location":2,"content":"in this single vector because that's the only thing that gets given to the decoder."},{"from":3767.3,"to":3769.93,"location":2,"content":"If some information about source sentence isn't in that vector,"},{"from":3769.93,"to":3773.29,"location":2,"content":"then there's no way the decoder is going to be able to translate it correctly."},{"from":3773.29,"to":3774.58,"location":2,"content":"So, this is, yeah."},{"from":3774.58,"to":3776.3,"location":2,"content":"This is an informational bottleneck."},{"from":3776.3,"to":3778.33,"location":2,"content":"It's putting, kind of, too much pressure on"},{"from":3778.33,"to":3781.84,"location":2,"content":"this single vector to be a good representation of the encoder."},{"from":3781.84,"to":3784.95,"location":2,"content":"So, this is the motivation for attention."},{"from":3784.95,"to":3789.09,"location":2,"content":"Attention is a neural technique and it provides a solution to the bottleneck problem."},{"from":3789.09,"to":3792.18,"location":2,"content":"The core idea is that on each step of the decoder,"},{"from":3792.18,"to":3794.17,"location":2,"content":"you're going to use a direct connection to"},{"from":3794.17,"to":3799.74,"location":2,"content":"the encoder to focus on a particular part of the source sequence."},{"from":3799.74,"to":3804.1,"location":2,"content":"So first, I'm going to show you what attention is via a diagram,"},{"from":3804.1,"to":3805.8,"location":2,"content":"so that's kind of an intuitive explanation,"},{"from":3805.8,"to":3808.05,"location":2,"content":"and then I'm going to show you the equations later."},{"from":3808.05,"to":3812.2,"location":2,"content":"So, here's how se- seq- sequence-to-sequence with attention works."},{"from":3812.2,"to":3814.6,"location":2,"content":"So, on the first step of our decoder,"},{"from":3814.6,"to":3817.74,"location":2,"content":"uh, we have our first decoder hidden state."},{"from":3817.74,"to":3820.98,"location":2,"content":"So, what we do is we take the dot product between"},{"from":3820.98,"to":3824.24,"location":2,"content":"that decoder hidden state and the first encoder hidden state,"},{"from":3824.24,"to":3826.86,"location":2,"content":"and then we get something called an attention score,"},{"from":3826.86,"to":3829.78,"location":2,"content":"which I'm representing by a dot, so that's a scalar."},{"from":3829.78,"to":3832.42,"location":2,"content":"And in fact, we take the dot product between the decoder"},{"from":3832.42,"to":3835.75,"location":2,"content":"hidden state and all of the encoder hidden states."},{"from":3835.75,"to":3838.06,"location":2,"content":"So, this means that we get one attention score,"},{"from":3838.06,"to":3840.2,"location":2,"content":"one scalar for each of these,"},{"from":3840.2,"to":3842.65,"location":2,"content":"uh, source words effectively."},{"from":3842.65,"to":3848.51,"location":2,"content":"So, next what we do is we take those four numbers scores and we apply the softmax,"},{"from":3848.51,"to":3851.22,"location":2,"content":"uh, distribution, the softmax function to 
them,"},{"from":3851.22,"to":3853.82,"location":2,"content":"and then we get a probability distribution."},{"from":3853.82,"to":3858.36,"location":2,"content":"So here, I'm going to represent that probability distribution as a bar chart,"},{"from":3858.36,"to":3860.93,"location":2,"content":"um, and we call this the attention distribution,"},{"from":3860.93,"to":3862.86,"location":2,"content":"and this one sums up to one."},{"from":3862.86,"to":3867.46,"location":2,"content":"So here, you can see that most of the probability mass is on the first word,"},{"from":3867.46,"to":3871,"location":2,"content":"and that kind of makes sense because our first words essentially means \"he\" and"},{"from":3871,"to":3875.05,"location":2,"content":"we're going to be producing the word \"he\" first in our target sentence."},{"from":3875.05,"to":3877.63,"location":2,"content":"So, once we've got this attention distribution,"},{"from":3877.63,"to":3883.95,"location":2,"content":"uh, we're going to use it to produce something called the attention output."},{"from":3883.95,"to":3886.78,"location":2,"content":"So, the idea here is that the attention output is"},{"from":3886.78,"to":3889.7,"location":2,"content":"a weighted sum of the encoder hidden states,"},{"from":3889.7,"to":3893.18,"location":2,"content":"and the waiting is the attention distribution."},{"from":3893.18,"to":3895.09,"location":2,"content":"So, I've got these dotted arrows that go from"},{"from":3895.09,"to":3897.05,"location":2,"content":"the attention distribution to the attention output."},{"from":3897.05,"to":3899.89,"location":2,"content":"Probably there should be dotted arrows also from the encoder RNN,"},{"from":3899.89,"to":3900.91,"location":2,"content":"but that's hard to depict."},{"from":3900.91,"to":3905.05,"location":2,"content":"[NOISE] But the idea is that you're summing up these encoder RNN, uh,"},{"from":3905.05,"to":3907.15,"location":2,"content":"hidden states, [NOISE] but you're going to weight each"},{"from":3907.15,"to":3910.2,"location":2,"content":"one according to how much attention distribution it has on it."},{"from":3910.2,"to":3914.29,"location":2,"content":"So, this means that your attention output which is a single vector is going to be"},{"from":3914.29,"to":3918.03,"location":2,"content":"mostly containing information from the hidden states that had high attention."},{"from":3918.03,"to":3924.87,"location":2,"content":"In this case, it's going to be mostly information from the first hidden state."},{"from":3924.87,"to":3927.05,"location":2,"content":"So, after you do this,"},{"from":3927.05,"to":3931.06,"location":2,"content":"you're going to use the attention output to influence your prediction of the next word."},{"from":3931.06,"to":3933.19,"location":2,"content":"So, what you usually do is you concatenate"},{"from":3933.19,"to":3935.49,"location":2,"content":"the attention output with your decoder hidden states,"},{"from":3935.49,"to":3938.26,"location":2,"content":"and then, uh, use that kind of concatenated pair"},{"from":3938.26,"to":3941.59,"location":2,"content":"in the way you would have used the decoder hidden state alone before."},{"from":3941.59,"to":3944.39,"location":2,"content":"So, that way you can get your probability distribution"},{"from":3944.39,"to":3946.86,"location":2,"content":"y hat one of what's coming next."},{"from":3946.86,"to":3949.84,"location":2,"content":"So, as before, we can use that to sample your next word."},{"from":3949.84,"to":3952.03,"location":2,"content":"[NOISE] 
So, on the next step,"},{"from":3952.03,"to":3953.39,"location":2,"content":"you just do the same thing again."},{"from":3953.39,"to":3955.75,"location":2,"content":"You've got your second decoder hidden state, again,"},{"from":3955.75,"to":3958.14,"location":2,"content":"you take dot product with all of the encoder hidden states,"},{"from":3958.14,"to":3960.89,"location":2,"content":"you take softmax layer without getting attention distribution,"},{"from":3960.89,"to":3963.34,"location":2,"content":"and here you can see the attention distribution is different."},{"from":3963.34,"to":3967.33,"location":2,"content":"We're putting more attention on, uh, the word entarte"},{"from":3967.33,"to":3969.19,"location":2,"content":"because we're about to produce the word hits."},{"from":3969.19,"to":3971.88,"location":2,"content":"Uh, but we're also attending a little bit to the second one,"},{"from":3971.88,"to":3974.95,"location":2,"content":"ah, because that's telling us that hit is past tense."},{"from":3974.95,"to":3979.15,"location":2,"content":"So, a cool thing that's happening here is we're getting a soft alignment."},{"from":3979.15,"to":3983.08,"location":2,"content":"If you remember when we looked at alignment in SMT systems, it was mostly this,"},{"from":3983.08,"to":3984.58,"location":2,"content":"uh, hard binary thing,"},{"from":3984.58,"to":3987.25,"location":2,"content":"it was on or off, either these words are aligned or they're not."},{"from":3987.25,"to":3991.14,"location":2,"content":"Here, you have a much more flexible soft notion of alignment,"},{"from":3991.14,"to":3992.77,"location":2,"content":"where, uh, each word, kind of,"},{"from":3992.77,"to":3996.79,"location":2,"content":"has a distribution over the corresponding words in the source sentence."},{"from":3996.79,"to":3998.5,"location":2,"content":"So, another thing to note,"},{"from":3998.5,"to":4001.08,"location":2,"content":"kind of a side note, is that sometimes, uh,"},{"from":4001.08,"to":4004.56,"location":2,"content":"we take the attention output from the previous hidden state, uh,"},{"from":4004.56,"to":4009.16,"location":2,"content":"and we kind of feed it into the decoder again along with the usual word."},{"from":4009.16,"to":4011.13,"location":2,"content":"So, that would mean you take the attention output from"},{"from":4011.13,"to":4014.22,"location":2,"content":"the first step and kind of concatenate it to the word vector for \"he\","},{"from":4014.22,"to":4016.08,"location":2,"content":"and then use it in the decoder."},{"from":4016.08,"to":4019.17,"location":2,"content":"The reason for this is sometimes is useful to have this, uh,"},{"from":4019.17,"to":4023.41,"location":2,"content":"information from the- the attention on the previous step, on the next step."},{"from":4023.41,"to":4025.47,"location":2,"content":"So, I'm telling you this because this is something we do in"},{"from":4025.47,"to":4027.39,"location":2,"content":"assignment four and it's a fairly common technique,"},{"from":4027.39,"to":4029.8,"location":2,"content":"but also sometimes people don't do it."},{"from":4029.8,"to":4034.83,"location":2,"content":"Okay. 
So, um, the idea is that you just do this attention,"},{"from":4034.83,"to":4036.91,"location":2,"content":"uh, computation on every step,"},{"from":4036.91,"to":4039.48,"location":2,"content":"and on each step you're going to be attending to different things."},{"from":4039.48,"to":4040.62,"location":2,"content":"So, in our example,"},{"from":4040.62,"to":4043.11,"location":2,"content":"on this third step we look at m-apostrophe,"},{"from":4043.11,"to":4045.18,"location":2,"content":"which means me, when we produce me,"},{"from":4045.18,"to":4047.55,"location":2,"content":"and then on the last three we're probably mostly just going to be looking at"},{"from":4047.55,"to":4050.12,"location":2,"content":"this fertile word entarte,"},{"from":4050.12,"to":4051.21,"location":2,"content":"to produce hit me with a pie."},{"from":4051.21,"to":4054.24,"location":2,"content":"[NOISE] I'm going to keep going because we don't have a lot of time."},{"from":4054.24,"to":4057.48,"location":2,"content":"Uh, so, here are the equations to describe attention."},{"from":4057.48,"to":4060.06,"location":2,"content":"I think it's probably easier to look at these in your own time later,"},{"from":4060.06,"to":4061.7,"location":2,"content":"rather than look at them in the lecture now,"},{"from":4061.7,"to":4063.51,"location":2,"content":"but these are the equations that essentially"},{"from":4063.51,"to":4065.98,"location":2,"content":"say the same thing as what the diagram just said."},{"from":4065.98,"to":4069.76,"location":2,"content":"So, you have your encoder hidden states h_1 up to h_N,"},{"from":4069.76,"to":4072.45,"location":2,"content":"and then on timestep t of the decoder,"},{"from":4072.45,"to":4075.51,"location":2,"content":"we also have a decoder hidden state, s_t."},{"from":4075.51,"to":4078.95,"location":2,"content":"So, you're going to get the attention scores, which we are going to call e_t,"},{"from":4078.95,"to":4080.85,"location":2,"content":"by taking the dot product of"},{"from":4080.85,"to":4083.37,"location":2,"content":"your decoder hidden state with each of the encoder hidden states."},{"from":4083.37,"to":4085.14,"location":2,"content":"[NOISE] And that gives you, uh,"},{"from":4085.14,"to":4086.94,"location":2,"content":"a vector of the same length as"},{"from":4086.94,"to":4092.03,"location":2,"content":"the encoder sentence because you've got one score per source word."},{"from":4092.03,"to":4095.34,"location":2,"content":"Next, you take softmax over these scores to"},{"from":4095.34,"to":4097.94,"location":2,"content":"get the attention distribution that sums up to one,"},{"from":4097.94,"to":4100.27,"location":2,"content":"and we call that Alpha."},{"from":4100.27,"to":4105.05,"location":2,"content":"And then you use Alpha to take a weighted sum of the encoder hidden states,"},{"from":4105.05,"to":4107.19,"location":2,"content":"and that gives you your attention outputs."},{"from":4107.19,"to":4110.13,"location":2,"content":"So, the attention output which we call a is a vector that's"},{"from":4110.13,"to":4114.3,"location":2,"content":"the same size as your encoder hidden states."},{"from":4114.3,"to":4118.08,"location":2,"content":"Lastly, you take your attention output a,"},{"from":4118.08,"to":4119.43,"location":2,"content":"and then you, uh,"},{"from":4119.43,"to":4121.59,"location":2,"content":"concatenate it with your decoder hidden state,"},{"from":4121.59,"to":4127.22,"location":2,"content":"and then proceed with that as you were taught before in the no-attention 
model."},{"from":4127.22,"to":4130.41,"location":2,"content":"So, attention, if it's not clear, it's pretty cool."},{"from":4130.41,"to":4132.16,"location":2,"content":"It has a number of advantages."},{"from":4132.16,"to":4136.59,"location":2,"content":"So, one advantage is that attention just significantly improves NMT performance."},{"from":4136.59,"to":4140.49,"location":2,"content":"And the main reason why it improves it is because it turns out it's super useful to allow"},{"from":4140.49,"to":4144.87,"location":2,"content":"the decoder to focus on certain parts of the source sentence when it's translating."},{"from":4144.87,"to":4146.61,"location":2,"content":"And you can see why this makes sense, right?"},{"from":4146.61,"to":4148.41,"location":2,"content":"Because there's a very natural notion of alignment,"},{"from":4148.41,"to":4151.29,"location":2,"content":"and if you can focus on the specific word or words that you're translating,"},{"from":4151.29,"to":4153.23,"location":2,"content":"you can probably do a better job."},{"from":4153.23,"to":4157.08,"location":2,"content":"Another reason why attention is cool is that it solves the bottleneck problem."},{"from":4157.08,"to":4160.89,"location":2,"content":"Uh, we were noting that the problem with having a single vector that has to represent"},{"from":4160.89,"to":4163.11,"location":2,"content":"the entire source sentence and that's the only way"},{"from":4163.11,"to":4165.45,"location":2,"content":"information can pass from encoder to decoder,"},{"from":4165.45,"to":4167.72,"location":2,"content":"means that if that encoding isn't very good,"},{"from":4167.72,"to":4169.24,"location":2,"content":"then you're not going to do well."},{"from":4169.24,"to":4171.81,"location":2,"content":"So, by contrast in- with attention,"},{"from":4171.81,"to":4174.33,"location":2,"content":"the decoder can look directly at the encoder and"},{"from":4174.33,"to":4179.21,"location":2,"content":"the source sentence and translate without the bottleneck."},{"from":4179.21,"to":4183.59,"location":2,"content":"Another great thing about attention is that it helps with the vanishing gradient problem,"},{"from":4183.59,"to":4186.14,"location":2,"content":"especially if your sentences are quite long."},{"from":4186.14,"to":4188.7,"location":2,"content":"The reason why attention helps is because you have"},{"from":4188.7,"to":4192.05,"location":2,"content":"these direct connections between the decoder and the encoder,"},{"from":4192.05,"to":4193.81,"location":2,"content":"kind of, over many time steps,"},{"from":4193.81,"to":4195.47,"location":2,"content":"so it's like a shortcut connection."},{"from":4195.47,"to":4197.25,"location":2,"content":"And just as we learned last time about, ah,"},{"from":4197.25,"to":4200.9,"location":2,"content":"skip connections being [NOISE] useful for reducing vanishing gradients,"},{"from":4200.9,"to":4202.23,"location":2,"content":"here, it's the same notion."},{"from":4202.23,"to":4203.94,"location":2,"content":"We have these, ah, long distance"},{"from":4203.94,"to":4207.26,"location":2,"content":"[NOISE] direct connections that help the gradients flow better."},{"from":4207.26,"to":4210.87,"location":2,"content":"Another great thing about attention is it provides some interpretability."},{"from":4210.87,"to":4215.52,"location":2,"content":"Ah, if you look at the attention distribution after you've produced your translation,"},{"from":4215.52,"to":4219.03,"location":2,"content":"ah, you can see what the decoder was focusing 
on at each step."},{"from":4219.03,"to":4220.65,"location":2,"content":"So for example, if we run our system,"},{"from":4220.65,"to":4222.78,"location":2,"content":"and we translate our running example here,"},{"from":4222.78,"to":4225.15,"location":2,"content":"then we can produce a plot kind of like this,"},{"from":4225.15,"to":4227.13,"location":2,"content":"that shows the attention distribution."},{"from":4227.13,"to":4229.32,"location":2,"content":"So here, dark means high attention,"},{"from":4229.32,"to":4230.9,"location":2,"content":"and white means low attention."},{"from":4230.9,"to":4232.95,"location":2,"content":"So you might see something like this where,"},{"from":4232.95,"to":4236.2,"location":2,"content":"um, it was, it was focusing on the different words at different steps."},{"from":4236.2,"to":4238.8,"location":2,"content":"And this is basically the same kind of"},{"from":4238.8,"to":4241.38,"location":2,"content":"plot that we had earlier with the hard notion of alignment,"},{"from":4241.38,"to":4243.9,"location":2,"content":"ah, in SMT, except that we are,"},{"from":4243.9,"to":4247.03,"location":2,"content":"we have more flexibility to have a more soft version of alignment."},{"from":4247.03,"to":4250.68,"location":2,"content":"Like for example, ah, when we produce the English word hit,"},{"from":4250.68,"to":4252.54,"location":2,"content":"perhaps we were mostly looking at entarte,"},{"from":4252.54,"to":4255.55,"location":2,"content":"but we're also looking a little bit at a."},{"from":4255.55,"to":4258.65,"location":2,"content":"So, this, ah, means that we're getting,"},{"from":4258.65,"to":4259.98,"location":2,"content":"ah, alignment for free."},{"from":4259.98,"to":4261.64,"location":2,"content":"And the reason I say for free,"},{"from":4261.64,"to":4264.06,"location":2,"content":"is because when you remember the SMT systems,"},{"from":4264.06,"to":4265.86,"location":2,"content":"the whole point there is that you had to learn"},{"from":4265.86,"to":4269.22,"location":2,"content":"an alignment system deliberately and separately."},{"from":4269.22,"to":4271.05,"location":2,"content":"You had to define the notion of alignment."},{"from":4271.05,"to":4272.73,"location":2,"content":"You had to define the model of calculating"},{"from":4272.73,"to":4275.52,"location":2,"content":"what the probability of different alignments were, and train it."},{"from":4275.52,"to":4279.7,"location":2,"content":"Whereas here, we never told the NMT system about alignments."},{"from":4279.7,"to":4281.74,"location":2,"content":"We never explicitly trained an alignment system."},{"from":4281.74,"to":4284.9,"location":2,"content":"We never had a loss function that tells you how good your alignment was."},{"from":4284.9,"to":4288.24,"location":2,"content":"We just gave the NMT system the apparatus to"},{"from":4288.24,"to":4291.72,"location":2,"content":"do something like alignment and told it to maximize the,"},{"from":4291.72,"to":4295.03,"location":2,"content":"ah, the cross entropy loss for doing machine translation,"},{"from":4295.03,"to":4298.22,"location":2,"content":"and then the network just learned alignment by itself."},{"from":4298.22,"to":4300.56,"location":2,"content":"I think this is the coolest thing about attention,"},{"from":4300.56,"to":4305.93,"location":2,"content":"is that it's learned some structure in a somewhat unsupervised way."},{"from":4305.93,"to":4308.18,"location":2,"content":"Okay. 
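For reference, the equations spoken in words above can be written out as follows, using the lecture's own notation (encoder hidden states h_1, ..., h_N; decoder hidden state s_t on step t). This is just a symbolic restatement of the dot-product variant described above, not an additional result.

```latex
% Attention on decoder step t (dot-product variant, as described in the lecture):
e^t = [\, s_t^\top h_1,\ \dots,\ s_t^\top h_N \,] \in \mathbb{R}^N
  % attention scores: one dot product per source word
\alpha^t = \mathrm{softmax}(e^t) \in \mathbb{R}^N
  % attention distribution: sums to one
a_t = \sum_{i=1}^{N} \alpha^t_i \, h_i \in \mathbb{R}^h
  % attention output: weighted sum of the encoder hidden states
[\, a_t \,;\, s_t \,] \in \mathbb{R}^{2h}
  % concatenated with the decoder state and used as s_t was used before
```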
So in the last few minutes,"},{"from":4308.18,"to":4310.99,"location":2,"content":"I'm just going to, ah, generalize the notion of attention,"},{"from":4310.99,"to":4313.95,"location":2,"content":"because it turns out that attention is actually a very general, ah,"},{"from":4313.95,"to":4317.98,"location":2,"content":"deep learning technique that you can apply in lots of different circumstances."},{"from":4317.98,"to":4320.25,"location":2,"content":"So, you've seen that attention is a great way to improve"},{"from":4320.25,"to":4322.55,"location":2,"content":"the sequence-to-sequence model for MT,"},{"from":4322.55,"to":4325.38,"location":2,"content":"but you can actually use attention for other architectures that aren't"},{"from":4325.38,"to":4328.88,"location":2,"content":"seq2seq and also tasks that aren't MT."},{"from":4328.88,"to":4330.65,"location":2,"content":"So, to understand this,"},{"from":4330.65,"to":4334.74,"location":2,"content":"I'm going to somewhat redefine attention to a more general definition."},{"from":4334.74,"to":4336.89,"location":2,"content":"So here's our more general definition."},{"from":4336.89,"to":4339.52,"location":2,"content":"Suppose you have a set of values,"},{"from":4339.52,"to":4340.85,"location":2,"content":"each of which is a vector,"},{"from":4340.85,"to":4342.48,"location":2,"content":"and you also have a single vector,"},{"from":4342.48,"to":4344.02,"location":2,"content":"which we're calling the query."},{"from":4344.02,"to":4346.11,"location":2,"content":"Then attention is a way, ah,"},{"from":4346.11,"to":4349.03,"location":2,"content":"to compute a weighted sum of the values,"},{"from":4349.03,"to":4354.99,"location":2,"content":"but the way you weight it is dependent on the query [NOISE]."},{"from":4354.99,"to":4356.56,"location":2,"content":"So, we often phrased this, ah,"},{"from":4356.56,"to":4359.31,"location":2,"content":"as saying that the query is attending to the values,"},{"from":4359.31,"to":4361.94,"location":2,"content":"the idea being that you have all this information, that's in the values,"},{"from":4361.94,"to":4366.47,"location":2,"content":"and the query is somehow determining how it's gonna pay attention to the values."},{"from":4366.47,"to":4369,"location":2,"content":"So for example, in seq2seq, ah,"},{"from":4369,"to":4372.09,"location":2,"content":"the decoder hidden state is the query,"},{"from":4372.09,"to":4374.28,"location":2,"content":"the decoder hidden state on a particular time step is"},{"from":4374.28,"to":4377.7,"location":2,"content":"the query and is attending to all the encoder hidden states,"},{"from":4377.7,"to":4379.73,"location":2,"content":"which are the values."},{"from":4379.73,"to":4382.15,"location":2,"content":"All right, here's that definition again."},{"from":4382.15,"to":4386.11,"location":2,"content":"So, here's a way to kind of understand this intuitively, two alternative ways."},{"from":4386.11,"to":4388.17,"location":2,"content":"One is to think of it like this,"},{"from":4388.17,"to":4390.96,"location":2,"content":"you could think of it as the weighted sum is like"},{"from":4390.96,"to":4394.9,"location":2,"content":"a selective summary of the information in the values."},{"from":4394.9,"to":4396.3,"location":2,"content":"And I say selective,"},{"from":4396.3,"to":4399.15,"location":2,"content":"because your choice of how much you choose to draw from"},{"from":4399.15,"to":4401.98,"location":2,"content":"each value depends on the attention 
distribution."},{"from":4401.98,"to":4405.78,"location":2,"content":"Ah, so the distribution, ah, depends on the queries."},{"from":4405.78,"to":4409.44,"location":2,"content":"The query is determining how much you're gonna select from different, ah,"},{"from":4409.44,"to":4414.23,"location":2,"content":"values, and this is kind of similar to LSTMs that we learned about earlier this week."},{"from":4414.23,"to":4416.97,"location":2,"content":"LSTMs were all based on the idea of a gate that, ah,"},{"from":4416.97,"to":4419.77,"location":2,"content":"[NOISE] that defines how much information sho-"},{"from":4419.77,"to":4421.48,"location":2,"content":"should [NOISE] come from different elements,"},{"from":4421.48,"to":4424.03,"location":2,"content":"and the gate depends on the context."},{"from":4424.03,"to":4427.35,"location":2,"content":"So, the strength of LSTMs came from the idea that based on the context,"},{"from":4427.35,"to":4429.82,"location":2,"content":"you decide where you're going to draw information from,"},{"from":4429.82,"to":4432.1,"location":2,"content":"and this is kind of like the same idea."},{"from":4432.1,"to":4436.74,"location":2,"content":"The second way to think about attention is you could say that it's a way to obtain"},{"from":4436.74,"to":4441.07,"location":2,"content":"a fixed-size representation from an arbitrary set of representations."},{"from":4441.07,"to":4442.6,"location":2,"content":"So when I say arbitrary sets,"},{"from":4442.6,"to":4445.55,"location":2,"content":"I'm saying that you have this set of vectors called the values, right?"},{"from":4445.55,"to":4448.05,"location":2,"content":"And you could have ten values or you could have 100 values."},{"from":4448.05,"to":4450.94,"location":2,"content":"You could have, ah, [NOISE] any arbitrary number of these vectors."},{"from":4450.94,"to":4455.01,"location":2,"content":"But attention gives you a way to get a single vector,"},{"from":4455.01,"to":4457.06,"location":2,"content":"um, summary of that,"},{"from":4457.06,"to":4458.25,"location":2,"content":"which is the attention output,"},{"from":4458.25,"to":4461.06,"location":2,"content":"ah, using your query."},{"from":4461.06,"to":4463.69,"location":2,"content":"Okay. 
Ah, so the last thing, ah,"},{"from":4463.69,"to":4467.04,"location":2,"content":"is that there's actually several variants of attention,"},{"from":4467.04,"to":4470.25,"location":2,"content":"and, ah, this is something that you're going to look at a little in assignment four."},{"from":4470.25,"to":4472.59,"location":2,"content":"So, in our more general setting,"},{"from":4472.59,"to":4474.93,"location":2,"content":"we've seen that we have some values in the query."},{"from":4474.93,"to":4477.21,"location":2,"content":"Doing attention always involves computing"},{"from":4477.21,"to":4482.54,"location":2,"content":"the attention scores and then you apply Softmax to get the attention distribution,"},{"from":4482.54,"to":4486.53,"location":2,"content":"and then you use that attention distribution to take a weighted sum."},{"from":4486.53,"to":4489.89,"location":2,"content":"So this is, ah, always the outline of how attention works."},{"from":4489.89,"to":4492.6,"location":2,"content":"The part that can be different is this, ah, number one."},{"from":4492.6,"to":4496.19,"location":2,"content":"There are multiple ways you can compute the scores."},{"from":4496.19,"to":4499.8,"location":2,"content":"So, ah, last slides,"},{"from":4499.8,"to":4502.68,"location":2,"content":"here are the different ways you can compute the scores."},{"from":4502.68,"to":4505.02,"location":2,"content":"So the first one, which we've already seen today,"},{"from":4505.02,"to":4507.28,"location":2,"content":"is basic dot-product attention."},{"from":4507.28,"to":4511.52,"location":2,"content":"And the idea here, is that the score for a particulu- a particular value,"},{"from":4511.52,"to":4516.16,"location":2,"content":"h_i, is just the dot-product of the query and that particular value."},{"from":4516.16,"to":4517.89,"location":2,"content":"And, ah, in particular,"},{"from":4517.89,"to":4519.48,"location":2,"content":"this assumes that the size of"},{"from":4519.48,"to":4522.3,"location":2,"content":"your query vector and the size of your value vectors has to be the same,"},{"from":4522.3,"to":4524.94,"location":2,"content":"because you're taking dot-product [NOISE]."},{"from":4524.94,"to":4527.52,"location":2,"content":"Another, ah, version of ah,"},{"from":4527.52,"to":4530.34,"location":2,"content":"attention is called multiplicative attention."},{"from":4530.34,"to":4532.83,"location":2,"content":"And here, the idea is that the score of your ah,"},{"from":4532.83,"to":4539.28,"location":2,"content":"value h_i is going to be this bi-linear function of your query and that value."},{"from":4539.28,"to":4541.8,"location":2,"content":"So, in particular, we're putting this weight matrix in the middle,"},{"from":4541.8,"to":4543.35,"location":2,"content":"and that's a learnable parameter."},{"from":4543.35,"to":4547.29,"location":2,"content":"You're learning the best way matric- ma- weight matrix in order to get the scores,"},{"from":4547.29,"to":4549.56,"location":2,"content":"the attention scores that are useful."},{"from":4549.56,"to":4552.49,"location":2,"content":"The last one is called additive attention."},{"from":4552.49,"to":4553.99,"location":2,"content":"So what's happening here,"},{"from":4553.99,"to":4558.68,"location":2,"content":"is that the score of the value h_i is ah."},{"from":4558.68,"to":4561.09,"location":2,"content":"You get it by applying"},{"from":4561.09,"to":4566.06,"location":2,"content":"a linear transformation to both the value and the query and then you add them 
together,"},{"from":4566.06,"to":4568.14,"location":2,"content":"and then you put them through a non-linearity like tan"},{"from":4568.14,"to":4570.99,"location":2,"content":"h. And then lastly, you take that vector,"},{"from":4570.99,"to":4572.93,"location":2,"content":"and you take the dot-product with"},{"from":4572.93,"to":4577.31,"location":2,"content":"a weight vector to give you a single number that is the score."},{"from":4577.31,"to":4581.58,"location":2,"content":"So here, you've got two different weight matrices and also a weight vector,"},{"from":4581.58,"to":4583.88,"location":2,"content":"which are the learnable parameters."},{"from":4583.88,"to":4585.93,"location":2,"content":"One thing that's different here,"},{"from":4585.93,"to":4588.44,"location":2,"content":"is that there's kind of an additional hyperparameter,"},{"from":4588.44,"to":4590.65,"location":2,"content":"which is the attention dimensionality."},{"from":4590.65,"to":4592.89,"location":2,"content":"So that's kind of, ah,"},{"from":4592.89,"to":4597.36,"location":2,"content":"the- I think it's the heights of the W-1 and W-2 and it's the length of V, right?"},{"from":4597.36,"to":4600.06,"location":2,"content":"You can choose what size that dimension is."},{"from":4600.06,"to":4602.88,"location":2,"content":"It's kind of like a hidden layer in the computation."},{"from":4602.88,"to":4607.65,"location":2,"content":"So, um, you can decide how big you want that intermediate representation to be."},{"from":4607.65,"to":4610.11,"location":2,"content":"Okay. So I'm not going to tell you anymore about that because that's"},{"from":4610.11,"to":4612.15,"location":2,"content":"actually one of the questions in the assignment, ah,"},{"from":4612.15,"to":4613.5,"location":2,"content":"assignment four is to think about"},{"from":4613.5,"to":4617.3,"location":2,"content":"the relative advantages and disadvantages of these models."},{"from":4617.3,"to":4619.55,"location":2,"content":"Okay. So here's a summary of today."},{"from":4619.55,"to":4621.12,"location":2,"content":"It really is the last slide."},{"from":4621.12,"to":4623.43,"location":2,"content":"Second last, last time, but this was the last slide."},{"from":4623.43,"to":4627,"location":2,"content":"So, we learned about the history of MT [NOISE]."},{"from":4627,"to":4628.57,"location":2,"content":"We learned about how in 2014,"},{"from":4628.57,"to":4632.13,"location":2,"content":"neural MT revolutionized MT [NOISE]."},{"from":4632.13,"to":4635.34,"location":2,"content":"We learned about how sequence-to-sequence is the right architecture for NMT,"},{"from":4635.34,"to":4637.77,"location":2,"content":"and it uses two RNNs, and lastly,"},{"from":4637.77,"to":4639.87,"location":2,"content":"we learned about how attention is a way to focus on"},{"from":4639.87,"to":4643.36,"location":2,"content":"particular parts of the input. All right, thanks."}]} \ No newline at end of file diff --git a/bcc-en/9.bcc b/bcc-en/9.bcc new file mode 100644 index 0000000000000000000000000000000000000000..08c9bb59c42a399cbe45dfc3223cd1998746e030 --- /dev/null +++ b/bcc-en/9.bcc @@ -0,0 +1 @@ +{"font_size":0.4,"font_color":"#FFFFFF","background_alpha":0.5,"background_color":"#9C27B0","Stroke":"none","body":[{"from":0,"to":10.65,"location":2,"content":"[NOISE] Okay everyone, let's get started for today."},{"from":10.65,"to":15.35,"location":2,"content":"Okay. 
So, we're into week five of CS224n."},{"from":15.35,"to":18.45,"location":2,"content":"And so, this is the plan for today."},{"from":18.45,"to":22.08,"location":2,"content":"Um, in some sense a lot of this class is gonna be"},{"from":22.08,"to":26.95,"location":2,"content":"an easy class because I'm gonna talk about things like,"},{"from":26.95,"to":30.78,"location":2,"content":"um, final projects and tips for what you're meant to do,"},{"from":30.78,"to":32.1,"location":2,"content":"and finding a topic,"},{"from":32.1,"to":33.51,"location":2,"content":"and writing up your work,"},{"from":33.51,"to":34.68,"location":2,"content":"and things like that."},{"from":34.68,"to":36.24,"location":2,"content":"Um, so for, um, so,"},{"from":36.24,"to":39.13,"location":2,"content":"two-thirds of the class there isn't a lot of,"},{"from":39.13,"to":41.01,"location":2,"content":"um, deep technical content."},{"from":41.01,"to":42.11,"location":2,"content":"But I hope they're actually"},{"from":42.11,"to":46.61,"location":2,"content":"just some useful stuff and stuff that would be good to know about."},{"from":46.61,"to":49.82,"location":2,"content":"One way you can think about this is until,"},{"from":49.82,"to":53.09,"location":2,"content":"until this year we had a midterm in this class."},{"from":53.09,"to":56.54,"location":2,"content":"So, you know, if we weren't doing this class should instead be doing the"},{"from":56.54,"to":60.74,"location":2,"content":"the mid-term based on all the material that we've covered, um, so far."},{"from":60.74,"to":63.38,"location":2,"content":"So, this should be really pleasant by comparison."},{"from":63.38,"to":66.65,"location":2,"content":"Um, but that isn't gonna be quite the entire class."},{"from":66.65,"to":69.61,"location":2,"content":"So, for this piece here in the middle I'm gonna"},{"from":69.61,"to":73.91,"location":2,"content":"spend a while back on some of the topics of last week."},{"from":73.91,"to":79.42,"location":2,"content":"So, I wanted to have one more look at some of these gated recurrent models,"},{"from":79.42,"to":81.91,"location":2,"content":"um, that Abby introduced last week."},{"from":81.91,"to":84.38,"location":2,"content":"And I guess my hope is that now that you've"},{"from":84.38,"to":86.78,"location":2,"content":"had a bit more time to look and read about things,"},{"from":86.78,"to":91.67,"location":2,"content":"and hopefully even have started working on homework for that."},{"from":91.67,"to":96.8,"location":2,"content":"Maybe it starts to make a bit more sense or else even if it's more confusing then before,"},{"from":96.8,"to":100.1,"location":2,"content":"you've got some idea of what your confusions are and questions."},{"from":100.1,"to":101.84,"location":2,"content":"And so, hopefully it's, um,"},{"from":101.84,"to":107.72,"location":2,"content":"good to think about those one more time because I think they are quite a complex notion,"},{"from":107.72,"to":111.86,"location":2,"content":"and it's not so obvious what they're doing and why they're doing anything useful,"},{"from":111.86,"to":115.08,"location":2,"content":"or whether they're just this big complex blob of mystery."},{"from":115.08,"to":119.21,"location":2,"content":"And then also to touch on a couple of machine translation topics that have um, come up"},{"from":119.21,"to":123.3,"location":2,"content":"in the final project that we didn't really get m- time to say much about last week."},{"from":123.3,"to":124.62,"location":2,"content":"[NOISE] 
Okay."},{"from":124.62,"to":126.48,"location":2,"content":"So, let's get started."},{"from":126.48,"to":132.51,"location":2,"content":"Um, so, this is our coursework in grading that we showed at the beginning."},{"from":132.51,"to":137,"location":2,"content":"And so, the main thing I wanna do today is talk about this final project."},{"from":137,"to":138.62,"location":2,"content":"Um, but before tha- I do that,"},{"from":138.62,"to":142.15,"location":2,"content":"let's just save one minute on participation."},{"from":142.15,"to":147.35,"location":2,"content":"Um, so, I guess we started into one aspect of the participation policy, um,"},{"from":147.35,"to":149.78,"location":2,"content":"last Thursday when we took attendance,"},{"from":149.78,"to":151.73,"location":2,"content":"and that makes it sound draconian,"},{"from":151.73,"to":153.22,"location":2,"content":"but I wanted to say, um,"},{"from":153.22,"to":154.82,"location":2,"content":"the positive viewpoint of,"},{"from":154.82,"to":156.97,"location":2,"content":"um, the participation points."},{"from":156.97,"to":158.84,"location":2,"content":"I mean, obviously this is a big class."},{"from":158.84,"to":160.64,"location":2,"content":"There are lots of people."},{"from":160.64,"to":164.09,"location":2,"content":"Um, our hope is just that people will variously,"},{"from":164.09,"to":167.48,"location":2,"content":"they're sort of engaged and involved in the class,"},{"from":167.48,"to":169.58,"location":2,"content":"and the participation points,"},{"from":169.58,"to":171.34,"location":2,"content":"ah, are our way of doing that."},{"from":171.34,"to":173.94,"location":2,"content":"I mean, basically the way this is set up."},{"from":173.94,"to":177,"location":2,"content":"I mean, if you do much of anything"},{"from":177,"to":180.23,"location":2,"content":"you should just get three percent for the participation points."},{"from":180.23,"to":181.61,"location":2,"content":"It shouldn't be hard."},{"from":181.61,"to":185.73,"location":2,"content":"I mean, I will bet you that there will be some people who at the end,"},{"from":185.73,"to":189,"location":2,"content":"will have gotten seven points in the participation category."},{"from":189,"to":191.42,"location":2,"content":"And unfortunately we cap you, we'll only give you"},{"from":191.42,"to":194.45,"location":2,"content":"three percent for the participation category, but you know,"},{"from":194.45,"to":197.45,"location":2,"content":"providing you usually come to class,"},{"from":197.45,"to":199.4,"location":2,"content":"or usually write the,"},{"from":199.4,"to":202.16,"location":2,"content":"um, what we've got to [NOISE] the invited speakers"},{"from":202.16,"to":205.22,"location":2,"content":"the reaction paragraphs if you are an SCPD student."},{"from":205.22,"to":209.21,"location":2,"content":"Sometimes, um, write a helpful answer on Piazza, right."},{"from":209.21,"to":211.9,"location":2,"content":"You're already gonna be there on three percent."},{"from":211.9,"to":213.45,"location":2,"content":"Um, yeah."},{"from":213.45,"to":216.06,"location":2,"content":"And so, one, but one other thing, um,"},{"from":216.06,"to":219.91,"location":2,"content":"that's a way to get some parti- participation points that's out today."},{"from":219.91,"to":224.12,"location":2,"content":"So, um, today we're putting up our Mid-quarter feedback survey."},{"from":224.12,"to":226.28,"location":2,"content":"And we'd love to have you fill that in."},{"from":226.28,"to":229.56,"location":2,"content":"I mean, 
we'd like to get your thoughts on the course so far."},{"from":229.56,"to":231.72,"location":2,"content":"And, you know, for you guys,"},{"from":231.72,"to":233.37,"location":2,"content":"there are two ways that you can win."},{"from":233.37,"to":237.5,"location":2,"content":"First if you give us some feedback that can help the rest of your quarter be better,"},{"from":237.5,"to":240.86,"location":2,"content":"but we've also got a simple bribe built into this, um,"},{"from":240.86,"to":244.64,"location":2,"content":"which is you get half a participation point simply for filling in,"},{"from":244.64,"to":246.89,"location":2,"content":"um, the, um, Mid-quarter survey,"},{"from":246.89,"to":249.26,"location":2,"content":"but it'd be really good to get your feedback on that."},{"from":249.26,"to":251.78,"location":2,"content":"Okay. So, then the main thing I want to get to"},{"from":251.78,"to":255.93,"location":2,"content":"today is to talk about [NOISE] the final project."},{"from":255.93,"to":259.41,"location":2,"content":"Okay. And so, I'll jump right ahead, um, into that."},{"from":259.41,"to":263.24,"location":2,"content":"So, for the final project there are two choices."},{"from":263.24,"to":266.6,"location":2,"content":"Um, you, you can either do our default final project,"},{"from":266.6,"to":270.56,"location":2,"content":"which I'll say a little bit about, it's doing SQuAD question answering,"},{"from":270.56,"to":272.68,"location":2,"content":"or you can propose a final,"},{"from":272.68,"to":274.31,"location":2,"content":"a custom final project,"},{"from":274.31,"to":276.11,"location":2,"content":"which we then have to approve."},{"from":276.11,"to":277.83,"location":2,"content":"And in the course of that,"},{"from":277.83,"to":280.91,"location":2,"content":"um, if you have some outside mentor, um,"},{"from":280.91,"to":283.84,"location":2,"content":"you can say who they are and your project proposal,"},{"from":283.84,"to":289.15,"location":2,"content":"but otherwise, um, we'll attempt to assign you a mentor somewhere out of the course staff."},{"from":289.15,"to":291.3,"location":2,"content":"Um, so, for all the assignments,"},{"from":291.3,"to":293.21,"location":2,"content":"through assignments one through five,"},{"from":293.21,"to":295.5,"location":2,"content":"you have to do them by yourself."},{"from":295.5,"to":299.13,"location":2,"content":"Um, for the final project in either form of that,"},{"from":299.13,"to":300.9,"location":2,"content":"you can do it as a team."},{"from":300.9,"to":302.21,"location":2,"content":"So, you can do it as one,"},{"from":302.21,"to":304.31,"location":2,"content":"two, or three people."},{"from":304.31,"to":306.51,"location":2,"content":"And how does that work?"},{"from":306.51,"to":310.35,"location":2,"content":"Um, well, it works like this, um,"},{"from":310.35,"to":312.41,"location":2,"content":"if you're a bigger team,"},{"from":312.41,"to":314.57,"location":2,"content":"we do expect you to do more,"},{"from":314.57,"to":317.82,"location":2,"content":"and there are actually two ways you can be a bigger team that I'll point out."},{"from":317.82,"to":320.9,"location":2,"content":"One way is having more people being two or three people."},{"from":320.9,"to":323.75,"location":2,"content":"And the other thing that comes up is, um,"},{"from":323.75,"to":327.97,"location":2,"content":"sometimes people wanna do a final project for more than one class at the same time."},{"from":327.97,"to":330.05,"location":2,"content":"In particular for this 
quarter I know there are"},{"from":330.05,"to":332.48,"location":2,"content":"at least a couple of people who are hoping to do,"},{"from":332.48,"to":337.06,"location":2,"content":"um, a joint project with Emma's reinforcement learning class."},{"from":337.06,"to":338.68,"location":2,"content":"And we allow that as well."},{"from":338.68,"to":343.49,"location":2,"content":"But we sort of do multiplication because if you're two people using it for two classes,"},{"from":343.49,"to":346.91,"location":2,"content":"that means it should be four times as great as"},{"from":346.91,"to":350.39,"location":2,"content":"what one person is doing for one class, right?"},{"from":350.39,"to":354.47,"location":2,"content":"So, how, how it works with larger teams, you know,"},{"from":354.47,"to":359.51,"location":2,"content":"in all honesty it's a little bit subtle because, you know,"},{"from":359.51,"to":362.98,"location":2,"content":"the truth is if something is just bad, um,"},{"from":362.98,"to":365.5,"location":2,"content":"your model was broken, um,"},{"from":365.5,"to":368.54,"location":2,"content":"or you, your experiment failed,"},{"from":368.54,"to":370.39,"location":2,"content":"um, and you don't know why."},{"from":370.39,"to":376.04,"location":2,"content":"Um, you know, if, if there are just obvious ways in which what you've done is bad,"},{"from":376.04,"to":379.32,"location":2,"content":"it's sort of bad whether you're one person or four people."},{"from":379.32,"to":381.86,"location":2,"content":"Um, and if you've written it up beautifully,"},{"from":381.86,"to":383.84,"location":2,"content":"you've written up beautifully regardless of whether"},{"from":383.84,"to":386.24,"location":2,"content":"you're one person or four per- people,"},{"from":386.24,"to":392,"location":2,"content":"but, you know, nevertheless the expectation is that if you're one person we'll be pleased"},{"from":392,"to":395.96,"location":2,"content":"if you've put together one model and gotten it to work well, um,"},{"from":395.96,"to":398.79,"location":2,"content":"but if you're three people we'll say, \"Well,"},{"from":398.79,"to":400.9,"location":2,"content":"that wasn't such a big effort, um,"},{"from":400.9,"to":403.62,"location":2,"content":"running this one model against this task.\""},{"from":403.62,"to":405.31,"location":2,"content":"Surely if there are three people,"},{"from":405.31,"to":406.7,"location":2,"content":"they could have investigated"},{"from":406.7,"to":411.51,"location":2,"content":"some other model classes and seen whether they perform better or worse on this task."},{"from":411.51,"to":413.45,"location":2,"content":"And we'll feel it's a bit lightweight."},{"from":413.45,"to":418.18,"location":2,"content":"So, we are expecting that sort of both more ambitious projects,"},{"from":418.18,"to":421.19,"location":2,"content":"and more thorough exploration of them if you're"},{"from":421.19,"to":424.74,"location":2,"content":"being a bigger team or you're using it for multiple classes."},{"from":424.74,"to":426.41,"location":2,"content":"Um, for the final project,"},{"from":426.41,"to":429.55,"location":2,"content":"you are allowed to use any language or deep learning,"},{"from":429.55,"to":431.96,"location":2,"content":"um, framework that you choose to."},{"from":431.96,"to":433.91,"location":2,"content":"We don't insist on what you use,"},{"from":433.91,"to":436.02,"location":2,"content":"though in practice, in past years,"},{"from":436.02,"to":438.73,"location":2,"content":"basically 
everyone keeps on using what they've learned,"},{"from":438.73,"to":439.88,"location":2,"content":"um, in the assignments."},{"from":439.88,"to":441.69,"location":2,"content":"I expect that will be true, um,"},{"from":441.69,"to":444.03,"location":2,"content":"this time as well. [NOISE]"},{"from":444.03,"to":449.88,"location":2,"content":"Okay. So, um, let me just mention quickly the default final project,"},{"from":449.88,"to":451.32,"location":2,"content":"so that you've got, um,"},{"from":451.32,"to":453.12,"location":2,"content":"some sense of context."},{"from":453.12,"to":456.38,"location":2,"content":"So, the materials of that will be released this Thursday."},{"from":456.38,"to":458.46,"location":2,"content":"And so, the task for it is,"},{"from":458.46,"to":462.09,"location":2,"content":"a textual question-answering task which is done over the,"},{"from":462.09,"to":465.24,"location":2,"content":"the Stanford Question Answering Dataset, SQuAD,"},{"from":465.24,"to":467.48,"location":2,"content":"which was a dataset put together, um,"},{"from":467.48,"to":471.87,"location":2,"content":"by Percy Liang in the department and his students."},{"from":471.87,"to":475.38,"location":2,"content":"Um, so, we've used this as a default final project,"},{"from":475.38,"to":478.68,"location":2,"content":"um, before but we're mixing up a couple of things this year."},{"from":478.68,"to":483.84,"location":2,"content":"I mean, firstly, the starter code we're providing this year is in PyTorch,"},{"from":483.84,"to":486.46,"location":2,"content":"to fit in with what we've done in the rest of the class."},{"from":486.46,"to":489.76,"location":2,"content":"But secondly, the SQuAD team,"},{"from":489.76,"to":491.7,"location":2,"content":"released a new version of SQuAD,"},{"from":491.7,"to":495.84,"location":2,"content":"SQuAD 2.0 and we're going to use that for the class this year."},{"from":495.84,"to":498.63,"location":2,"content":"And the essential difference in SQuAD 2.0,"},{"from":498.63,"to":501.98,"location":2,"content":"is that in SQuAD 1.1 or 1.0,"},{"from":501.98,"to":508.06,"location":2,"content":"every question had an answer in the passage of text whereas in SQuAD 2.0,"},{"from":508.06,"to":510.21,"location":2,"content":"a lot of questions don't have answers."},{"from":510.21,"to":514.77,"location":2,"content":"So, there's this extra significant thing that you need to do which is working out,"},{"from":514.77,"to":516.96,"location":2,"content":"um, whether a question has an answer."},{"from":516.96,"to":519.51,"location":2,"content":"So, th- this is just one example,"},{"from":519.51,"to":523.43,"location":2,"content":"um, which just gives you a sense of the SQuAD, what SQuAD is like."},{"from":523.43,"to":525.68,"location":2,"content":"So, there's a paragraph of text."},{"from":525.68,"to":528.68,"location":2,"content":"I've just put a subset of it here, um, Bill Aken,"},{"from":528.68,"to":532.46,"location":2,"content":"adopted by Mexican movie actress, Lupe Mayorga, um,"},{"from":532.46,"to":535.29,"location":2,"content":"grew up in the neighborhood town, neighboring, sorry,"},{"from":535.29,"to":537.99,"location":2,"content":"neighboring town of Madeira and his song chronicled"},{"from":537.99,"to":541.65,"location":2,"content":"the hardships faced by the migrant farm workers he saw as a child."},{"from":541.65,"to":544.03,"location":2,"content":"Right, there's then a question, um,"},{"from":544.03,"to":545.76,"location":2,"content":"in what town did 
Bill,"},{"from":545.76,"to":547.65,"location":2,"content":"right, actually I misspelled that sorry,"},{"from":547.65,"to":553.23,"location":2,"content":"it should have been Aken without an I. I got confused with our former department chair,"},{"from":553.23,"to":555.32,"location":2,"content":"Alex Aiken, I guess when I was typing."},{"from":555.32,"to":557.17,"location":2,"content":"Um, Bill Aken grow up?"},{"from":557.17,"to":559.92,"location":2,"content":"And the answer you are meant to give is Madeira."},{"from":559.92,"to":562.32,"location":2,"content":"Um, so, just incidentally,"},{"from":562.32,"to":564.18,"location":2,"content":"it's a random fact."},{"from":564.18,"to":568.5,"location":2,"content":"Um, so, quite a few of you know about something that was"},{"from":568.5,"to":570.45,"location":2,"content":"recently in the kind of tech news, tech"},{"from":570.45,"to":573.28,"location":2,"content":"news and we're going to talk about later in the class."},{"from":573.28,"to":574.86,"location":2,"content":"Um, that people, um,"},{"from":574.86,"to":579.01,"location":2,"content":"from Google produced this very strong New Natural Language"},{"from":579.01,"to":582.09,"location":2,"content":"Understanding representation model called BERT."},{"from":582.09,"to":586.7,"location":2,"content":"And which is one of several kind of models that are in a class of,"},{"from":586.7,"to":592.65,"location":2,"content":"models that contextually model words that have come into prominence in 2017 and 18."},{"from":592.65,"to":598.77,"location":2,"content":"And in general, BERT has sort of produced very good performance for very many tasks."},{"from":598.77,"to":603.9,"location":2,"content":"Indeed, if you look at the SQuAD 2.0 leader board online, um,"},{"from":603.9,"to":606.98,"location":2,"content":"at this URL, what you'll find is that"},{"from":606.98,"to":611.63,"location":2,"content":"all of the leading systems use BERT in some way or another, these days."},{"from":611.63,"to":614.91,"location":2,"content":"Um, but nevertheless, this was actually a question that BERT got wrong."},{"from":614.91,"to":616.29,"location":2,"content":"Um, that BERT said,"},{"from":616.29,"to":618,"location":2,"content":"\"No answer to this question,"},{"from":618,"to":619.68,"location":2,"content":"\" rather than getting the correct answer."},{"from":619.68,"to":623.07,"location":2,"content":"Even though it looks kind of straightforward reading it as a human being."},{"from":623.07,"to":627.32,"location":2,"content":"It doesn't really look a human tricky reading comprehension question."},{"from":627.32,"to":630.39,"location":2,"content":"Um, so, that's the default final project."},{"from":630.39,"to":635.36,"location":2,"content":"So, on Thursday, I'm going to talk more about the default final project."},{"from":635.36,"to":639.25,"location":2,"content":"I'm going to talk about how people build textual question answering systems."},{"from":639.25,"to":643.74,"location":2,"content":"And the details on the default final project should all be posted by then,"},{"from":643.74,"to":647.22,"location":2,"content":"but that's just to give you a bit of context of what the other choice is."},{"from":647.22,"to":651.16,"location":2,"content":"And today, I'm sort of more going to be aiming at people,"},{"from":651.16,"to":654.18,"location":2,"content":"um, doing the custom final project."},{"from":654.18,"to":658.59,"location":2,"content":"But let me just sort of say a bit first about the choice between the two of 
them."},{"from":658.59,"to":662.94,"location":2,"content":"So, um, why might you want to choose the default final project?"},{"from":662.94,"to":667.32,"location":2,"content":"So, if you have limited experience with research,"},{"from":667.32,"to":672.18,"location":2,"content":"you don't have any clear idea of a research project you want to do this quarter,"},{"from":672.18,"to":674.85,"location":2,"content":"you're just really busy with other classes that, uh,"},{"from":674.85,"to":677.7,"location":2,"content":"you're enrolled in CS140 and you're just really loade- loaded"},{"from":677.7,"to":681.2,"location":2,"content":"[LAUGHTER] now with other classes you're doing this quarter."},{"from":681.2,"to":685.89,"location":2,"content":"Um, you'd be happy to have just a clear goal towards, to work towards."},{"from":685.89,"to":689.55,"location":2,"content":"A leaderboard of your fellow students that you can compete against."},{"from":689.55,"to":691.89,"location":2,"content":"Um, do the default final project."},{"from":691.89,"to":696.51,"location":2,"content":"Um, I think for many people it's actually the good right choice."},{"from":696.51,"to":698.67,"location":2,"content":"And I mean, for what it's worth, I mean,"},{"from":698.67,"to":703.16,"location":2,"content":"typically, slightly over half of people have done the default final project."},{"from":703.16,"to":705.17,"location":2,"content":"It's normally that, so 55 percent have done"},{"from":705.17,"to":708.68,"location":2,"content":"the default final project and the rest the custom final project."},{"from":708.68,"to":711.14,"location":2,"content":"So, if you do the default final project,"},{"from":711.14,"to":712.72,"location":2,"content":"you'll get lots of guidance."},{"from":712.72,"to":714.54,"location":2,"content":"You get lots of scaffolding."},{"from":714.54,"to":718.36,"location":2,"content":"There are clear things to aim at in what you do."},{"from":718.36,"to":724.01,"location":2,"content":"Um, the course staff are in general most prepared and most able to help you."},{"from":724.01,"to":725.99,"location":2,"content":"Um, and in particular,"},{"from":725.99,"to":729,"location":2,"content":"I mean, the, for the bottom bullet here."},{"from":729,"to":731.51,"location":2,"content":"I mean, you know, something to think about in making"},{"from":731.51,"to":736.04,"location":2,"content":"the choices that some of it comes down to how committed,"},{"from":736.04,"to":741.32,"location":2,"content":"organized, and keen are you to be wanting to do your own custom final project."},{"from":741.32,"to":744.89,"location":2,"content":"If you've got a, something you really want to do for a custom final project, great."},{"from":744.89,"to":748.27,"location":2,"content":"We love to see interesting custom final projects."},{"from":748.27,"to":752.76,"location":2,"content":"But, you know, if you're going to end up doing something that just looks"},{"from":752.76,"to":759.15,"location":2,"content":"worse like [LAUGHTER] not done as well [LAUGHTER] as you would've done a, done a project."},{"from":759.15,"to":762.09,"location":2,"content":"If you'd just done the fin-, default final project,"},{"from":762.09,"to":765.09,"location":2,"content":"then you should probably choose the default final project [LAUGHTER]."},{"from":765.09,"to":767.18,"location":2,"content":"Um, okay."},{"from":767.18,"to":768.78,"location":2,"content":"But even if you are doing,"},{"from":768.78,"to":771.13,"location":2,"content":"think you'll do the default final 
project."},{"from":771.13,"to":774.62,"location":2,"content":"I hope that some of this lecture will still, um, be useful."},{"from":774.62,"to":776.66,"location":2,"content":"While the part in the middle, when I talk back about"},{"from":776.66,"to":779.52,"location":2,"content":"MT and Gater or current networks are definitely useful."},{"from":779.52,"to":781.67,"location":2,"content":"But, you know, beyond that, um,"},{"from":781.67,"to":785.35,"location":2,"content":"some of the tips on doing research and discussions of,"},{"from":785.35,"to":790.23,"location":2,"content":"sort of looking at how to make neural networks work and error analysis, paper writing."},{"from":790.23,"to":794.72,"location":2,"content":"These are all good topics that apply to the default final project as well."},{"from":794.72,"to":796.77,"location":2,"content":"So, in the other direction, um,"},{"from":796.77,"to":799.68,"location":2,"content":"if you have some research project that you're excited about."},{"from":799.68,"to":802.59,"location":2,"content":"Possibly, it's one you are already working on or possibly,"},{"from":802.59,"to":804.62,"location":2,"content":"that you've just always wished to do."},{"from":804.62,"to":807.69,"location":2,"content":"Something exciting with neural networks and rap music."},{"from":807.69,"to":812.34,"location":2,"content":"Um, well, you know, that custom final project is an opportunity to do that."},{"from":812.34,"to":815.55,"location":2,"content":"Um, so, it's a chance for you to do something on your own."},{"from":815.55,"to":817.98,"location":2,"content":"Um, it, you know, obviously,"},{"from":817.98,"to":820.2,"location":2,"content":"if you're not interested in textural question-answering"},{"from":820.2,"to":822.15,"location":2,"content":"but do you think you might like machine translation."},{"from":822.15,"to":823.74,"location":2,"content":"Well, it's an opportunity, um,"},{"from":823.74,"to":825.76,"location":2,"content":"to choose any topic of your own."},{"from":825.76,"to":832.59,"location":2,"content":"It's also a way to sort of experience much more of the research pro- process because,"},{"from":832.59,"to":835.29,"location":2,"content":"you know, for the default final project, it's a bigger,"},{"from":835.29,"to":838.54,"location":2,"content":"more open-ended thing than any of our assignments."},{"from":838.54,"to":839.89,"location":2,"content":"But, you know, nevertheless,"},{"from":839.89,"to":841.8,"location":2,"content":"the default final project is still"},{"from":841.8,"to":845.79,"location":2,"content":"sort of a pre-setup thing that you don't have to find your own problem,"},{"from":845.79,"to":847.18,"location":2,"content":"find your own data,"},{"from":847.18,"to":848.94,"location":2,"content":"work out a good approach to it."},{"from":848.94,"to":850.98,"location":2,"content":"A lot of that's sort of been done for you."},{"from":850.98,"to":854.5,"location":2,"content":"So, that, for a custom final project it's much more"},{"from":854.5,"to":858.9,"location":2,"content":"your own job to sort of define and execute a mini research project."},{"from":858.9,"to":862.44,"location":2,"content":"And so, if all of that stuff seems appealing or some of it seems appealing,"},{"from":862.44,"to":864.98,"location":2,"content":"um, then aim at the custom final project."},{"from":864.98,"to":870.04,"location":2,"content":"Um, doing this just reminded me about a fact about assignments one to five."},{"from":870.04,"to":872.31,"location":2,"content":"You know, for 
assignments one to five,"},{"from":872.31,"to":876.09,"location":2,"content":"we are hoping that they can be a set of stepping"},{"from":876.09,"to":879.88,"location":2,"content":"stones for learning how to build deep learning systems."},{"from":879.88,"to":887.31,"location":2,"content":"But, you know, one of our goals in that is to give you fewer handholds as time goes by."},{"from":887.31,"to":891.65,"location":2,"content":"So, you know, assignment one was really easy and assignment three,"},{"from":891.65,"to":893.88,"location":2,"content":"we tried to make it really handholdy,"},{"from":893.88,"to":896.7,"location":2,"content":"so people could start to learn PyTorch."},{"from":896.7,"to":899.28,"location":2,"content":"But, you know, we're actually hoping for assignments"},{"from":899.28,"to":902.28,"location":2,"content":"four and five that they're actually harder,"},{"from":902.28,"to":904.85,"location":2,"content":"so that you're getting more experience of working"},{"from":904.85,"to":907.43,"location":2,"content":"out how to build and do things by yourself"},{"from":907.43,"to":912.83,"location":2,"content":"because if the only thing you ever see is completely scaffolded assignments,"},{"from":912.83,"to":917.3,"location":2,"content":"it's sort of like when you do CS106A: you do a great job on"},{"from":917.3,"to":921.98,"location":2,"content":"the CS106A assignments but you don't really know how to write a program by yourself."},{"from":921.98,"to":923.55,"location":2,"content":"And that's sort of what we want to, um,"},{"from":923.55,"to":925.31,"location":2,"content":"sort of get you beyond,"},{"from":925.31,"to":927.05,"location":2,"content":"um, in the latter two assignments."},{"from":927.05,"to":929.86,"location":2,"content":"So, I hope you have started on assignment four."},{"from":929.86,"to":934.88,"location":2,"content":"If not, you really should start and get underway soon, as Abby was emphasizing."},{"from":934.88,"to":937.58,"location":2,"content":"Okay. 
So, this year for the,"},{"from":937.58,"to":940.93,"location":2,"content":"um, final project, whichever one you're doing,"},{"from":940.93,"to":943.77,"location":2,"content":"um, we're actually putting more structure in than we have"},{"from":943.77,"to":946.73,"location":2,"content":"in previous years to encourage people to get going."},{"from":946.73,"to":948.03,"location":2,"content":"And so, in particular,"},{"from":948.03,"to":952.19,"location":2,"content":"there are early on components which are worth points in the grading."},{"from":952.19,"to":955.5,"location":2,"content":"So, the first part of that is a project proposal,"},{"from":955.5,"to":957.41,"location":2,"content":"um, which is, um,"},{"from":957.41,"to":959.02,"location":2,"content":"something we want from each team."},{"from":959.02,"to":960.91,"location":2,"content":"So, one per team, um,"},{"from":960.91,"to":962.67,"location":2,"content":"you can just do a joint one,"},{"from":962.67,"to":964.62,"location":2,"content":"um, which is worth five percent."},{"from":964.62,"to":968.3,"location":2,"content":"Um, so, it's, we're releasing the details on Thursday which is when"},{"from":968.3,"to":972.65,"location":2,"content":"assignment four is due and it'll be due the following Thursday."},{"from":972.65,"to":976.43,"location":2,"content":"So, we're actually having an interruption in the sequence of current assignments, right."},{"from":976.43,"to":979.25,"location":2,"content":"So, for the next week, um,"},{"from":979.25,"to":982.8,"location":2,"content":"the thing to do is the project proposal."},{"from":982.8,"to":984.77,"location":2,"content":"And then the week after that, um,"},{"from":984.77,"to":989.08,"location":2,"content":"we're back to assignment five and then we go full time into final project."},{"from":989.08,"to":990.59,"location":2,"content":"So, what we're wanting for"},{"from":990.59,"to":994.25,"location":2,"content":"the project proposal is we're actually wanting you to do a little bit"},{"from":994.25,"to":999.47,"location":2,"content":"of starting-off research, in the form of reading some paper."},{"from":999.47,"to":1001.75,"location":2,"content":"So, find some paper that's, um,"},{"from":1001.75,"to":1003.52,"location":2,"content":"relevant to your research,"},{"from":1003.52,"to":1005.54,"location":2,"content":"um, that you are going to do."},{"from":1005.54,"to":1009.22,"location":2,"content":"Um, read it, write a summary of what it does."},{"from":1009.22,"to":1014.27,"location":2,"content":"Um, write down some thoughts on how you could adapt or extend ideas in it,"},{"from":1014.27,"to":1016.45,"location":2,"content":"in your own final project."},{"from":1016.45,"to":1019.81,"location":2,"content":"Um, and then say something about what your plan is for"},{"from":1019.81,"to":1023.08,"location":2,"content":"what you're goi- hoping to do for your final project."},{"from":1023.08,"to":1025.66,"location":2,"content":"And especially, if you're doing a custom final project"},{"from":1025.66,"to":1028.21,"location":2,"content":"there's more to write there because we'll want to make"},{"from":1028.21,"to":1030.44,"location":2,"content":"sure that you have some idea as to"},{"from":1030.44,"to":1033.36,"location":2,"content":"what data you can use and how you are going to evaluate it."},{"from":1033.36,"to":1036.13,"location":2,"content":"Whereas a couple of those things are actually sort of"},{"from":1036.13,"to":1040.43,"location":2,"content":"determined for you if you're doing the default final 
project."},{"from":1040.43,"to":1045.54,"location":2,"content":"Um, and so then after that we're going to have a project milestone, um,"},{"from":1045.54,"to":1048.39,"location":2,"content":"which is the progress report where we're hoping that you can"},{"from":1048.39,"to":1051.3,"location":2,"content":"report that you're well along in your final project."},{"from":1051.3,"to":1053.85,"location":2,"content":"That you've run at least some experiment and have"},{"from":1053.85,"to":1057.08,"location":2,"content":"some results on some data that you can talk about."},{"from":1057.08,"to":1059.82,"location":2,"content":"So the default- the project milestone is due on,"},{"from":1059.82,"to":1061.79,"location":2,"content":"um, Thursday, March seven."},{"from":1061.79,"to":1065.01,"location":2,"content":"So it's actually more than halfway through"},{"from":1065.01,"to":1068.13,"location":2,"content":"the period that's sort of dedicated to the final project."},{"from":1068.13,"to":1071.22,"location":2,"content":"So, if you are not- we sort of put it past"},{"from":1071.22,"to":1074.97,"location":2,"content":"halfway because the fact of the matter is it always takes people time to get going,"},{"from":1074.97,"to":1076.63,"location":2,"content":"um, but nevertheless, you know,"},{"from":1076.63,"to":1079.35,"location":2,"content":"what you should have in your head is unless you're halfway"},{"from":1079.35,"to":1082.44,"location":2,"content":"through by the time you're handing in your,"},{"from":1082.44,"to":1086.04,"location":2,"content":"um, project milestone, then you're definitely behind."},{"from":1086.04,"to":1089.91,"location":2,"content":"And you'll be doing that typical Stanford thing of having a lot of late nights"},{"from":1089.91,"to":1094.76,"location":2,"content":"and lack of sleep in the last week [LAUGHTER] of class trying to catch up for that."},{"from":1094.76,"to":1097.48,"location":2,"content":"Um, okay. 
So, um,"},{"from":1097.48,"to":1099.02,"location":2,"content":"so now I've sort of, um,"},{"from":1099.02,"to":1102.9,"location":2,"content":"want to sort of just start saying a bit of- for"},{"from":1102.9,"to":1105.27,"location":2,"content":"custom final projects of some of the sort of"},{"from":1105.27,"to":1108.19,"location":2,"content":"thinking and types of things that you could do about that."},{"from":1108.19,"to":1111.57,"location":2,"content":"Um, so you have to determine some project,"},{"from":1111.57,"to":1115.14,"location":2,"content":"um, for- if you're doing a custom final project."},{"from":1115.14,"to":1117.33,"location":2,"content":"So, in philosophy of science, you know,"},{"from":1117.33,"to":1120.81,"location":2,"content":"there are basically two ways for any field you can have a project."},{"from":1120.81,"to":1124.52,"location":2,"content":"You either start with some domain problem of interest."},{"from":1124.52,"to":1128.46,"location":2,"content":"You're [NOISE] just got something you're interested in or say,"},{"from":1128.46,"to":1131.89,"location":2,"content":"\"Gee, I'd like to do better machine translation.\""},{"from":1131.89,"to":1135.22,"location":2,"content":"And then you work out some ways to address it with technology,"},{"from":1135.22,"to":1136.56,"location":2,"content":"or you start with some, um,"},{"from":1136.56,"to":1138.7,"location":2,"content":"technical approach of interest."},{"from":1138.7,"to":1140.55,"location":2,"content":"And you say, \"Oh well,"},{"from":1140.55,"to":1142.5,"location":2,"content":"those LSTMs seemed kind of neat,"},{"from":1142.5,"to":1144.36,"location":2,"content":"but I didn't understand why there's"},{"from":1144.36,"to":1148.04,"location":2,"content":"that extra 10H and I think it'd be better if it changed in this other way."},{"from":1148.04,"to":1153.57,"location":2,"content":"And you start exploring from a technical direction to try and come up with a better idea."},{"from":1153.57,"to":1155.97,"location":2,"content":"And then you're wanting to prove that it works."},{"from":1155.97,"to":1160.59,"location":2,"content":"So in kinds of the projects that people do for this class,"},{"from":1160.59,"to":1162.51,"location":2,"content":"this isn't quite an exhaustive list,"},{"from":1162.51,"to":1164.97,"location":2,"content":"but this is sort of in general what people do."},{"from":1164.97,"to":1168.51,"location":2,"content":"So, the first category and really I think this"},{"from":1168.51,"to":1172.08,"location":2,"content":"is the bulk of projects over half is people find"},{"from":1172.08,"to":1175.65,"location":2,"content":"some task replication of interest and they build"},{"from":1175.65,"to":1179.74,"location":2,"content":"some neural network models to try and do it as effectively as possible."},{"from":1179.74,"to":1187.02,"location":2,"content":"Um, there's a second category where people sort of concentrate on implementing,"},{"from":1187.02,"to":1193.58,"location":2,"content":"so re-implementing some complex neural architecture and getting it to work on some data."},{"from":1193.58,"to":1197.12,"location":2,"content":"And so let me just say a couple of sentences on this."},{"from":1197.12,"to":1201.53,"location":2,"content":"Um, so, it's certainly okay for you to,"},{"from":1201.53,"to":1205.39,"location":2,"content":"um, start by re-implementing some existing model."},{"from":1205.39,"to":1210.97,"location":2,"content":"Um, and some people that's as far as they 
get."},{"from":1210.97,"to":1214.63,"location":2,"content":"And then the question is, um, is that okay?"},{"from":1214.63,"to":1217.65,"location":2,"content":"And the answer to whether that's okay sort"},{"from":1217.65,"to":1220.92,"location":2,"content":"of largely depends on how complex your neural model is."},{"from":1220.92,"to":1228.06,"location":2,"content":"Um, so if what you think is okay I'm going to, um,"},{"from":1228.06,"to":1231.27,"location":2,"content":"re-implement something like we've seen already,"},{"from":1231.27,"to":1234.6,"location":2,"content":"like a window-based classification model and you"},{"from":1234.6,"to":1238.11,"location":2,"content":"just re-implement that and run it on some data and get some results and stop."},{"from":1238.11,"to":1240.36,"location":2,"content":"That's definitely a bad project."},{"from":1240.36,"to":1245.1,"location":2,"content":"Um, but there are lots of very complicated and sophisticated neural,"},{"from":1245.1,"to":1247.07,"location":2,"content":"um, architectures out there."},{"from":1247.07,"to":1251.79,"location":2,"content":"And if you're trying to do something complicated well then that can be a fine project."},{"from":1251.79,"to":1255.84,"location":2,"content":"Um, so, I actually sort of stuck in a few examples of projects."},{"from":1255.84,"to":1260.49,"location":2,"content":"So, I mean, here's one that was actually from a couple of years ago."},{"from":1260.49,"to":1263.54,"location":2,"content":"Um, so this was in the 2017 class."},{"from":1263.54,"to":1267.15,"location":2,"content":"And so, shortly before the 2017 class,"},{"from":1267.15,"to":1271.23,"location":2,"content":"\"Deep Mind\" who's one of the um, organizations producing"},{"from":1271.23,"to":1274.38,"location":2,"content":"the most complicated neural models had just released"},{"from":1274.38,"to":1277.89,"location":2,"content":"a paper about the differentiable neural computer model,"},{"from":1277.89,"to":1280.17,"location":2,"content":"which was a model of how to have something like"},{"from":1280.17,"to":1283.11,"location":2,"content":"a differentiate- differentiable Turing machine-like"},{"from":1283.11,"to":1286.66,"location":2,"content":"architecture inside a neural network, um,"},{"from":1286.66,"to":1289.05,"location":2,"content":"and thought, um,"},{"from":1289.05,"to":1292.23,"location":2,"content":"this would be a great challenge to try and, um,"},{"from":1292.23,"to":1296.97,"location":2,"content":"re-implement the differentiable neural computer which Deep Mind hadn't released"},{"from":1296.97,"to":1299.1,"location":2,"content":"any source code for because they're not the kind of"},{"from":1299.1,"to":1301.86,"location":2,"content":"place that generally releases their source code."},{"from":1301.86,"to":1306.42,"location":2,"content":"Um, and, you know, this was actually an extremely ambitious project because it"},{"from":1306.42,"to":1311.84,"location":2,"content":"was, it's a very complex architecture which is hard to get to train."},{"from":1311.84,"to":1314.27,"location":2,"content":"And so, you know, at the end,"},{"from":1314.27,"to":1318.18,"location":2,"content":"at the end she hadn't been able to sort of train as"},{"from":1318.18,"to":1322.23,"location":2,"content":"big a model or get as good results as they report in the paper that,"},{"from":1322.23,"to":1324.03,"location":2,"content":"you know, frankly we thought it was pretty"},{"from":1324.03,"to":1327.12,"location":2,"content":"miraculous that she managed to get it working at 
all."},{"from":1327.12,"to":1331.92,"location":2,"content":"In the period of time we had in the class and she did successfully do an open-source"},{"from":1331.92,"to":1336.6,"location":2,"content":"re-implementation of this model which basically worked the same as in their paper."},{"from":1336.6,"to":1337.77,"location":2,"content":"Though not quite as well."},{"from":1337.77,"to":1339.81,"location":2,"content":"So, you know, that seemed a huge achievement."},{"from":1339.81,"to":1343.9,"location":2,"content":"So, you certainly can do something of that sort."},{"from":1343.9,"to":1348.21,"location":2,"content":"Right. So, um, so you- you can sort of from"},{"from":1348.21,"to":1352.85,"location":2,"content":"a technical direction have some ideas for variant model and explore,"},{"from":1352.85,"to":1355.65,"location":2,"content":"um, how to make a different kind of model class and then look"},{"from":1355.65,"to":1359.07,"location":2,"content":"at how it works on some problem that works well."},{"from":1359.07,"to":1363.2,"location":2,"content":"Another kind of project you can do is an analysis project,"},{"from":1363.2,"to":1365.69,"location":2,"content":"so that you might be interested in something in"},{"from":1365.69,"to":1369.52,"location":2,"content":"natural language or something on the behavior of neural networks,"},{"from":1369.52,"to":1372.74,"location":2,"content":"and just think that you want to analyze them more closely."},{"from":1372.74,"to":1374.7,"location":2,"content":"So, you might think, \"Oh,"},{"from":1374.7,"to":1378.23,"location":2,"content":"maybe these neural machine translation systems work great"},{"from":1378.23,"to":1382.53,"location":2,"content":"providing the word order is the same in the source and target language,"},{"from":1382.53,"to":1387.18,"location":2,"content":"but can they really do a good job of reordering phrases for different language types?"},{"from":1387.18,"to":1389.67,"location":2,"content":"How much does their performance vary based on"},{"from":1389.67,"to":1392.63,"location":2,"content":"the amount of reordering between the source and target language?\""},{"from":1392.63,"to":1394.76,"location":2,"content":"And you could do some experiments to try and"},{"from":1394.76,"to":1398.61,"location":2,"content":"investigate that as an analysis problem that looks at a model,"},{"from":1398.61,"to":1401.01,"location":2,"content":"and we sometimes get projects like that."},{"from":1401.01,"to":1404.04,"location":2,"content":"Down at the bottom is the rarest kind of project,"},{"from":1404.04,"to":1406.86,"location":2,"content":"which is when some people try to do something"},{"from":1406.86,"to":1410.63,"location":2,"content":"theoretical which is to prove some properties of a system."},{"from":1410.63,"to":1415.41,"location":2,"content":"So if- this is easiest to do in simple systems for something like word vectors,"},{"from":1415.41,"to":1419.13,"location":2,"content":"that if you might want to prove something about"},{"from":1419.13,"to":1423.16,"location":2,"content":"the kind of spaces that are induced by word vectors,"},{"from":1423.16,"to":1425.49,"location":2,"content":"and what properties you need to have in"},{"from":1425.49,"to":1429.38,"location":2,"content":"models for word analogies to work or something like that."},{"from":1429.38,"to":1433.99,"location":2,"content":"Um here are just another couple of examples that so- shows some of the other classes."},{"from":1433.99,"to":1437.94,"location":2,"content":"So, this one is an example 
of \"find a problem and build some models.\""},{"from":1437.94,"to":1444.15,"location":2,"content":"So, these three people, um, looked at Shakespearean sonnet generation and then they considered"},{"from":1444.15,"to":1447.78,"location":2,"content":"several different models for Shakespearean sonnet generation and"},{"from":1447.78,"to":1451.77,"location":2,"content":"got the best results from this sort of- you probably can't really see all the details,"},{"from":1451.77,"to":1455.07,"location":2,"content":"but they have a sort of a mixture of word-level and"},{"from":1455.07,"to":1458.4,"location":2,"content":"character-level gated model that feeds into"},{"from":1458.4,"to":1463.13,"location":2,"content":"a word-level LSTM and produces sonnets, and the output wasn't totally bad."},{"from":1463.13,"to":1466.31,"location":2,"content":"\"Thy youth's time and face his form shall cover."},{"from":1466.31,"to":1468.87,"location":2,"content":"Now all fresh beauty my love there."},{"from":1468.87,"to":1472.29,"location":2,"content":"Will ever time to greet forget each like ever decease,"},{"from":1472.29,"to":1475.82,"location":2,"content":"but in a- in a best at worship his glory die.\""},{"from":1475.82,"to":1477.78,"location":2,"content":"Okay. It's maybe not perfect,"},{"from":1477.78,"to":1481.96,"location":2,"content":"[LAUGHTER] but it sort of sounds like a Shakespearean sonnet."},{"from":1481.96,"to":1484.16,"location":2,"content":"Um, okay."},{"from":1484.16,"to":1486.88,"location":2,"content":"Yeah. So, I showed you that one already."},{"from":1486.88,"to":1494.21,"location":2,"content":"Um, here's, um, an example of someone who designed a different kind of network,"},{"from":1494.21,"to":1498.76,"location":2,"content":"and this was a project that came out of this class that was then continued with,"},{"from":1498.76,"to":1501.31,"location":2,"content":"and the- they got a conference paper out of it,"},{"from":1501.31,"to":1503.86,"location":2,"content":"the ICLR 2017 paper."},{"from":1503.86,"to":1509.44,"location":2,"content":"So, this was looking at doing a better job at building a neural language model."},{"from":1509.44,"to":1512.11,"location":2,"content":"And essentially, they had two ideas,"},{"from":1512.11,"to":1516.43,"location":2,"content":"both of which seem useful for building better neural language models."},{"from":1516.43,"to":1520.75,"location":2,"content":"And so, one is that in the stuff that we've presented so far,"},{"from":1520.75,"to":1522.79,"location":2,"content":"whether it was the early word vectors,"},{"from":1522.79,"to":1525.61,"location":2,"content":"or what Abby presented last week in the neural language model,"},{"from":1525.61,"to":1530.44,"location":2,"content":"there are effectively two vectors for each word: there's one for the word encoding"},{"from":1530.44,"to":1535.42,"location":2,"content":"on the input and then when you have the softmax on the other side effectively,"},{"from":1535.42,"to":1539.5,"location":2,"content":"the rows of that matrix that go into the softmax are also"},{"from":1539.5,"to":1544.19,"location":2,"content":"word vectors for determining how likely you are to produce different words."},{"from":1544.19,"to":1548.71,"location":2,"content":"And so, um, these two people had the idea that maybe if we actually in the model"},{"from":1548.71,"to":1554.95,"location":2,"content":"tied those two word ve- vectors together, that would help and produce a better model, and,"},{"from":1554.95,"to":1557.23,"location":2,"content":"um, and so this 
was actually done"},{"from":1557.23,"to":1560.86,"location":2,"content":"several years ago when that was a novel idea which hadn't actually been done."},{"from":1560.86,"to":1564.09,"location":2,"content":"So, this was done in the 2016 class,"},{"from":1564.09,"to":1566.88,"location":2,"content":"and then they had this second idea which was,"},{"from":1566.88,"to":1569.08,"location":2,"content":"well maybe doing the kind of,"},{"from":1569.08,"to":1571.66,"location":2,"content":"one-zero cross entropy loss,"},{"from":1571.66,"to":1574.6,"location":2,"content":"where, sort of, you look at the correct word that you are meant to"},{"from":1574.6,"to":1578.62,"location":2,"content":"produce and sort of work out a loss based on that."},{"from":1578.62,"to":1581.14,"location":2,"content":"Maybe that's not very good because you don't get"},{"from":1581.14,"to":1585.52,"location":2,"content":"partial points if you produce a different word that's semantically similar."},{"from":1585.52,"to":1588.1,"location":2,"content":"And so, they had this idea that they could use"},{"from":1588.1,"to":1593.35,"location":2,"content":"word vector similarity and then you'd be giving a score for any word that was"},{"from":1593.35,"to":1596.31,"location":2,"content":"produced next based on how similar it was"},{"from":1596.31,"to":1599.47,"location":2,"content":"according to word vector similarity to the word that you are"},{"from":1599.47,"to":1601.72,"location":2,"content":"meant to produce next and that was also"},{"from":1601.72,"to":1605.88,"location":2,"content":"a useful idea that they were able to produce improved language models with."},{"from":1605.88,"to":1607.42,"location":2,"content":"So, that was a cool project."},{"from":1607.42,"to":1610.18,"location":2,"content":"Um, here's an example of, um,"},{"from":1610.18,"to":1612.01,"location":2,"content":"somebody from last year,"},{"from":1612.01,"to":1614.56,"location":2,"content":"um, who did an analysis project."},{"from":1614.56,"to":1617.13,"location":2,"content":"So, their idea was,"},{"from":1617.13,"to":1619.66,"location":2,"content":"um, that they- well,"},{"from":1619.66,"to":1620.68,"location":2,"content":"they were going to, um,"},{"from":1620.68,"to":1622.35,"location":2,"content":"evaluate on some task,"},{"from":1622.35,"to":1624.16,"location":2,"content":"they actually did several tasks, um,"},{"from":1624.16,"to":1627.13,"location":2,"content":"word similarity, analogy, and the SQuAD,"},{"from":1627.13,"to":1629.21,"location":2,"content":"um, question answering system."},{"from":1629.21,"to":1631.18,"location":2,"content":"But the question was, okay,"},{"from":1631.18,"to":1636.23,"location":2,"content":"a lot of neural network models are big and so aren't very suitable for phones, um,"},{"from":1636.23,"to":1641.95,"location":2,"content":"could we get away with compressing the models a lot so that rather than having doubles,"},{"from":1641.95,"to":1645.58,"location":2,"content":"or 32-bit floats, or even 16-bit floats,"},{"from":1645.58,"to":1648.6,"location":2,"content":"that are now used quite a bit in neural networks, could we,"},{"from":1648.6,"to":1652.9,"location":2,"content":"um, compress a lot more and quantize, um,"},{"from":1652.9,"to":1655.45,"location":2,"content":"numeric values so that we can only be, say,"},{"from":1655.45,"to":1660.38,"location":2,"content":"using two bits fo- per parameter so they'll literally need four bits per parameter?"},{"from":1660.38,"to":1662.89,"location":2,"content":"And if you do that naively, it 
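As a rough illustration of the first of those two ideas: tying the input embedding matrix to the softmax output matrix is essentially a one-line change in a PyTorch language model. The toy model below is only a sketch under that assumption, not the project's actual code.

```python
# Sketch of input/output embedding weight tying in a toy PyTorch language
# model; all names here are illustrative, not the project's actual code.
import torch.nn as nn

class TiedLM(nn.Module):
    def __init__(self, vocab_size, dim, hidden):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.rnn = nn.LSTM(dim, hidden, batch_first=True)
        self.proj = nn.Linear(hidden, dim)   # map hidden state back to embedding size
        self.out = nn.Linear(dim, vocab_size, bias=False)
        self.out.weight = self.embed.weight  # the tying: softmax matrix = embedding matrix

    def forward(self, tokens):
        h, _ = self.rnn(self.embed(tokens))
        return self.out(self.proj(h))        # logits over the vocabulary
```

The second idea would then replace the one-hot cross-entropy target with a soft target that also gives credit to words whose vectors are similar to the gold next word.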
doesn't work."},{"from":1662.89,"to":1668.5,"location":2,"content":"But if you explore some cleverer ways of doing it and see how to make things work,"},{"from":1668.5,"to":1671.45,"location":2,"content":"you can actually get it to work, um, really well."},{"from":1671.45,"to":1674.68,"location":2,"content":"Um, in fact, it actually seems like sometimes you can improve"},{"from":1674.68,"to":1679.39,"location":2,"content":"your performance doing this because the quantization acts as a form of regularizer."},{"from":1679.39,"to":1683.29,"location":2,"content":"Um, you can find lots of other projects, um, online,"},{"from":1683.29,"to":1687.14,"location":2,"content":"if you look at the CS224n pages and you should."},{"from":1687.14,"to":1688.99,"location":2,"content":"Um, okay."},{"from":1688.99,"to":1692.83,"location":2,"content":"So, if you want to do a final project you have to find someplace to start."},{"from":1692.83,"to":1695.95,"location":2,"content":"You know, one place is to start looking at papers there's"},{"from":1695.95,"to":1699.76,"location":2,"content":"online anthology of most of the NLP conference papers."},{"from":1699.76,"to":1703.69,"location":2,"content":"You can look at M- ML conferences have lots of relevant papers as well."},{"from":1703.69,"to":1708.71,"location":2,"content":"You can look at past CS224n papers that cover lots of topics."},{"from":1708.71,"to":1713.2,"location":2,"content":"Um, though, you know, I- I sugge- don't also forget, um,"},{"from":1713.2,"to":1716.18,"location":2,"content":"the advice down the bottom, um,"},{"from":1716.18,"to":1719.98,"location":2,"content":"which is look for an interesting problem in the world."},{"from":1719.98,"to":1723.67,"location":2,"content":"Um, so, our Stanford's CS emeritus professor"},{"from":1723.67,"to":1727.24,"location":2,"content":"Ed Feigenbaum likes to quote the advice of his,"},{"from":1727.24,"to":1730.46,"location":2,"content":"um, advisor, Herb Simon, um,"},{"from":1730.46,"to":1735.65,"location":2,"content":"of \"If you see a research area where many people are working, go somewhere else.\""},{"from":1735.65,"to":1736.87,"location":2,"content":"Um, well, you know,"},{"from":1736.87,"to":1741.12,"location":2,"content":"in the context of this class don't go so far away that you're not using"},{"from":1741.12,"to":1745.83,"location":2,"content":"neural networks or NLP because that won't work for project for this class."},{"from":1745.83,"to":1748.09,"location":2,"content":"But, you know, nevertheless, I mean,"},{"from":1748.09,"to":1750.25,"location":2,"content":"in some sense it's a bad strategy of"},{"from":1750.25,"to":1752.92,"location":2,"content":"saying let's look at all the papers that were published last year,"},{"from":1752.92,"to":1755.48,"location":2,"content":"and let's wo- start working on one of their problems,"},{"from":1755.48,"to":1758.61,"location":2,"content":"or lots of people are working on question-answering, I'll do it too."},{"from":1758.61,"to":1761.69,"location":2,"content":"You know, there are lots of interesting different problems"},{"from":1761.69,"to":1764.24,"location":2,"content":"in the world and if you know of some, you know,"},{"from":1764.24,"to":1768.34,"location":2,"content":"cool website that somehow does something interesting related to language,"},{"from":1768.34,"to":1771.51,"location":2,"content":"you know, maybe you can make a final project out of that."},{"from":1771.51,"to":1774.68,"location":2,"content":"Um, other ways to find final 
projects."},{"from":1774.68,"to":1778.09,"location":2,"content":"Um, so the person who's first put together most of"},{"from":1778.09,"to":1783.22,"location":2,"content":"the CS231n content was And- Andrej Karpathy, um,"},{"from":1783.22,"to":1786.76,"location":2,"content":"who now works at Tesla and among his other- things"},{"from":1786.76,"to":1790.73,"location":2,"content":"he did for the world he put together this site Arxiv Sanity Preserver, um,"},{"from":1790.73,"to":1794.56,"location":2,"content":"which is a way to find online archive papers which is"},{"from":1794.56,"to":1799,"location":2,"content":"a major pre-print server and if you say a few papers you're interested in,"},{"from":1799,"to":1801.43,"location":2,"content":"it'll show you other papers that you're interested in."},{"from":1801.43,"to":1803.76,"location":2,"content":"It'll show you papers that are currently trending."},{"from":1803.76,"to":1805.7,"location":2,"content":"So, that can be a good way to look."},{"from":1805.7,"to":1808.15,"location":2,"content":"Um, if you think it'd be just good to be in"},{"from":1808.15,"to":1810.61,"location":2,"content":"some competition where you're wanting to"},{"from":1810.61,"to":1813.2,"location":2,"content":"build a system that's better than other people's,"},{"from":1813.2,"to":1816.41,"location":2,"content":"um, you can look at leaderboards for various tasks."},{"from":1816.41,"to":1819.16,"location":2,"content":"So, there's this brand new site which is pretty good though"},{"from":1819.16,"to":1821.95,"location":2,"content":"not completely error free and correct, of"},{"from":1821.95,"to":1826.12,"location":2,"content":"paperswithcode.com, and it collects a whole lot of"},{"from":1826.12,"to":1831.19,"location":2,"content":"leaderboards for a whole lot of machine learning tasks including tons of language ones."},{"from":1831.19,"to":1833.86,"location":2,"content":"So, it gives leaderboards for question answering,"},{"from":1833.86,"to":1835.99,"location":2,"content":"machine translation, named entity recognition,"},{"from":1835.99,"to":1838.09,"location":2,"content":"language modeling, part of speech tagging."},{"from":1838.09,"to":1840.12,"location":2,"content":"All sorts of tasks you can find there,"},{"from":1840.12,"to":1844.92,"location":2,"content":"and find out what the current states of the art and datasets are."},{"from":1844.92,"to":1848.47,"location":2,"content":"Okay. 
Um, so, you know,"},{"from":1848.47,"to":1850.3,"location":2,"content":"different projects are different,"},{"from":1850.3,"to":1854.68,"location":2,"content":"but often, for a lot of projects, the things you need to be making sure of are"},{"from":1854.68,"to":1859.21,"location":2,"content":"that it's something that you can get a decent amount of data about, so you can train a model."},{"from":1859.21,"to":1860.8,"location":2,"content":"That it's a feasible task,"},{"from":1860.8,"to":1864.1,"location":2,"content":"not so enormous you can't possibly do it in four weeks."},{"from":1864.1,"to":1868.42,"location":2,"content":"Um, you'll want to have some evaluation metric, and"},{"from":1868.42,"to":1870.76,"location":2,"content":"normally for deep learning you have to have-"},{"from":1870.76,"to":1873.22,"location":2,"content":"even if you hope to do some human evaluation"},{"from":1873.22,"to":1877.11,"location":2,"content":"as well, you have to have some automatic evaluation metric."},{"from":1877.11,"to":1879.65,"location":2,"content":"Because unless there's just some code that you can run"},{"from":1879.65,"to":1882.41,"location":2,"content":"that gives you a score for how well you're doing,"},{"from":1882.41,"to":1884.02,"location":2,"content":"then"},{"from":1884.02,"to":1887.92,"location":2,"content":"you just sort of can't do the deep learning trick of saying, \"Okay,"},{"from":1887.92,"to":1894.04,"location":2,"content":"let's, um, do backpropagation to optimize our scores according to this metric.\""},{"from":1894.04,"to":1899.05,"location":2,"content":"And pretty much you'll want to do that to be able to do neural network optimization."},{"from":1899.05,"to":1905.02,"location":2,"content":"Um, and we do require that there is an important part of NLP in your class project."},{"from":1905.02,"to":1906.4,"location":2,"content":"I mean, it doesn't have to be the only thing,"},{"from":1906.4,"to":1908.66,"location":2,"content":"you can be doing reinforcement learning as well,"},{"from":1908.66,"to":1911.38,"location":2,"content":"or you could do image captioning, say, if you're"},{"from":1911.38,"to":1913.3,"location":2,"content":"doing joint vision and NLP,"},{"from":1913.3,"to":1915.65,"location":2,"content":"but there has to be NLP in it."},{"from":1915.65,"to":1922.35,"location":2,"content":"Okay. Ah, last bit before I get back onto the content from last week."},{"from":1922.35,"to":1927.61,"location":2,"content":"Ah, so, something that you'll need to do is have data for your project."},{"from":1927.61,"to":1932.37,"location":2,"content":"Um, so some people collect their own data for a project and, you know,"},{"from":1932.37,"to":1934.7,"location":2,"content":"it's not impossible to collect your own data"},{"from":1934.7,"to":1937.95,"location":2,"content":"especially if there's something you can do with unsupervised data."},{"from":1937.95,"to":1941.45,"location":2,"content":"You might be able to get it by just sort of crawling an interesting website."},{"from":1941.45,"to":1945.17,"location":2,"content":"You can annotate a small amount of data yourself."},{"from":1945.17,"to":1948.66,"location":2,"content":"If you have any site that has some kind of, you know,"},{"from":1948.66,"to":1951.33,"location":2,"content":"ratings annotation, stars, on it,"},{"from":1951.33,"to":1956.21,"location":2,"content":"you can treat those as a form of, ah, annotation."},{"from":1956.21,"to":1961.98,"location":2,"content":"Right? 
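On the automatic-metric point above: for SQuAD-style answers, the usual automatic score is token-overlap F1. The sketch below is a simplified version (the official evaluation script also lowercases and strips punctuation and articles).

```python
# Simplified token-overlap F1 for scoring SQuAD-style answers; the
# official script adds normalization (lowercasing, punctuation, articles).
from collections import Counter

def token_f1(prediction, gold):
    p, g = prediction.split(), gold.split()
    common = Counter(p) & Counter(g)       # per-token overlap counts
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(p)
    recall = overlap / len(g)
    return 2 * precision * recall / (precision + recall)

print(token_f1("the town of Madeira", "Madeira"))  # 0.4
```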
So, if you want to predict something like, um, you know,"},{"from":1961.98,"to":1966.67,"location":2,"content":"which descriptions on product review websites"},{"from":1966.67,"to":1970.23,"location":2,"content":"or which reviews on product review websites do people like?"},{"from":1970.23,"to":1973.29,"location":2,"content":"Well, they get star ratings at the bottom from people and"},{"from":1973.29,"to":1976.61,"location":2,"content":"then you can try and fit to that as your supervision."},{"from":1976.61,"to":1981.03,"location":2,"content":"Um, sometimes people have data from an existing project for a company."},{"from":1981.03,"to":1982.63,"location":2,"content":"You can use that."},{"from":1982.63,"to":1985.33,"location":2,"content":"But nevertheless for most people, um,"},{"from":1985.33,"to":1988.13,"location":2,"content":"given that classes are short and things like that,"},{"from":1988.13,"to":1990.53,"location":2,"content":"the practical thing to do is use"},{"from":1990.53,"to":1995.19,"location":2,"content":"an existing curated dataset that's been built by previous researchers."},{"from":1995.19,"to":2000.12,"location":2,"content":"That normally gives you a fast start and lets you get to work building models, um,"},{"from":2000.12,"to":2001.93,"location":2,"content":"there's obvious prior work,"},{"from":2001.93,"to":2004.63,"location":2,"content":"there are baselines and previous systems"},{"from":2004.63,"to":2008.25,"location":2,"content":"that you can compare your performance on, et cetera."},{"from":2008.25,"to":2012.04,"location":2,"content":"Okay. Um, so, where can you find data?"},{"from":2012.04,"to":2015.14,"location":2,"content":"I'll just mention a couple of places here and there are lots more."},{"from":2015.14,"to":2017.47,"location":2,"content":"So, traditionally the biggest source of"},{"from":2017.47,"to":2020.54,"location":2,"content":"linguistic data used by academics was this place called"},{"from":2020.54,"to":2023.42,"location":2,"content":"the Linguistic Data Consortium and they have lots of"},{"from":2023.42,"to":2026.96,"location":2,"content":"datasets for treebanks and named entities and coreference,"},{"from":2026.96,"to":2028.98,"location":2,"content":"parallel machine translation data,"},{"from":2028.98,"to":2030.4,"location":2,"content":"et cetera, et cetera."},{"from":2030.4,"to":2035.31,"location":2,"content":"And so, um, the Linguistic Data Consortium licenses their data,"},{"from":2035.31,"to":2039.11,"location":2,"content":"Stanford pays that license so you can use any of it."},{"from":2039.11,"to":2041.5,"location":2,"content":"Um, but if you want to use it, um,"},{"from":2041.5,"to":2045.36,"location":2,"content":"you go to that, um, linguistics.stanford.edu page."},{"from":2045.36,"to":2048.32,"location":2,"content":"And there's a sign-up, um, ah,"},{"from":2048.32,"to":2052.49,"location":2,"content":"piece on how to sign up where you basically, um, say,"},{"from":2052.49,"to":2054.2,"location":2,"content":"\"I will use this data only for"},{"from":2054.2,"to":2057.94,"location":2,"content":"good Stanford purposes and not as the basis of my startup.\""},{"from":2057.94,"to":2061.07,"location":2,"content":"And, um, then you can have access to that data"},{"from":2061.07,"to":2064.78,"location":2,"content":"and it can be made available by NFS or otherwise."},{"from":2064.78,"to":2067.34,"location":2,"content":"Um, but as time has gone by,"},{"from":2067.34,"to":2072.28,"location":2,"content":"there's a ton of curated NLP data that's 
available on various websites."},{"from":2072.28,"to":2074.61,"location":2,"content":"In fact, if anything the problem is it's just sort of"},{"from":2074.61,"to":2077.99,"location":2,"content":"spread over the web, and it's sort of hard to find different things."},{"from":2077.99,"to":2082.31,"location":2,"content":"But there are some, some sites that have a lot of data for various purposes."},{"from":2082.31,"to":2085.97,"location":2,"content":"So, for anything related to machine translation or just parallel,"},{"from":2085.97,"to":2087.97,"location":2,"content":"um, data across different languages,"},{"from":2087.97,"to":2092.68,"location":2,"content":"the statistical MT statmt.org site has a great amount of"},{"from":2092.68,"to":2097.43,"location":2,"content":"data and that organization runs shared tasks every year,"},{"from":2097.43,"to":2099.32,"location":2,"content":"the Workshop on Machine Translation,"},{"from":2099.32,"to":2103.36,"location":2,"content":"WMT, which Abby already mentioned in her class."},{"from":2103.36,"to":2105.28,"location":2,"content":"And they've got datasets that we use for"},{"from":2105.28,"to":2108.21,"location":2,"content":"those tasks and then there are leaderboards for those tasks."},{"from":2108.21,"to":2110.41,"location":2,"content":"And you can find data for that."},{"from":2110.41,"to":2113.9,"location":2,"content":"Um, if you thought dependency parsing was cool, um,"},{"from":2113.9,"to":2118.7,"location":2,"content":"there's the Universal Dependencies site which has parallel, not parallel site,"},{"from":2118.7,"to":2121.72,"location":2,"content":"which has treebanks in the same annotation scheme for"},{"from":2121.72,"to":2124.3,"location":2,"content":"about 60 different languages and you can work on"},{"from":2124.3,"to":2127.8,"location":2,"content":"parsers for different languages and things like that."},{"from":2127.8,"to":2131.33,"location":2,"content":"Um, I'm not gonna bore you with going through all of them but, you know,"},{"from":2131.33,"to":2133.84,"location":2,"content":"there are just tons and tons of other datasets:"},{"from":2133.84,"to":2137.68,"location":2,"content":"Facebook has released datasets, Google's released datasets,"},{"from":2137.68,"to":2141.38,"location":2,"content":"and Stanford has released several other datasets including"},{"from":2141.38,"to":2145.23,"location":2,"content":"the Stanford Sentiment Treebank and the Stanford Na- Natural Language, um,"},{"from":2145.23,"to":2148.78,"location":2,"content":"Inference corpus, uh, new question-answering datasets,"},{"from":2148.78,"to":2152.98,"location":2,"content":"including HotPotQA and conversational question answering."},{"from":2152.98,"to":2156.18,"location":2,"content":"Other groups at different universities have released datasets."},{"from":2156.18,"to":2157.66,"location":2,"content":"There are just tons of them."},{"from":2157.66,"to":2162.95,"location":2,"content":"You can find data on sites like Kaggle where it has machine-learning competitions."},{"from":2162.95,"to":2166.02,"location":2,"content":"There are sites with lists of datasets."},{"from":2166.02,"to":2169.86,"location":2,"content":"You can look at research papers and see what datasets they used."},{"from":2169.86,"to":2172.7,"location":2,"content":"And of course, you can ask the course staff, or ask on Piazza,"},{"from":2172.7,"to":2176.3,"location":2,"content":"to try and find suitable datasets for a project."},{"from":2176.3,"to":2179.57,"location":2,"content":"Okay. 
Um, so that's a fair bit about"},{"from":2179.57,"to":2183.18,"location":2,"content":"the projects. I've got a bit more to say later about doing projects."},{"from":2183.18,"to":2188.64,"location":2,"content":"Does anyone have any questions up until now on projects?"},{"from":2188.64,"to":2194.18,"location":2,"content":"Okay. Um, well, so now we're gonna sort of, um,"},{"from":2194.18,"to":2199.2,"location":2,"content":"flip a switch in our brains and go back and have one more look,"},{"from":2199.2,"to":2202.11,"location":2,"content":"um, at gated recurrent units,"},{"from":2202.11,"to":2205.49,"location":2,"content":"um, and what happens and what they mean."},{"from":2205.49,"to":2207.24,"location":2,"content":"Um, and, you know,"},{"from":2207.24,"to":2208.72,"location":2,"content":"this is sort of,"},{"from":2208.72,"to":2211.57,"location":2,"content":"sort of the same material that Abby presented,"},{"from":2211.57,"to":2214.07,"location":2,"content":"presented a little bit differently but, you know,"},{"from":2214.07,"to":2217.13,"location":2,"content":"I hope it might just sort of give one more way of"},{"from":2217.13,"to":2220.52,"location":2,"content":"sort of thinking a bit about what's happening about"},{"from":2220.52,"to":2223.8,"location":2,"content":"these gated recurrent units and why they might be doing"},{"from":2223.8,"to":2227.45,"location":2,"content":"something useful and what are the alternatives to them."},{"from":2227.45,"to":2231.64,"location":2,"content":"So, if you remember the problem we started with is that we"},{"from":2231.64,"to":2236.53,"location":2,"content":"wanted to understand sort of derivatives backward in time."},{"from":2236.53,"to":2238.27,"location":2,"content":"And so, the idea of that is well,"},{"from":2238.27,"to":2242.06,"location":2,"content":"if we twiddle this a little bit at time t,"},{"from":2242.06,"to":2247.24,"location":2,"content":"how much effect is that going to have so we make some adjustment here."},{"from":2247.24,"to":2252.05,"location":2,"content":"How much effect is that going to have n time steps later?"},{"from":2252.05,"to":2258.21,"location":2,"content":"Um, and well, we sort of looked at the derivatives and we sort of saw we got these,"},{"from":2258.21,"to":2261.9,"location":2,"content":"um, terms for each successive time step."},{"from":2261.9,"to":2268.7,"location":2,"content":"And so, as Abby discussed, the problem is that for the derivatives that we got,"},{"from":2268.7,"to":2272.22,"location":2,"content":"we kind of got this matrix form for each time step."},{"from":2272.22,"to":2275.16,"location":2,"content":"And so that if we're going through a lot of time steps,"},{"from":2275.16,"to":2280.59,"location":2,"content":"we got a lot of matrix multiplies and as the result of those matrix multiplies,"},{"from":2280.59,"to":2283.28,"location":2,"content":"pretty much either things disappeared down to"},{"from":2283.28,"to":2287.28,"location":2,"content":"zero or exploded upward depending on what was in the matrix."},{"from":2287.28,"to":2290.24,"location":2,"content":"And so that- and so that sort of means,"},{"from":2290.24,"to":2291.59,"location":2,"content":"when the gradient goes to zero,"},{"from":2291.59,"to":2294.87,"location":2,"content":"we kind of can't know what's happening there."},{"from":2294.87,"to":2298.63,"location":2,"content":"Whether there isn't any conditioning or just we can't measure it."},{"from":2298.63,"to":2303.03,"location":2,"content":"And so that's sort of made people think that 
maybe this naive, um,"},{"from":2303.03,"to":2309.35,"location":2,"content":"recurrent neural network transition function just isn't a good one to use."},{"from":2309.35,"to":2313.76,"location":2,"content":"And that sort of leads into these ideas of gated recurrent units."},{"from":2313.76,"to":2315.93,"location":2,"content":"Right? Because if we have"},{"from":2315.93,"to":2319.24,"location":2,"content":"the simple recurrent neural network where we're"},{"from":2319.24,"to":2322.8,"location":2,"content":"sort of feeding forward for each step in time."},{"from":2322.8,"to":2325.52,"location":2,"content":"Well, what happens is when we backpropagate,"},{"from":2325.52,"to":2326.95,"location":2,"content":"we have to backpropagate through"},{"from":2326.95,"to":2332.23,"location":2,"content":"every intermediate node and that's where we sort of have our gradients disappear."},{"from":2332.23,"to":2337.19,"location":2,"content":"And so an idea of how you could fix that is to say well,"},{"from":2337.19,"to":2343.13,"location":2,"content":"suppose we just put in direct connections that were longer distance, um,"},{"from":2343.13,"to":2347.22,"location":2,"content":"then we'd also get direct backpropagation signal"},{"from":2347.22,"to":2351.86,"location":2,"content":"and so then we wouldn't have this same problem of vanishing gradients."},{"from":2351.86,"to":2357.13,"location":2,"content":"And effectively, we've sort of looked at two ways in which you can achieve that effect."},{"from":2357.13,"to":2361.24,"location":2,"content":"Because one way you can achieve that effect, which Abby looked at"},{"from":2361.24,"to":2365.45,"location":2,"content":"in the end part of the last lecture, was this idea of attention."},{"from":2365.45,"to":2367.45,"location":2,"content":"So, when you've got attention,"},{"from":2367.45,"to":2371.89,"location":2,"content":"you're actually creating these shortcut connections,"},{"from":2371.89,"to":2373.77,"location":2,"content":"oops, they're the blue ones, um,"},{"from":2373.77,"to":2378.87,"location":2,"content":"from every time step and using it to calculate an attention distribution."},{"from":2378.87,"to":2381.32,"location":2,"content":"But the way the attention was done that we looked at,"},{"from":2381.32,"to":2386.13,"location":2,"content":"it was sort of mushing together all previous time steps into some kind of an average."},{"from":2386.13,"to":2390.55,"location":2,"content":"But the idea of the gated recurrent units is in some sense we want to"},{"from":2390.55,"to":2395.76,"location":2,"content":"achieve this same kind of ability to have shortcut connections."},{"from":2395.76,"to":2397.95,"location":2,"content":"But we want to do it in"},{"from":2397.95,"to":2404.41,"location":2,"content":"a more controlled and adaptive fashion where we still do remember the position of things."},{"from":2404.41,"to":2408.97,"location":2,"content":"So, how can we create an adaptive shortcut connection?"},{"from":2408.97,"to":2410.77,"location":2,"content":"And so that's, um,"},{"from":2410.77,"to":2417.58,"location":2,"content":"what we start to do with the gates that are put into a gated recurrent network."},{"from":2417.58,"to":2422.36,"location":2,"content":"So, if- so first off we sort of say let's have"},{"from":2422.36,"to":2426.22,"location":2,"content":"a candidate update which is exactly the same"},{"from":2426.22,"to":2430.39,"location":2,"content":"as the one that's used in a simple recurrent neural 
network."},{"from":2430.39,"to":2434.28,"location":2,"content":"But what we can do is add a gate."},{"from":2434.28,"to":2437.89,"location":2,"content":"And so, the gate will calculate a value from zero to one."},{"from":2437.89,"to":2441.59,"location":2,"content":"And so what we're going to do here is mix together"},{"from":2441.59,"to":2446.21,"location":2,"content":"using our candidate update which is just like"},{"from":2446.21,"to":2451.72,"location":2,"content":"a simple recurrent neural network which will be then mixed together with simply"},{"from":2451.72,"to":2457.84,"location":2,"content":"directly carrying forward the hidden state from the previous time step."},{"from":2457.84,"to":2462.78,"location":2,"content":"So, once we're doing that we are sort of then adaptively-"},{"from":2462.78,"to":2469.99,"location":2,"content":"we're adaptively partly using a computation from one time step back,"},{"from":2469.99,"to":2473.08,"location":2,"content":"um, done as a recurrent neural network."},{"from":2473.08,"to":2476.98,"location":2,"content":"And we're partly just inheriting the,"},{"from":2476.98,"to":2479.54,"location":2,"content":"we're just part- sorry, we're partly inheriting"},{"from":2479.54,"to":2482.26,"location":2,"content":"the hidden state from the previous time step."},{"from":2482.26,"to":2486.24,"location":2,"content":"So, it's sort of like a shortcut connection but we're waiting as to"},{"from":2486.24,"to":2490.84,"location":2,"content":"how much we're short cutting and how much we're doing our computation."},{"from":2490.84,"to":2498.75,"location":2,"content":"And we control that adaptive choice by using a calculation to set the gate."},{"from":2498.75,"to":2501.07,"location":2,"content":"And we do that with a sigmoid, um,"},{"from":2501.07,"to":2506.54,"location":2,"content":"computed over the import and the hidden- previous hidden state and using it again,"},{"from":2506.54,"to":2510.78,"location":2,"content":"an equation kind of like a simple recurrent neural network."},{"from":2510.78,"to":2513.93,"location":2,"content":"Okay. 
Um, but, you know,"},{"from":2513.93,"to":2517.72,"location":2,"content":"if you wanted to go a bit further than that,"},{"from":2517.72,"to":2520.38,"location":2,"content":"um, you could think well,"},{"from":2520.38,"to":2525.82,"location":2,"content":"maybe sometimes we sort of might actually"},{"from":2525.82,"to":2531.43,"location":2,"content":"just want to get rid of the stuff that was in the past."},{"from":2531.43,"to":2535.47,"location":2,"content":"That maybe the stuff in the past sometimes becomes irrelevant, like,"},{"from":2535.47,"to":2538.29,"location":2,"content":"maybe sometimes we start a new sentence or a new"},{"from":2538.29,"to":2541.91,"location":2,"content":"thought and we just want to get rid of the stuff that's in the past."},{"from":2541.91,"to":2545.7,"location":2,"content":"And so, that can lead into this idea of having a second gate,"},{"from":2545.7,"to":2551.36,"location":2,"content":"a reset gate and so the reset gate calculates a value from 0 to 1, um,"},{"from":2551.36,"to":2553.07,"location":2,"content":"just like the other gates,"},{"from":2553.07,"to":2558.66,"location":2,"content":"and then we're doing this element wise dot-product between"},{"from":2558.66,"to":2564.43,"location":2,"content":"the reset gate and the previous hidden state and that's then sort of saying well,"},{"from":2564.43,"to":2567.9,"location":2,"content":"maybe we want to keep some parts of what was stored"},{"from":2567.9,"to":2572.36,"location":2,"content":"previously and some parts that we now want to throw away."},{"from":2572.36,"to":2576.15,"location":2,"content":"And so we put that into the model as a second gate."},{"from":2576.15,"to":2581.01,"location":2,"content":"Um, and so an interesting way to think about that is to sort of think"},{"from":2581.01,"to":2585.54,"location":2,"content":"about this as if this recurrent neural network is like"},{"from":2585.54,"to":2590.13,"location":2,"content":"a little tiny computer as the kind of little tiny computers you"},{"from":2590.13,"to":2595.03,"location":2,"content":"might do in a sort of simple architecture class and if you think about it that way,"},{"from":2595.03,"to":2600.3,"location":2,"content":"um, for the basic simple recurrent neural network"},{"from":2600.3,"to":2605.47,"location":2,"content":"the way the tiny computer works is that you've got a bank of registers h,"},{"from":2605.47,"to":2610.03,"location":2,"content":"your hidden state, and at each time step you have to"},{"from":2610.03,"to":2617.91,"location":2,"content":"read- whoops, at each time step you have to read the entirety of your bank of registers,"},{"from":2617.91,"to":2621,"location":2,"content":"you do some computation and then you write"},{"from":2621,"to":2624.6,"location":2,"content":"the entirety of your bank of registers and, you know,"},{"from":2624.6,"to":2627.96,"location":2,"content":"if in terms of thinking about computer architecture,"},{"from":2627.96,"to":2632.19,"location":2,"content":"that sounds like a pretty bad way to implement a simple computer."},{"from":2632.19,"to":2637.55,"location":2,"content":"Um, so precisely what a gated recurrent unit is doing is saying,"},{"from":2637.55,"to":2641.96,"location":2,"content":"\"Well, maybe we can have a slightly more sophisticated little baby computer.\""},{"from":2641.96,"to":2648.09,"location":2,"content":"Instead of that, we could select a subset of the registers that we want to read."},{"from":2648.09,"to":2651.17,"location":2,"content":"And so, the reset gate can control that 
because it can say,"},{"from":2651.17,"to":2653.72,"location":2,"content":"\"We'll just ignore a bunch of the other registers.\""},{"from":2653.72,"to":2660.78,"location":2,"content":"Um, it then will compute a new value based on just these, um,"},{"from":2660.78,"to":2667.22,"location":2,"content":"stored registers and then the update gate which is also adaptive can say, \"Well,"},{"from":2667.22,"to":2669.3,"location":2,"content":"I want you to write"},{"from":2669.3,"to":2674.58,"location":2,"content":"some registers but the rest of the registers will just keep their previous value.\""},{"from":2674.58,"to":2677.49,"location":2,"content":"That seems a useful idea to have in a computer."},{"from":2677.49,"to":2679.68,"location":2,"content":"And so, that's what we're doing here."},{"from":2679.68,"to":2682.71,"location":2,"content":"And so, this model here is, um,"},{"from":2682.71,"to":2689.11,"location":2,"content":"what was- Abby presented second as the gated recurrent unit."},{"from":2689.11,"to":2693.39,"location":2,"content":"So, this is sort of a much more realistic model"},{"from":2693.39,"to":2697.51,"location":2,"content":"and it sort of in some sense overlaps with the ideas of attention."},{"from":2697.51,"to":2703.24,"location":2,"content":"Okay. Um, so gated recurrent units are actually a quite new model."},{"from":2703.24,"to":2707.97,"location":2,"content":"Um, the model that was done way earlier and has had huge impact"},{"from":2707.97,"to":2713.34,"location":2,"content":"is these LSTM long short-term memory units and they are a bit more complex."},{"from":2713.34,"to":2715.03,"location":2,"content":"Um, but, you know,"},{"from":2715.03,"to":2717.69,"location":2,"content":"a lot of it is sort of the same, right?"},{"from":2717.69,"to":2720.21,"location":2,"content":"So, the hidden state of"},{"from":2720.21,"to":2725.04,"location":2,"content":"a gated recurrent unit is kind of equivalent to the cell of the LSTM."},{"from":2725.04,"to":2729.99,"location":2,"content":"So, both of them are using the same idea of summing together,"},{"from":2729.99,"to":2734.46,"location":2,"content":"a mixture of just directly interpret- directly inheriting"},{"from":2734.46,"to":2739.14,"location":2,"content":"what you had from the previous time step together with, um,"},{"from":2739.14,"to":2743.79,"location":2,"content":"something that you've calculated for the current time step and the way you count-"},{"from":2743.79,"to":2749.55,"location":2,"content":"calculate it for the current time step is exactly the same in both cases."},{"from":2749.55,"to":2753.38,"location":2,"content":"Whoops, sorry. 
In both cases again you're calculating"},{"from":2753.38,"to":2758.13,"location":2,"content":"the current update using this sort of simple RNN equation."},{"from":2758.13,"to":2760.56,"location":2,"content":"So, those parts are exactly the same."},{"from":2760.56,"to":2764.31,"location":2,"content":"Um, but the LSTM is a little bit more complicated."},{"from":2764.31,"to":2767.31,"location":2,"content":"It now has three gates, um,"},{"from":2767.31,"to":2768.8,"location":2,"content":"and it's got this extra, um,"},{"from":2768.8,"to":2772.5,"location":2,"content":"hidden state that's then worked out with a bit more complexity."},{"from":2772.5,"to":2777.17,"location":2,"content":"So, in terms of my LSTM picture, you know,"},{"from":2777.17,"to":2782.36,"location":2,"content":"the LSTM picture looks, if you sort of pull apart all of its math, pretty"},{"from":2782.36,"to":2789.99,"location":2,"content":"complex, but so there are three gates so that you can forget or ignore everything."},{"from":2789.99,"to":2792.03,"location":2,"content":"So, you can forget or ignore the input,"},{"from":2792.03,"to":2793.89,"location":2,"content":"you can forget or ignore parts of"},{"from":2793.89,"to":2798.75,"location":2,"content":"your previous hidden state and you can forget or ignore parts of the cell"},{"from":2798.75,"to":2802.07,"location":2,"content":"when calculating the output and each of these"},{"from":2802.07,"to":2806.14,"location":2,"content":"is produce- when I say forget or ignore parts of,"},{"from":2806.14,"to":2810.63,"location":2,"content":"what that's meaning is you're calculating a vector which is then going to be element-wise"},{"from":2810.63,"to":2816.07,"location":2,"content":"multiplied by the input or the previous hidden state or the cell."},{"from":2816.07,"to":2819.27,"location":2,"content":"And so, that's why you have, effectively, now an addressable bank of"},{"from":2819.27,"to":2823.34,"location":2,"content":"registers where you can use some of them but not others of them."},{"from":2823.34,"to":2826.78,"location":2,"content":"Okay. 
So, the bottom part of the LSTM is just"},{"from":2826.78,"to":2830.4,"location":2,"content":"like a simpler simple recurrent neural network,"},{"from":2830.4,"to":2832.82,"location":2,"content":"um, which then calculates,"},{"from":2832.82,"to":2835.13,"location":2,"content":"um, a candidate update."},{"from":2835.13,"to":2841.29,"location":2,"content":"And so, for both of the GRU and the LSTM the real secret is"},{"from":2841.29,"to":2844.14,"location":2,"content":"that rather than just keeping on multiplying"},{"from":2844.14,"to":2848.03,"location":2,"content":"stuff what you do is you add two things together."},{"from":2848.03,"to":2852.12,"location":2,"content":"Um, and so this adding is why you don't"},{"from":2852.12,"to":2856.05,"location":2,"content":"get the same vanishing gradient evil effects because you're calculating a"},{"from":2856.05,"to":2859.32,"location":2,"content":"new candidate update and you're adding it to stuff that was"},{"from":2859.32,"to":2862.66,"location":2,"content":"previously in the cell and that gives you"},{"from":2862.66,"to":2866.19,"location":2,"content":"a simple gradient when you backpropagate that- that you have"},{"from":2866.19,"to":2872.74,"location":2,"content":"a direct linear connection between the cell at time t and the cell at time t minus one."},{"from":2872.74,"to":2876.24,"location":2,"content":"And so, really that simple addition there is sort of"},{"from":2876.24,"to":2880.35,"location":2,"content":"the secret of most of the power of LSTMs and"},{"from":2880.35,"to":2884.01,"location":2,"content":"this same idea of adding two things together has also been a"},{"from":2884.01,"to":2888.11,"location":2,"content":"secret of many of the other advances in deep learning recently."},{"from":2888.11,"to":2892.45,"location":2,"content":"So, in vision in the last couple of years the sort of standard model"},{"from":2892.45,"to":2897.06,"location":2,"content":"that everybody uses is ResNets, residual networks, and they use"},{"from":2897.06,"to":2903,"location":2,"content":"exactly the same secret of allowing these adaptive updates where you add"},{"from":2903,"to":2910.68,"location":2,"content":"together a current layer's value with directly inheriting a value from the layer below."},{"from":2910.68,"to":2915.06,"location":2,"content":"Um, other things that use similar ideas are things like highway networks and so on."},{"from":2915.06,"to":2919.05,"location":2,"content":"So, that's proven to be an extremely powerful idea."},{"from":2919.05,"to":2922.44,"location":2,"content":"Um, the LSTM is slightly different from"},{"from":2922.44,"to":2926.51,"location":2,"content":"the GRU because when we look back at its equations"},{"from":2926.51,"to":2933.99,"location":2,"content":"that the- the GRU kind of does a linear mixture where you have one gate value,"},{"from":2933.99,"to":2937.55,"location":2,"content":"UT, and one minus UT,"},{"from":2937.55,"to":2942.87,"location":2,"content":"where the LSTM adds values controlled by two different gates,"},{"from":2942.87,"to":2945.61,"location":2,"content":"a forget gate, and an input gate."},{"from":2945.61,"to":2949.29,"location":2,"content":"Theoretically, having the adding of"},{"from":2949.29,"to":2953.94,"location":2,"content":"two separate gates rather than a mixture is theoretically more powerful."},{"from":2953.94,"to":2956.55,"location":2,"content":"Um, depending on the application,"},{"from":2956.55,"to":2959.37,"location":2,"content":"sometimes it doesn't seem to make much difference, 
um,"},{"from":2959.37,"to":2963.48,"location":2,"content":"but there's definitely a theoretical advantage to the LSTM there."},{"from":2963.48,"to":2971.07,"location":2,"content":"Okay. Um, just, I hope that's maybe a little bit more helpful to have seen those again,"},{"from":2971.07,"to":2977.97,"location":2,"content":"um, any questions on gated recurrent units?"},{"from":2977.97,"to":2982.65,"location":2,"content":"Still look confusing?"},{"from":2982.65,"to":2988.45,"location":2,"content":"I think it's useful to have some kind of idea as to why the people come up with"},{"from":2988.45,"to":2993.67,"location":2,"content":"these things and why do they make sense but,"},{"from":2993.67,"to":2998.68,"location":2,"content":"you know, nevertheless, the reality is in the sort of era of"},{"from":2998.68,"to":3003.75,"location":2,"content":"2015 plus any deep learning package you use whether it's PyTorch,"},{"from":3003.75,"to":3005.94,"location":2,"content":"TensorFlow, MXNet whatever, you know,"},{"from":3005.94,"to":3011.25,"location":2,"content":"it just comes with LSTM and GRUs and you don't have to program your own."},{"from":3011.25,"to":3013.17,"location":2,"content":"In fact, you're at disadvantage if you"},{"from":3013.17,"to":3016.02,"location":2,"content":"program your own because if you are using the built-in one,"},{"from":3016.02,"to":3019.07,"location":2,"content":"it's using an efficient CUDA kernel from"},{"from":3019.07,"to":3023.91,"location":2,"content":"Nvidia whereas your custom built one won't and/or run three times slower."},{"from":3023.91,"to":3026.91,"location":2,"content":"Um, so, you know, essentially don't have to know how to do it,"},{"from":3026.91,"to":3030.53,"location":2,"content":"you can just take the attitude that an LSTM is just like"},{"from":3030.53,"to":3035.34,"location":2,"content":"a fancy recurrent network which will be easier to train and that's true."},{"from":3035.34,"to":3039.62,"location":2,"content":"Um, but you know, these kind of architectural ideas have actually been"},{"from":3039.62,"to":3045.42,"location":2,"content":"central to most of the big advances that have come in deep learning in the last couple of years,"},{"from":3045.42,"to":3047.64,"location":2,"content":"so there's actually good to have an ID,"},{"from":3047.64,"to":3049.92,"location":2,"content":"to have some sense of what were"},{"from":3049.92,"to":3053.68,"location":2,"content":"these important ideas that made everything so much better because they had"},{"from":3053.68,"to":3056.85,"location":2,"content":"the same kind of component building blocks you might also want"},{"from":3056.85,"to":3062.12,"location":2,"content":"to use in custom models that you design for yourself."},{"from":3062.12,"to":3066.84,"location":2,"content":"Okay, two bits of machine translation."},{"from":3066.84,"to":3071.25,"location":2,"content":"Um, so a bit of machine translation that we"},{"from":3071.25,"to":3075.72,"location":2,"content":"sort of didn't cover next week but lots of people have been seeing"},{"from":3075.72,"to":3079.92,"location":2,"content":"and getting confused by in the assignments so I thought I'd explain"},{"from":3079.92,"to":3084.21,"location":2,"content":"a bit about is UNKs and explain where do UNKs"},{"from":3084.21,"to":3088.41,"location":2,"content":"come from and why are there UNKs and the reason why"},{"from":3088.41,"to":3093.07,"location":2,"content":"there are UNKs is effectively kind of for efficiency 
reasons."},{"from":3093.07,"to":3099.7,"location":2,"content":"So, if you sort of think about producing output in a neural machine translation system"},{"from":3099.7,"to":3103.17,"location":2,"content":"and really this is the same as producing output"},{"from":3103.17,"to":3106.68,"location":2,"content":"in any natural, neural natural language generation system,"},{"from":3106.68,"to":3109.78,"location":2,"content":"so that's really the same for neural language model, um,"},{"from":3109.78,"to":3116.97,"location":2,"content":"that if you have a very large output vocabulary is just a expensive operation."},{"from":3116.97,"to":3124.85,"location":2,"content":"So you have a big matrix of softmax parameters where you have a row for every word, um,"},{"from":3124.85,"to":3132.42,"location":2,"content":"and then you have what,"},{"from":3132.42,"to":3135.33,"location":2,"content":"[NOISE] then we have an animation that is not working for me."},{"from":3135.33,"to":3138.21,"location":2,"content":"Oh, all right there, there we go."},{"from":3138.21,"to":3141.03,"location":2,"content":"Um, so then we have some hidden state that we've"},{"from":3141.03,"to":3145.34,"location":2,"content":"calculated in our recurrent neural network."},{"from":3145.34,"to":3149.99,"location":2,"content":"And so, what we gonna do is sort of multiply, um,"},{"from":3149.99,"to":3153.11,"location":2,"content":"that vector by every row of the matrix,"},{"from":3153.11,"to":3159.03,"location":2,"content":"put it through a softmax and then get probabilities without putting every word."},{"from":3159.03,"to":3160.77,"location":2,"content":"Um, and you know,"},{"from":3160.77,"to":3164.04,"location":2,"content":"this seems pretty simple but the problem is that"},{"from":3164.04,"to":3167.4,"location":2,"content":"to the extent that you have a humongous vocabulary here,"},{"from":3167.4,"to":3171.24,"location":2,"content":"you just have to do a humongous number of rows"},{"from":3171.24,"to":3175.18,"location":2,"content":"of this multiplication and it actually turns out that"},{"from":3175.18,"to":3179.03,"location":2,"content":"doing this is the expensive part of"},{"from":3179.03,"to":3183.6,"location":2,"content":"having a neural machine translation or neural language model system, right?"},{"from":3183.6,"to":3187.38,"location":2,"content":"The LSTM might look complicated and hard to understand, but you know,"},{"from":3187.38,"to":3191.94,"location":2,"content":"it's relatively small vectors that you multiply or dot-product once,"},{"from":3191.94,"to":3196.02,"location":2,"content":"and it's not that much work whereas if you have a huge number of words,"},{"from":3196.02,"to":3197.43,"location":2,"content":"this is a huge amount of work."},{"from":3197.43,"to":3202.56,"location":2,"content":"So, just for instance sort of for the pion- pioneering sequence to sequence,"},{"from":3202.56,"to":3206.36,"location":2,"content":"um, neural machine translation system that Google first did,"},{"from":3206.36,"to":3210.84,"location":2,"content":"they ran it on an eight GPU machine because they have lots of GPUs but"},{"from":3210.84,"to":3216.07,"location":2,"content":"the way they set it up to maximize performance was of those eight GPUs,"},{"from":3216.07,"to":3218.49,"location":2,"content":"three of them were running"},{"from":3218.49,"to":3224.07,"location":2,"content":"a deep multi-layer neural sequence model and the other five GPUs,"},{"from":3224.07,"to":3227.97,"location":2,"content":"the only thing that they were doing 
was calculating softmaxes because that's"},{"from":3227.97,"to":3232.77,"location":2,"content":"actually the bulk of the computation that you need to be able to do."},{"from":3232.77,"to":3236.85,"location":2,"content":"Um, so the simplest way to make this, um,"},{"from":3236.85,"to":3241.56,"location":2,"content":"computation not completely excessive is to say,"},{"from":3241.56,"to":3243.93,"location":2,"content":"\"Hey, I'll just limit the vocabulary.\""},{"from":3243.93,"to":3247.36,"location":2,"content":"Yeah I know that you can make"},{"from":3247.36,"to":3253.23,"location":2,"content":"a million different words in English and if you look at Spanish inflections of verbs,"},{"from":3253.23,"to":3256.24,"location":2,"content":"there are a lot of them and there's gonna be a huge number of words, um,"},{"from":3256.24,"to":3260.22,"location":2,"content":"but maybe I can just make do with a modest vocabulary and it'll be near enough."},{"from":3260.22,"to":3262.3,"location":2,"content":"Surely 50,000 common words,"},{"from":3262.3,"to":3265.24,"location":2,"content":"I can cover a lot of stuff and so,"},{"from":3265.24,"to":3269.58,"location":2,"content":"that was sort of the starting-off point of neural machine translation that you,"},{"from":3269.58,"to":3274.51,"location":2,"content":"people used a modest vocabulary like around 50,000 words."},{"from":3274.51,"to":3276.91,"location":2,"content":"And well, if you do that, um,"},{"from":3276.91,"to":3280.98,"location":2,"content":"well, then what happens is you have UNKs."},{"from":3280.98,"to":3283.26,"location":2,"content":"So UNK means this is an unknown word"},{"from":3283.26,"to":3287.32,"location":2,"content":"that's not in my vocabulary and so there are two kinds of UNKs,"},{"from":3287.32,"to":3291.32,"location":2,"content":"there can be UNKs in the source language and you know,"},{"from":3291.32,"to":3295.71,"location":2,"content":"they're sort of optional because, you know,"},{"from":3295.71,"to":3299.47,"location":2,"content":"it's not actually a problem having a large source language vocabulary,"},{"from":3299.47,"to":3302.07,"location":2,"content":"but the fact of the matter is if you've sort of trained"},{"from":3302.07,"to":3304.62,"location":2,"content":"a model on a certain amount of data,"},{"from":3304.62,"to":3306.72,"location":2,"content":"there are some words you aren't going to have seen,"},{"from":3306.72,"to":3309,"location":2,"content":"so you are going to have words that you just didn't"},{"from":3309,"to":3311.52,"location":2,"content":"see in your training data and you won't have"},{"from":3311.52,"to":3314.43,"location":2,"content":"any pre-trained or trained word vector"},{"from":3314.43,"to":3317.76,"location":2,"content":"for them and you can deal with that by either just treating them as UNK,"},{"from":3317.76,"to":3320.59,"location":2,"content":"so giving them a new word vector when you encounter them."},{"from":3320.59,"to":3324.57,"location":2,"content":"But the tricky part is on the translation that you're wanting to"},{"from":3324.57,"to":3328.72,"location":2,"content":"produce these rare words but they're not in your output vocabulary,"},{"from":3328.72,"to":3335.55,"location":2,"content":"so your system is producing UNK, UNK to UNK, which is not a very good translation really."},{"from":3335.55,"to":3339.72,"location":2,"content":"Um, yeah, and so that was sort of what the first,"},{"from":3339.72,"to":3344.22,"location":2,"content":"um, machine, neural machine translation systems, um, 
did."},{"from":3344.22,"to":3346.26,"location":2,"content":"And so, you know, obviously that's not"},{"from":3346.26,"to":3351.55,"location":2,"content":"a very satisfactory state of affairs and so there's been a whole bunch of work,"},{"from":3351.55,"to":3353.22,"location":2,"content":"um, as to how to deal with this,"},{"from":3353.22,"to":3360.47,"location":2,"content":"so you can use methods that allow you to deal with a larger output vocabulary,"},{"from":3360.47,"to":3363.78,"location":2,"content":"um, without the computation being excessive."},{"from":3363.78,"to":3367.78,"location":2,"content":"So one method of doing that is to have what's called a hierarchical softmax,"},{"from":3367.78,"to":3371.51,"location":2,"content":"so that rather than just having a huge matrix of words,"},{"from":3371.51,"to":3374.91,"location":2,"content":"you sort of have a tree structure in your vocabulary"},{"from":3374.91,"to":3378.48,"location":2,"content":"so you can do calculations with hierarchical,"},{"from":3378.48,"to":3382.82,"location":2,"content":"um, multiple small softmaxes and you can do that more quickly."},{"from":3382.82,"to":3385.62,"location":2,"content":"Um, I'm not gonna go through all these exam,"},{"from":3385.62,"to":3387.27,"location":2,"content":"all these things in detail now,"},{"from":3387.27,"to":3391.57,"location":2,"content":"I'm just sort of very quickly mentioning them and if anyone's interested, they can look."},{"from":3391.57,"to":3394.83,"location":2,"content":"People have used the noise-contrastive estimation idea that we"},{"from":3394.83,"to":3398.24,"location":2,"content":"saw with Word2vec in this context as well."},{"from":3398.24,"to":3402.66,"location":2,"content":"So this is a way to get much faster training which is important,"},{"from":3402.66,"to":3405.32,"location":2,"content":"it's not really a way to solve, um,"},{"from":3405.32,"to":3407.79,"location":2,"content":"speed at translation time but, you know,"},{"from":3407.79,"to":3410.58,"location":2,"content":"if this means you can train your system in six hours instead of"},{"from":3410.58,"to":3415.16,"location":2,"content":"six days that's a big win and so that's a good technique to use."},{"from":3415.16,"to":3420.33,"location":2,"content":"Um, people have done much smarter things, so really, um,"},{"from":3420.33,"to":3423.75,"location":2,"content":"the large vocabulary problem is basically solved"},{"from":3423.75,"to":3427.65,"location":2,"content":"now and so the kind of things that you can do is you can produce"},{"from":3427.65,"to":3431.97,"location":2,"content":"subsets of your vocabulary and train on particular subsets of"},{"from":3431.97,"to":3436.38,"location":2,"content":"vocabulary at a time and then when you're testing,"},{"from":3436.38,"to":3440.82,"location":2,"content":"you adaptively choose kind of a likely list of words that might"},{"from":3440.82,"to":3445.29,"location":2,"content":"appear in the translation of particular sentences or passages and then"},{"from":3445.29,"to":3448.2,"location":2,"content":"you can effectively work with sort of an appropriate subset of"},{"from":3448.2,"to":3452.85,"location":2,"content":"a vocabulary and that's sort of an efficient technique by which you can"},{"from":3452.85,"to":3456.33,"location":2,"content":"deal with an unlimited vocabulary but only be using"},{"from":3456.33,"to":3461.95,"location":2,"content":"a moderate sized softmax for any particular paragraph that you're 
translating,"},{"from":3461.95,"to":3464.79,"location":2,"content":"there's a paper that talks about that method."},{"from":3464.79,"to":3469.43,"location":2,"content":"Um, another idea is you can use attention when you do translation,"},{"from":3469.43,"to":3471.93,"location":2,"content":"the idea talked about at the end of last time."},{"from":3471.93,"to":3475.09,"location":2,"content":"So if you have attention, that sort of means that you can,"},{"from":3475.09,"to":3477.66,"location":2,"content":"you're pointing somewhere in the source and you"},{"from":3477.66,"to":3480.66,"location":2,"content":"know what you're translating at any point in time."},{"from":3480.66,"to":3485.07,"location":2,"content":"So, if that word is a rare word that's not in your vocabulary,"},{"from":3485.07,"to":3487.56,"location":2,"content":"there are things that you could do to deal with that."},{"from":3487.56,"to":3489.93,"location":2,"content":"I mean, firstly, if it's a rare word,"},{"from":3489.93,"to":3493.14,"location":2,"content":"its translation is much more likely to be constant,"},{"from":3493.14,"to":3497.47,"location":2,"content":"so you might just look it up in a dictionary or word list, um, and,"},{"from":3497.47,"to":3499.83,"location":2,"content":"um, stick in its translation,"},{"from":3499.83,"to":3502.49,"location":2,"content":"sometimes it's appropriate to do other things."},{"from":3502.49,"to":3504.45,"location":2,"content":"I mean, turns out that, you know,"},{"from":3504.45,"to":3509.68,"location":2,"content":"quite a lot of things that unknown words turn out to be other things like, you know,"},{"from":3509.68,"to":3512.97,"location":2,"content":"hexadecimal numbers, or FedEx tracking IDs,"},{"from":3512.97,"to":3515.68,"location":2,"content":"or GitHub shards, or things like that."},{"from":3515.68,"to":3517.02,"location":2,"content":"So for a lot of things like that,"},{"from":3517.02,"to":3519.39,"location":2,"content":"the right thing to do is just to copy them across."},{"from":3519.39,"to":3522.66,"location":2,"content":"And so, another thing that people have looked at is copying models,"},{"from":3522.66,"to":3525.22,"location":2,"content":"um, in machine translation."},{"from":3525.22,"to":3528.22,"location":2,"content":"Okay, um, there are more ideas that you can,"},{"from":3528.22,"to":3530.84,"location":2,"content":"we can get into to solve this and actually, um,"},{"from":3530.84,"to":3532.79,"location":2,"content":"next week we're gonna start dealing with"},{"from":3532.79,"to":3535.09,"location":2,"content":"some of the other ways that you could solve this, um,"},{"from":3535.09,"to":3539.41,"location":2,"content":"but I hope there to have given you sort of a sense of,"},{"from":3539.41,"to":3541.8,"location":2,"content":"um, sort of what these UNKs are about,"},{"from":3541.8,"to":3543.64,"location":2,"content":"why you see them and, uh,"},{"from":3543.64,"to":3546.14,"location":2,"content":"that there are sort of some ways that you might"},{"from":3546.14,"to":3548.6,"location":2,"content":"deal with them but you're not expected to be doing that,"},{"from":3548.6,"to":3550.9,"location":2,"content":"um, for assignment four."},{"from":3550.9,"to":3556.68,"location":2,"content":"Okay, then I just wanted to give a teeny bit more on evaluation."},{"from":3556.68,"to":3559.51,"location":2,"content":"Um, so Abby said a little bit about"},{"from":3559.51,"to":3563.37,"location":2,"content":"evaluation with blue and that then comes up in the 
assignment,"},{"from":3563.37,"to":3566.13,"location":2,"content":"so I just thought I'd give you a little bit more context on"},{"from":3566.13,"to":3569.09,"location":2,"content":"that since they're being quite a few questions about it."},{"from":3569.09,"to":3573.05,"location":2,"content":"So, um, so the general context here is, you know,"},{"from":3573.05,"to":3578.89,"location":2,"content":"how do you evaluate machine translation quality and sort of to this day,"},{"from":3578.89,"to":3583.98,"location":2,"content":"if you wanted to do a first rate bang up evaluation of machine translation quality,"},{"from":3583.98,"to":3587.67,"location":2,"content":"the way you do it is you get human beings to assess quality,"},{"from":3587.67,"to":3590.84,"location":2,"content":"you take translations and you send them to"},{"from":3590.84,"to":3594.87,"location":2,"content":"human beings with good bilingual skills and get them to score things."},{"from":3594.87,"to":3597.26,"location":2,"content":"And there are two ways that are commonly used."},{"from":3597.26,"to":3599.55,"location":2,"content":"One is sort of rating on"},{"from":3599.55,"to":3604.29,"location":2,"content":"Likert scales for things like adequacy and fluency of translations,"},{"from":3604.29,"to":3609.03,"location":2,"content":"um, but another way that often works better is asking for comparative judgments."},{"from":3609.03,"to":3614.03,"location":2,"content":"So here are two translations of this sentence which is better, um."},{"from":3614.03,"to":3616.94,"location":2,"content":"And so that's, you know,"},{"from":3616.94,"to":3620.07,"location":2,"content":"sort of still our gold standard of translation."},{"from":3620.07,"to":3622.88,"location":2,"content":"Um, another way you can evaluate translation is"},{"from":3622.88,"to":3625.93,"location":2,"content":"use your translations in the downstream task."},{"from":3625.93,"to":3628.64,"location":2,"content":"So, you could say \"I'm gonna build"},{"from":3628.64,"to":3633.5,"location":2,"content":"a cross-lingual question answering system and inside that system I'm,"},{"from":3633.5,"to":3635.78,"location":2,"content":"gonna use machine translation."},{"from":3635.78,"to":3637.97,"location":2,"content":"I'm gonna translate the questions um,"},{"from":3637.97,"to":3640.63,"location":2,"content":"and then try and match them against the documents."},{"from":3640.63,"to":3645.83,"location":2,"content":"Um, and then my score will be how good my question answering system is,"},{"from":3645.83,"to":3648.8,"location":2,"content":"and so the machine translation system is better"},{"from":3648.8,"to":3652.19,"location":2,"content":"if my question-answering score um, goes up.\""},{"from":3652.19,"to":3657.24,"location":2,"content":"I mean, that's kind of a nice way to do things because you're kinda then taking them in, run around needing,"},{"from":3657.24,"to":3660.11,"location":2,"content":"needing human beings, and yet you do have"},{"from":3660.11,"to":3663.49,"location":2,"content":"a clear numerical measure that's coming out the back end."},{"from":3663.49,"to":3666.55,"location":2,"content":"But it sort of has some catches because, you know,"},{"from":3666.55,"to":3669.98,"location":2,"content":"often there will be a fairly indirect connection between"},{"from":3669.98,"to":3674.09,"location":2,"content":"your end task and the quality of the machine translation,"},{"from":3674.09,"to":3676.64,"location":2,"content":"and it might turn out that there certain aspects 
of"},{"from":3676.64,"to":3680.51,"location":2,"content":"the machine translation like whether you get agreement endings,"},{"from":3680.51,"to":3682.97,"location":2,"content":"right on nouns and verbs or something."},{"from":3682.97,"to":3686.12,"location":2,"content":"They are actually just irrelevant to your performance in the task and say you're"},{"from":3686.12,"to":3689.64,"location":2,"content":"not assessing all aspects of um, quality."},{"from":3689.64,"to":3692.81,"location":2,"content":"Um, and so then the third way to do it is to come up with"},{"from":3692.81,"to":3695.84,"location":2,"content":"some way to score the direct tasks."},{"from":3695.84,"to":3700.41,"location":2,"content":"So, here, um, the direct task is machine translation,"},{"from":3700.41,"to":3704.45,"location":2,"content":"and this has been a valuable tool."},{"from":3704.45,"to":3707.3,"location":2,"content":"For, you know, really the last so"},{"from":3707.3,"to":3711.29,"location":2,"content":"25 years when people are doing machine learning models,"},{"from":3711.29,"to":3715.1,"location":2,"content":"because as soon as you have an automatic way to score things,"},{"from":3715.1,"to":3722.06,"location":2,"content":"you can then run automated experiments to say \"Let me try out these 50 different options."},{"from":3722.06,"to":3727.25,"location":2,"content":"Let me start varying these hyper-parameters and work out which way to do things is best.\""},{"from":3727.25,"to":3730.76,"location":2,"content":"And that importance has only grown in the deep learning era,"},{"from":3730.76,"to":3735.2,"location":2,"content":"when all the time what we want you to do is as Abby discussed, um,"},{"from":3735.2,"to":3738.14,"location":2,"content":"build end-to-end systems and then back"},{"from":3738.14,"to":3741.2,"location":2,"content":"propagate throughout the entire system to improve them,"},{"from":3741.2,"to":3742.91,"location":2,"content":"and we're doing that based on having"},{"from":3742.91,"to":3746.47,"location":2,"content":"some objective measure which is our automatic metric."},{"from":3746.47,"to":3749.41,"location":2,"content":"And so, that led into the development of"},{"from":3749.41,"to":3753.36,"location":2,"content":"automatic metrics to try and assess machine translation quality,"},{"from":3753.36,"to":3758.14,"location":2,"content":"and the most famous and still most used one is this one called BLEU."},{"from":3758.14,"to":3761.38,"location":2,"content":"And so, as Abby briefly mentioned,"},{"from":3761.38,"to":3764.9,"location":2,"content":"we have a reference translation done by human beings."},{"from":3764.9,"to":3769.79,"location":2,"content":"At some time a human being has to translate each piece of source material once,"},{"from":3769.79,"to":3773.18,"location":2,"content":"but then you take a machine translation and you"},{"from":3773.18,"to":3777.32,"location":2,"content":"score it based on the extent to which there"},{"from":3777.32,"to":3780.92,"location":2,"content":"are one or more word sequences that appear in"},{"from":3780.92,"to":3786.07,"location":2,"content":"the reference translation and also appear in the machine translation."},{"from":3786.07,"to":3792.53,"location":2,"content":"And so you are working out n-gram preci-precision scores for different values of n. 
So,"},{"from":3792.53,"to":3796.01,"location":2,"content":"the standard way of doing it is you do it for one grams,"},{"from":3796.01,"to":3798.56,"location":2,"content":"bigrams, trigrams, and four-grams."},{"from":3798.56,"to":3801.39,"location":2,"content":"So, word sequences of size one to four,"},{"from":3801.39,"to":3806.27,"location":2,"content":"and you try and find for ones of those in the machine translation,"},{"from":3806.27,"to":3811.76,"location":2,"content":"whether they also appear in the reference translation,"},{"from":3811.76,"to":3814.41,"location":2,"content":"and there are two tricks at work here."},{"from":3814.41,"to":3819.51,"location":2,"content":"Um, one trick is you have to do a kind of a bipartite matching um,"},{"from":3819.51,"to":3822.66,"location":2,"content":"because it just can't be that um,"},{"from":3822.66,"to":3825.18,"location":2,"content":"there's a word um,"},{"from":3825.18,"to":3829.55,"location":2,"content":"in the, in the reference translation somewhere."},{"from":3829.55,"to":3831.23,"location":2,"content":"Um, [NOISE] I don't know if there's."},{"from":3831.23,"to":3833.51,"location":2,"content":"I've got a good example here [NOISE]."},{"from":3833.51,"to":3837.77,"location":2,"content":"Um, maybe I can only do a silly example,"},{"from":3837.77,"to":3839.55,"location":2,"content":"but I'll do a silly example."},{"from":3839.55,"to":3843.32,"location":2,"content":"Um, that it's- it doesn't seem like you wanna say \"Okay."},{"from":3843.32,"to":3845.42,"location":2,"content":"Because there's a \"the\" in the reference,"},{"from":3845.42,"to":3848.97,"location":2,"content":"that means that this \"the\" is right and this \"the\" is right,"},{"from":3848.97,"to":3852.8,"location":2,"content":"and this \"the\" is right and every other \"the\" is also right.\""},{"from":3852.8,"to":3854.49,"location":2,"content":"That sort of seems unfair."},{"from":3854.49,"to":3860.82,"location":2,"content":"So, you're only allowed to use each thing in the reference once in matching n-grams,"},{"from":3860.82,"to":3864.14,"location":2,"content":"but you are allowed to use it multiple times for different order n-grams."},{"from":3864.14,"to":3866.57,"location":2,"content":"So, you can use it both in the uh unigram,"},{"from":3866.57,"to":3868.99,"location":2,"content":"bigram, trigram and 4-gram."},{"from":3868.99,"to":3872.27,"location":2,"content":"The other idea is that although you're measuring"},{"from":3872.27,"to":3877.2,"location":2,"content":"the precision of n-grams that are in the machine translation,"},{"from":3877.2,"to":3879.86,"location":2,"content":"you wouldn't want people to be able to cheat by"},{"from":3879.86,"to":3882.71,"location":2,"content":"putting almost nothing into the machine translation."},{"from":3882.71,"to":3887.45,"location":2,"content":"So, you might wanna game it by no matter what the source document is."},{"from":3887.45,"to":3889.52,"location":2,"content":"If the target language is English,"},{"from":3889.52,"to":3891.11,"location":2,"content":"you could just um say,"},{"from":3891.11,"to":3892.79,"location":2,"content":"\"My translation is the,"},{"from":3892.79,"to":3895.49,"location":2,"content":"because I'm pretty sure that will be in"},{"from":3895.49,"to":3899.32,"location":2,"content":"the reference translation somewhere and I'll get 0.3 unigram,"},{"from":3899.32,"to":3902.84,"location":2,"content":"and that's not great but I'll get something for that and I am 
done.\""},{"from":3902.84,"to":3904.88,"location":2,"content":"And so you wouldn't want that and so,"},{"from":3904.88,"to":3908.87,"location":2,"content":"you're then being penalized by something called the brevity penalty if"},{"from":3908.87,"to":3914.04,"location":2,"content":"your translation is shorter than the reference translation,"},{"from":3914.04,"to":3918.37,"location":2,"content":"and so this BLEU metric is um forming"},{"from":3918.37,"to":3924.28,"location":2,"content":"a geometric average of n-gram precision up to some n. Normally,"},{"from":3924.28,"to":3925.3,"location":2,"content":"it's sort of up to four,"},{"from":3925.3,"to":3926.49,"location":2,"content":"is how it's done."},{"from":3926.49,"to":3929,"location":2,"content":"Where it's a weighted geometric average,"},{"from":3929,"to":3932.41,"location":2,"content":"where you're putting weights on the different n-grams."},{"from":3932.41,"to":3935.87,"location":2,"content":"Um, for the assignment, we're only using unigrams and bigrams."},{"from":3935.87,"to":3939.45,"location":2,"content":"So, you could say that means we're putting a weight of zero on um,"},{"from":3939.45,"to":3942.65,"location":2,"content":"the trigrams and 4-grams."},{"from":3942.65,"to":3946.24,"location":2,"content":"Okay. Um, and so that's basically what we're doing."},{"from":3946.24,"to":3949.28,"location":2,"content":"I-I've just mentioned um couple of other things."},{"from":3949.28,"to":3951.84,"location":2,"content":"You might think that this is kind of random,"},{"from":3951.84,"to":3953.78,"location":2,"content":"and so people have um,"},{"from":3953.78,"to":3957.53,"location":2,"content":"used this idea of rather than just having one reference translation,"},{"from":3957.53,"to":3960.08,"location":2,"content":"we could have multiple reference translations,"},{"from":3960.08,"to":3962.72,"location":2,"content":"because that way we can allow for there being"},{"from":3962.72,"to":3965.54,"location":2,"content":"variation and good ways of translating things,"},{"from":3965.54,"to":3969.74,"location":2,"content":"because in language there's always lots of good ways that you can translate one sentence."},{"from":3969.74,"to":3972.43,"location":2,"content":"Um, people have done that quite a bit,"},{"from":3972.43,"to":3976.82,"location":2,"content":"but people have also decided that even if you have one translation,"},{"from":3976.82,"to":3980.99,"location":2,"content":"provided it's independent and on a kind of statistical basis,"},{"from":3980.99,"to":3985.34,"location":2,"content":"you're still more likely to match it if your translation is a good translation."},{"from":3985.34,"to":3987.56,"location":2,"content":"So, it's probably okay."},{"from":3987.56,"to":3992.93,"location":2,"content":"Um, so when BLEU was originally um, introduced,"},{"from":3992.93,"to":3997.37,"location":2,"content":"BLEU seemed marvelous and people drew graphs like this showing how"},{"from":3997.37,"to":4001.91,"location":2,"content":"closely BLEU scores correlated um,"},{"from":4001.91,"to":4005.61,"location":2,"content":"with human judgments of translation quality."},{"from":4005.61,"to":4008.71,"location":2,"content":"However, um, like a lot of things in life,"},{"from":4008.71,"to":4010.9,"location":2,"content":"there are a lot of things that are great measures,"},{"from":4010.9,"to":4013.87,"location":2,"content":"providing people aren't directly trying to optimize it,"},{"from":4013.87,"to":4016.72,"location":2,"content":"and so what's happened since then 
um,"},{"from":4016.72,"to":4020.62,"location":2,"content":"is that everybody has been trying to optimize BLEU scores,"},{"from":4020.62,"to":4026.38,"location":2,"content":"and the result of that is that BLEU scores have gone up massively but the correlation"},{"from":4026.38,"to":4028.54,"location":2,"content":"between BLEU scores and human judgments of"},{"from":4028.54,"to":4032.18,"location":2,"content":"translation in quality have gone down massively,"},{"from":4032.18,"to":4036.55,"location":2,"content":"and so we're in this current state that um, the BLEU scores,"},{"from":4036.55,"to":4042.64,"location":2,"content":"the machines, um are pretty near the scores of human translations."},{"from":4042.64,"to":4044.8,"location":2,"content":"So, you know, according to BLEU scores,"},{"from":4044.8,"to":4048.57,"location":2,"content":"we're producing almost human quality machine translation,"},{"from":4048.57,"to":4052.69,"location":2,"content":"but if you actually look at the real quality of the translations,"},{"from":4052.69,"to":4054.1,"location":2,"content":"they're still well behind"},{"from":4054.1,"to":4059.56,"location":2,"content":"human beings um and because you could say the metric is being gamed."},{"from":4059.56,"to":4065.95,"location":2,"content":"Okay. I'll hope those things help for giving more sense um for assignment four."},{"from":4065.95,"to":4068.26,"location":2,"content":"Um, so now for the last um,"},{"from":4068.26,"to":4070.14,"location":2,"content":"about 12 minutes, um,"},{"from":4070.14,"to":4071.5,"location":2,"content":"I just now wanna um,"},{"from":4071.5,"to":4078.16,"location":2,"content":"return to um final projects and say a little bit more um about final projects."},{"from":4078.16,"to":4081.2,"location":2,"content":"Um so, there many,"},{"from":4081.2,"to":4083.71,"location":2,"content":"many different ways you can do final projects,"},{"from":4083.71,"to":4086.29,"location":2,"content":"but just to sort of go through the steps."},{"from":4086.29,"to":4089.17,"location":2,"content":"I mean, you know, for a simple straightforward project,"},{"from":4089.17,"to":4091.51,"location":2,"content":"this is kind of the steps that you want to go through."},{"from":4091.51,"to":4093.3,"location":2,"content":"So, you choose some tasks,"},{"from":4093.3,"to":4097.38,"location":2,"content":"summarizing text um, producing a shorter version of a text."},{"from":4097.38,"to":4100.18,"location":2,"content":"You work out some dataset that you can use."},{"from":4100.18,"to":4102.97,"location":2,"content":"So, this is an example of the kind of tasks that there"},{"from":4102.97,"to":4106.02,"location":2,"content":"are academic data sets for that other people have used,"},{"from":4106.02,"to":4108.25,"location":2,"content":"and so you could just use one of those,"},{"from":4108.25,"to":4111.73,"location":2,"content":"and that's it, you're already done or you could think \"Oh no!"},{"from":4111.73,"to":4113.35,"location":2,"content":"I'm much too creative for that."},{"from":4113.35,"to":4118.78,"location":2,"content":"I'm gonna come up with my own dataset [NOISE] um and get some online source and do it.\""},{"from":4118.78,"to":4120.37,"location":2,"content":"Um, and you know,"},{"from":4120.37,"to":4125.8,"location":2,"content":"summaries of the kind of things you can find online and produce your own dataset."},{"from":4125.8,"to":4128.81,"location":2,"content":"Um [NOISE] I wanna say a bit in,"},{"from":4128.81,"to":4130.4,"location":2,"content":"in just after 
this,"},{"from":4130.4,"to":4133.38,"location":2,"content":"about separating off um data sets for"},{"from":4133.38,"to":4136.86,"location":2,"content":"training and test data, so I'll delay that, but that's important."},{"from":4136.86,"to":4141.44,"location":2,"content":"Then, you want to work out a way to evaluate your um,"},{"from":4141.44,"to":4144.94,"location":2,"content":"system including an automatic evaluation."},{"from":4144.94,"to":4146.53,"location":2,"content":"Um, normally, for summarization,"},{"from":4146.53,"to":4148.51,"location":2,"content":"people use a slightly different metric called"},{"from":4148.51,"to":4152.34,"location":2,"content":"ROUGE but it's sort of related to BLEU hence its name."},{"from":4152.34,"to":4154.96,"location":2,"content":"Um, it's the same story that it sort of works,"},{"from":4154.96,"to":4157.16,"location":2,"content":"but human evaluation is much better."},{"from":4157.16,"to":4161.34,"location":2,"content":"Um, but you need- so you need to work out some metrics you can use for the project."},{"from":4161.34,"to":4165.53,"location":2,"content":"Um, the next thing you should do is establish a baseline."},{"from":4165.53,"to":4169.56,"location":2,"content":"So, if it's a well-worked on problem there might already be one,"},{"from":4169.56,"to":4173.17,"location":2,"content":"but it's not bad to try and calculate one for yourself anyway,"},{"from":4173.17,"to":4176.17,"location":2,"content":"and in particular what you should first have is"},{"from":4176.17,"to":4179.44,"location":2,"content":"a very simple model and see how well it works."},{"from":4179.44,"to":4182.15,"location":2,"content":"So, for human language material,"},{"from":4182.15,"to":4185.02,"location":2,"content":"often doing things like bag of words models,"},{"from":4185.02,"to":4188.05,"location":2,"content":"whether they're just a simple classifier over"},{"from":4188.05,"to":4192.54,"location":2,"content":"words or a new bag of words, averaging word vectors."},{"from":4192.54,"to":4196.99,"location":2,"content":"It's just useful to try that on the task and see how it works,"},{"from":4196.99,"to":4199.68,"location":2,"content":"see what kinds of things it already gets right,"},{"from":4199.68,"to":4201.82,"location":2,"content":"what kind of things it gets wrong."},{"from":4201.82,"to":4203.88,"location":2,"content":"You know, one possibility is you will find that"},{"from":4203.88,"to":4207.14,"location":2,"content":"a very simple model already does great on your task."},{"from":4207.14,"to":4208.57,"location":2,"content":"If that's the case, um,"},{"from":4208.57,"to":4210.27,"location":2,"content":"you have too easy a task,"},{"from":4210.27,"to":4216.46,"location":2,"content":"and you probably need to find a task that's more challenging to work on. 
Um, yes."},{"from":4216.46,"to":4220.09,"location":2,"content":"So after that, you'll then sort of think about what could be a good kind"},{"from":4220.09,"to":4223.93,"location":2,"content":"of neural network model that might do well, implement it,"},{"from":4223.93,"to":4228.64,"location":2,"content":"test it um, see what kind of errors that makes and you know,"},{"from":4228.64,"to":4230.55,"location":2,"content":"that's sort of if you've gotten that far,"},{"from":4230.55,"to":4233.6,"location":2,"content":"you're sort of in the right space for a class project."},{"from":4233.6,"to":4237.4,"location":2,"content":"But, you know, it's sort of hoped that you could do more than that."},{"from":4237.4,"to":4239.94,"location":2,"content":"But after you've seen the errors from the first version,"},{"from":4239.94,"to":4243.86,"location":2,"content":"you could think about how to make it better and come up with a better project,"},{"from":4243.86,"to":4246.06,"location":2,"content":"and so I would encourage everyone,"},{"from":4246.06,"to":4248.68,"location":2,"content":"you know, you really do want to look at the data, right?"},{"from":4248.68,"to":4254.62,"location":2,"content":"You don't just wanna be sort of having things and files and run and say \"Okay, 0.71."},{"from":4254.62,"to":4257.37,"location":2,"content":"Let me make some random change 0.70."},{"from":4257.37,"to":4260.23,"location":2,"content":"Oh, that's not a good one,\" repeat over."},{"from":4260.23,"to":4264.33,"location":2,"content":"You actually want to be sort of looking at your dataset in any way you can."},{"from":4264.33,"to":4266.76,"location":2,"content":"It's good to visualize the dataset to understand what's"},{"from":4266.76,"to":4269.5,"location":2,"content":"important in it that you might be able to take advantage of,"},{"from":4269.5,"to":4271.11,"location":2,"content":"you want to be able to look at what kind of"},{"from":4271.11,"to":4272.97,"location":2,"content":"errors are being made because that might give you"},{"from":4272.97,"to":4276.86,"location":2,"content":"ideas of how you could put more stuff into the model that would do better."},{"from":4276.86,"to":4280.47,"location":2,"content":"Um, you might wanna do some graphing of the effect of hyper-parameters,"},{"from":4280.47,"to":4282.46,"location":2,"content":"so you can kind of understand that better."},{"from":4282.46,"to":4284.37,"location":2,"content":"And so, the hope is that you will try out"},{"from":4284.37,"to":4287.25,"location":2,"content":"some other kinds of models and make things better."},{"from":4287.25,"to":4289.52,"location":2,"content":"And sort of one of the goals here is,"},{"from":4289.52,"to":4294.09,"location":2,"content":"it's good if you've sort of got a well-setup experimental setup,"},{"from":4294.09,"to":4297.3,"location":2,"content":"so you can easily turn around experiments because then you're just more"},{"from":4297.3,"to":4301.85,"location":2,"content":"likely to be able to try several things in the time available."},{"from":4301.85,"to":4305.4,"location":2,"content":"Okay. 
Um, a couple of other things I wanted to mention."},{"from":4305.4,"to":4309.61,"location":2,"content":"Um, one is sort of different amounts of data."},{"from":4309.61,"to":4313.51,"location":2,"content":"So, it's really, really important for all the stuff that we do,"},{"from":4313.51,"to":4316.87,"location":2,"content":"that we have different sets of data."},{"from":4316.87,"to":4318.64,"location":2,"content":"So, we have training data,"},{"from":4318.64,"to":4320.43,"location":2,"content":"we have dev test data,"},{"from":4320.43,"to":4323.13,"location":2,"content":"we have test data at least,"},{"from":4323.13,"to":4325.54,"location":2,"content":"and sometimes it's useful to have even,"},{"from":4325.54,"to":4328.24,"location":2,"content":"um, more data available."},{"from":4328.24,"to":4334.08,"location":2,"content":"So, for many of the public datasets, they're already split into different subsets like this,"},{"from":4334.08,"to":4335.1,"location":2,"content":"but there are some that aren't."},{"from":4335.1,"to":4337.28,"location":2,"content":"There are some that might only have a training set,"},{"from":4337.28,"to":4339,"location":2,"content":"and a test set."},{"from":4339,"to":4341.26,"location":2,"content":"And what you don't want to do is think,"},{"from":4341.26,"to":4343.5,"location":2,"content":"\"Oh, there's only a training set and a test set."},{"from":4343.5,"to":4346.18,"location":2,"content":"Therefore I'll just run every time on the test set.\""},{"from":4346.18,"to":4349.89,"location":2,"content":"That- that's a really invalid way to go about your research."},{"from":4349.89,"to":4350.99,"location":2,"content":"So, if there aren't"},{"from":4350.99,"to":4354.39,"location":2,"content":"dev sets available or you need to do some more tuning,"},{"from":4354.39,"to":4356.38,"location":2,"content":"and you need some separate tuning data,"},{"from":4356.38,"to":4359.46,"location":2,"content":"you sort of have to, um,"},{"from":4359.46,"to":4363.4,"location":2,"content":"make it for yourself by splitting off some of the training data,"},{"from":4363.4,"to":4367.77,"location":2,"content":"and not using it for the basic training and using it for tuning,"},{"from":4367.77,"to":4370.44,"location":2,"content":"and fo- as dev data."},{"from":4370.44,"to":4372.53,"location":2,"content":"Um, yes."},{"from":4372.53,"to":4376.49,"location":2,"content":"So, to go on about that, um, more."},{"from":4376.49,"to":4382.68,"location":2,"content":"So, the basic issue is this issue of fitting and overfitting to particular datasets."},{"from":4382.68,"to":4385.61,"location":2,"content":"So, when we train a model, um,"},{"from":4385.61,"to":4387.56,"location":2,"content":"on some training data,"},{"from":4387.56,"to":4390.46,"location":2,"content":"we train it and the error rate goes down."},{"from":4390.46,"to":4395.9,"location":2,"content":"And over time, we gradually overfit to the training data because we sort of"},{"from":4395.9,"to":4401.82,"location":2,"content":"pick up in our neural network f- facts about the particular training data items,"},{"from":4401.82,"to":4404.03,"location":2,"content":"and we just sort of start to learn them."},{"from":4404.03,"to":4405.79,"location":2,"content":"Now in the old days,"},{"from":4405.79,"to":4410.06,"location":2,"content":"the fact that you overfit to the training data was seen as evil."},{"from":4410.06,"to":4412.13,"location":2,"content":"In modern neural network thinking,"},{"from":4412.13,"to":4415.63,"location":2,"content":"we don't think 
it is evil that we overfit to the training data"},{"from":4415.63,"to":4420.11,"location":2,"content":"because all neural nets that are any good overfit to the training data,"},{"from":4420.11,"to":4422.88,"location":2,"content":"and we would be very sad if they didn't."},{"from":4422.88,"to":4424.66,"location":2,"content":"I'll come back to that in a moment."},{"from":4424.66,"to":4427.56,"location":2,"content":"But nevertheless, they're overfitting like crazy."},{"from":4427.56,"to":4432.92,"location":2,"content":"So, what we- but what we want to build is something that generalizes well."},{"from":4432.92,"to":4435.09,"location":2,"content":"So, we have to have some separate data,"},{"from":4435.09,"to":4436.81,"location":2,"content":"that's our validation data,"},{"from":4436.81,"to":4441.03,"location":2,"content":"and look at what performance looks like on the validation data."},{"from":4441.03,"to":4444.88,"location":2,"content":"And commonly we find that training up until some point"},{"from":4444.88,"to":4448.5,"location":2,"content":"improves our performance on separate validation data,"},{"from":4448.5,"to":4451.05,"location":2,"content":"and then we start to overfit to"},{"from":4451.05,"to":4455.77,"location":2,"content":"the training data in a way that our validation set performance gets worse."},{"from":4455.77,"to":4457.6,"location":2,"content":"Um, and so, then,"},{"from":4457.6,"to":4461.97,"location":2,"content":"further training on the training data isn't useful because we're starting"},{"from":4461.97,"to":4466.7,"location":2,"content":"to build a model that generalizes worse when run on other data."},{"from":4466.7,"to":4468.81,"location":2,"content":"But there's- the whole point here is,"},{"from":4468.81,"to":4474.84,"location":2,"content":"we can only do this experiment if our validation data is separate from our training data."},{"from":4474.84,"to":4477.91,"location":2,"content":"If it's the same data or if it's overlapping data,"},{"from":4477.91,"to":4479.95,"location":2,"content":"we can't draw this graph."},{"from":4479.95,"to":4482.81,"location":2,"content":"Um, and so, therefore, we can't do valid experiments."},{"from":4482.81,"to":4487.09,"location":2,"content":"Um, now you might think, \"Oh, well,"},{"from":4487.09,"to":4489.04,"location":2,"content":"maybe I can, um,"},{"from":4489.04,"to":4492.18,"location":2,"content":"do this and just use the test set of data.\""},{"from":4492.18,"to":4495.81,"location":2,"content":"Um, but that's also invalid,"},{"from":4495.81,"to":4498.92,"location":2,"content":"and the reason why that's invalid is,"},{"from":4498.92,"to":4500.84,"location":2,"content":"as you do experiments,"},{"from":4500.84,"to":4505.49,"location":2,"content":"you also start slowly overfitting to your development data."},{"from":4505.49,"to":4511.56,"location":2,"content":"So, the standard practice is you do a run and you get a score on the development data."},{"from":4511.56,"to":4513.15,"location":2,"content":"You do a second run."},{"from":4513.15,"to":4515.04,"location":2,"content":"You do worse on the development data,"},{"from":4515.04,"to":4517.77,"location":2,"content":"and so you throw that second model away."},{"from":4517.77,"to":4519.02,"location":2,"content":"You do a third experiment."},{"from":4519.02,"to":4520.95,"location":2,"content":"You do better on the development data,"},{"from":4520.95,"to":4524.9,"location":2,"content":"and so you keep that model and you repeat over 50 
times."},{"from":4524.9,"to":4528.52,"location":2,"content":"And while some of those subsequent models you keep,"},{"from":4528.52,"to":4534.19,"location":2,"content":"are genuinely better because you sort of worked out something good to do."},{"from":4534.19,"to":4538.89,"location":2,"content":"But it turns out that some of those subsequent models only sort of just happened."},{"from":4538.89,"to":4542.98,"location":2,"content":"You just got lucky and they happened to score better on the development data."},{"from":4542.98,"to":4546.9,"location":2,"content":"And so, if you kind of keep repeating that process 60 or 100 times,"},{"from":4546.9,"to":4550.57,"location":2,"content":"you're also gradually [NOISE] overfitting on your development data,"},{"from":4550.57,"to":4553.57,"location":2,"content":"and you get unrealistically good dev scores."},{"from":4553.57,"to":4555.48,"location":2,"content":"And so, that means two things."},{"from":4555.48,"to":4559.82,"location":2,"content":"You know, if you want to be rigorous and do a huge amount of hyper-parameter exploration,"},{"from":4559.82,"to":4562.83,"location":2,"content":"it can be good to have a second development se- test set,"},{"from":4562.83,"to":4565.66,"location":2,"content":"so that you have one, that you haven't overfit as much."},{"from":4565.66,"to":4568.45,"location":2,"content":"And if you want to have valid scores on te-"},{"from":4568.45,"to":4572.6,"location":2,"content":"on as to what is my actual performance on independent data,"},{"from":4572.6,"to":4575.73,"location":2,"content":"it's vital that you have separate test data that you are"},{"from":4575.73,"to":4579.27,"location":2,"content":"not using at all in this process, right?"},{"from":4579.27,"to":4581.4,"location":2,"content":"So, the ideal state is that,"},{"from":4581.4,"to":4584.86,"location":2,"content":"for your real test data, um,"},{"from":4584.86,"to":4589.59,"location":2,"content":"that you never used it at all until you've finished training your data, uh,"},{"from":4589.59,"to":4594.06,"location":2,"content":"training your model, and then you run your final model once on the test data,"},{"from":4594.06,"to":4596.51,"location":2,"content":"and you write up your paper and those are your results."},{"from":4596.51,"to":4599.49,"location":2,"content":"Now, I will be honest and say the world usually isn't"},{"from":4599.49,"to":4602.79,"location":2,"content":"quite that perfect because after you've done that,"},{"from":4602.79,"to":4604.96,"location":2,"content":"you then go to sleep [NOISE] and wake up thinking."},{"from":4604.96,"to":4607.64,"location":2,"content":"\"I've got a fantastic idea of how to make my model better.\""},{"from":4607.64,"to":4609.52,"location":2,"content":"and you run off and implement that,"},{"from":4609.52,"to":4611.7,"location":2,"content":"and it works great on the dev data,"},{"from":4611.7,"to":4615.39,"location":2,"content":"and then for you, run it on the test data again and the numbers go up."},{"from":4615.39,"to":4617.64,"location":2,"content":"Um, sort of everybody does that."},{"from":4617.64,"to":4619.03,"location":2,"content":"Um, and you know,"},{"from":4619.03,"to":4621.3,"location":2,"content":"in modicum it's okay,"},{"from":4621.3,"to":4626.32,"location":2,"content":"you know, if that means you occasionally run on the test data it's not so bad, um,"},{"from":4626.32,"to":4630.55,"location":2,"content":"but you really need to be aware of the slippery slope 
because,"},{"from":4630.55,"to":4633.56,"location":2,"content":"if you then start falling into, \"I've got a new model."},{"from":4633.56,"to":4634.89,"location":2,"content":"Let me try that one on the test data."},{"from":4634.89,"to":4636.93,"location":2,"content":"I've got a new model. Let me try this one on the test data.\""},{"from":4636.93,"to":4640.13,"location":2,"content":"Then you're just sort of overfitting to the test data,"},{"from":4640.13,"to":4643.1,"location":2,"content":"and getting an unrealistically high score."},{"from":4643.1,"to":4647.6,"location":2,"content":"And that's precisely why a lot of the competitions like Kaggle competitions,"},{"from":4647.6,"to":4651.68,"location":2,"content":"have a secret test dataset that you can't run on."},{"from":4651.68,"to":4653.61,"location":2,"content":"So, that they can do a genuine,"},{"from":4653.61,"to":4657.15,"location":2,"content":"independent test on the actual test data."},{"from":4657.15,"to":4662.55,"location":2,"content":"Okay. Um, let's see, um, a couple more minutes."},{"from":4662.55,"to":4666.52,"location":2,"content":"So, yeah, getting your neural network to train."},{"from":4666.52,"to":4669.14,"location":2,"content":"Um, my two messages are, you know,"},{"from":4669.14,"to":4672.43,"location":2,"content":"first of all, you should start with a positive attitude."},{"from":4672.43,"to":4674.56,"location":2,"content":"Neural networks want to learn."},{"from":4674.56,"to":4675.95,"location":2,"content":"If they're not learning,"},{"from":4675.95,"to":4678.5,"location":2,"content":"you're doing something to stop them from learning."},{"from":4678.5,"to":4680.07,"location":2,"content":"And so, you should just stop that,"},{"from":4680.07,"to":4682.26,"location":2,"content":"and they will learn because they want to learn."},{"from":4682.26,"to":4683.94,"location":2,"content":"They're just like little children."},{"from":4683.94,"to":4689.79,"location":2,"content":"Um, but, if the follow up to that is the grim reality that there are just tons"},{"from":4689.79,"to":4691.91,"location":2,"content":"of things you can do that will cause"},{"from":4691.91,"to":4695.71,"location":2,"content":"your neural networks not to learn very well or at all,"},{"from":4695.71,"to":4697.82,"location":2,"content":"and this is the frustrating part of"},{"from":4697.82,"to":4701.6,"location":2,"content":"this whole field because you know, it's not like a compile error."},{"from":4701.6,"to":4705.34,"location":2,"content":"It can just be hard to find and fix them."},{"from":4705.34,"to":4707.72,"location":2,"content":"And, you know, it is just really"},{"from":4707.72,"to":4712.02,"location":2,"content":"standard that you spend more time dealing with trying to find,"},{"from":4712.02,"to":4715.23,"location":2,"content":"and fix why it doesn't work well and getting it to work well than"},{"from":4715.23,"to":4719.27,"location":2,"content":"you- than the time you spent writing the code for your model."},{"from":4719.27,"to":4723.73,"location":2,"content":"So, remember to budget for that when you're doing your final project,"},{"from":4723.73,"to":4728.47,"location":2,"content":"it just won't work if you finish the code a day or two before the deadline."},{"from":4728.47,"to":4731.99,"location":2,"content":"Um, so, you need to work out what those things are,"},{"from":4731.99,"to":4734.97,"location":2,"content":"\"That can be hard,\" but you know experience,"},{"from":4734.97,"to":4737.26,"location":2,"content":"experimental care, rules of 
thumb help."},{"from":4737.26,"to":4739.75,"location":2,"content":"So, there are just lots of things that are important."},{"from":4739.75,"to":4742.48,"location":2,"content":"So, you know, your learning rates are important."},{"from":4742.48,"to":4745.77,"location":2,"content":"If your learning rates are way too high, things won't learn."},{"from":4745.77,"to":4747.96,"location":2,"content":"If your learning rates are way too low,"},{"from":4747.96,"to":4750.65,"location":2,"content":"they will learn very slowly and badly."},{"from":4750.65,"to":4753.27,"location":2,"content":"Um, initialization makes a difference."},{"from":4753.27,"to":4759.04,"location":2,"content":"Having good initialization often determines how well neural networks, um, learn."},{"from":4759.04,"to":4763.44,"location":2,"content":"Um, I have a separate slide here that I probably haven't got time to go"},{"from":4763.44,"to":4768.23,"location":2,"content":"through all of on sort of for sequence [NOISE] models,"},{"from":4768.23,"to":4771.95,"location":2,"content":"some of the tips of what people normally think are"},{"from":4771.95,"to":4775.73,"location":2,"content":"good ways to get those models, um, working."},{"from":4775.73,"to":4778.41,"location":2,"content":"But I'll just say this one last thing."},{"from":4778.41,"to":4781.85,"location":2,"content":"Um, I think the strategy that you really want to"},{"from":4781.85,"to":4785.92,"location":2,"content":"take is to work incrementally and build up slowly."},{"from":4785.92,"to":4787.49,"location":2,"content":"It just doesn't work to think,"},{"from":4787.49,"to":4789.53,"location":2,"content":"\"Oh I've got the mother of all models,"},{"from":4789.53,"to":4791.66,"location":2,"content":"and build this enormously complex thing,"},{"from":4791.66,"to":4793,"location":2,"content":"and then run it on the data,"},{"from":4793,"to":4794.78,"location":2,"content":"and it crashes and burns.\""},{"from":4794.78,"to":4797.45,"location":2,"content":"You have no idea what to do at that point,"},{"from":4797.45,"to":4800.65,"location":2,"content":"that the only good way is to sort of build up slowly."},{"from":4800.65,"to":4802.94,"location":2,"content":"So [NOISE] start with a very simple model,"},{"from":4802.94,"to":4804.3,"location":2,"content":"get it to work,"},{"from":4804.3,"to":4805.82,"location":2,"content":"add your bells and whistles,"},{"from":4805.82,"to":4807.49,"location":2,"content":"extra layers and so on."},{"from":4807.49,"to":4809.59,"location":2,"content":"Get them to work or abandon them."},{"from":4809.59,"to":4814.23,"location":2,"content":"And so, try and proceed from one working model to another as much as possible."},{"from":4814.23,"to":4818.98,"location":2,"content":"One of- another way that you can start small and build up is with data."},{"from":4818.98,"to":4822.58,"location":2,"content":"The easiest way to see bugs and problems in your model,"},{"from":4822.58,"to":4825.61,"location":2,"content":"is with the minutest possible amount of data."},{"from":4825.61,"to":4829.03,"location":2,"content":"So, start with a dataset of eight items."},{"from":4829.03,"to":4832.66,"location":2,"content":"Sometimes it's even best if those eight items are ones that are"},{"from":4832.66,"to":4834.93,"location":2,"content":"artificial data that you designed yourself"},{"from":4834.93,"to":4837.56,"location":2,"content":"because then you can often more easily see problems,"},{"from":4837.56,"to":4838.81,"location":2,"content":"and what's going 
wrong."},{"from":4838.81,"to":4840.56,"location":2,"content":"So, you should train on that,"},{"from":4840.56,"to":4842.42,"location":2,"content":"um, because it's only eight items,"},{"from":4842.42,"to":4844.12,"location":2,"content":"training will only take seconds,"},{"from":4844.12,"to":4847.2,"location":2,"content":"and that's really, really useful for being able to iterate quickly."},{"from":4847.2,"to":4849.56,"location":2,"content":"And you know, if you can't have your model get"},{"from":4849.56,"to":4855.06,"location":2,"content":"100 percent accuracy on training and testing on those eight examples,"},{"from":4855.06,"to":4859.73,"location":2,"content":"well, you know, either the model is woefully under powered or the model is broken,"},{"from":4859.73,"to":4862.9,"location":2,"content":"and you've got clear things to do right there."},{"from":4862.9,"to":4866.4,"location":2,"content":"Um, when you go to a bigger model, um,"},{"from":4866.4,"to":4870.11,"location":2,"content":"the standard practice with modern neural networks is,"},{"from":4870.11,"to":4872.33,"location":2,"content":"you want to train your models."},{"from":4872.33,"to":4876.24,"location":2,"content":"You want models that can overfit massively on the training set."},{"from":4876.24,"to":4879.56,"location":2,"content":"So, in general, your models should still be getting"},{"from":4879.56,"to":4883.38,"location":2,"content":"close to 100 percent accuracy on the training set after you've"},{"from":4883.38,"to":4887.16,"location":2,"content":"trained it for a long time because powerful neural network models are"},{"from":4887.16,"to":4891.09,"location":2,"content":"just really good at over-fitting to, and memorizing data."},{"from":4891.09,"to":4893.45,"location":2,"content":"Um, if that's not the case well, you know,"},{"from":4893.45,"to":4894.81,"location":2,"content":"maybe you want a bigger model."},{"from":4894.81,"to":4898.16,"location":2,"content":"Maybe you want to have higher hidden dimensions or"},{"from":4898.16,"to":4901.91,"location":2,"content":"add an extra layer to your neural network or something like that."},{"from":4901.91,"to":4904.93,"location":2,"content":"You shouldn't be scared of overfitting on the training data."},{"from":4904.93,"to":4907.18,"location":2,"content":"But once you've proved you can do that,"},{"from":4907.18,"to":4910.56,"location":2,"content":"you then do want a model that also generalizes well."},{"from":4910.56,"to":4915.41,"location":2,"content":"And so, normally the way that you're addressing that is then by regularizing the model,"},{"from":4915.41,"to":4917.85,"location":2,"content":"and there are different ways to regularize your model,"},{"from":4917.85,"to":4921.3,"location":2,"content":"but we talked about in the assignment, doing dropout."},{"from":4921.3,"to":4923.76,"location":2,"content":"I mean, using generous dropout is"},{"from":4923.76,"to":4927.98,"location":2,"content":"one very common and effective strategy for regularizing your models."},{"from":4927.98,"to":4931.73,"location":2,"content":"And so, then you've, what you want to be doing is regularizing"},{"from":4931.73,"to":4936.27,"location":2,"content":"your model enough that the curve no longer looks like this,"},{"from":4936.27,"to":4941.1,"location":2,"content":"but instead that your validation performance kind of levels out,"},{"from":4941.1,"to":4943.51,"location":2,"content":"but doesn't start ramping back up again,"},{"from":4943.51,"to":4946.82,"location":2,"content":"and that's then a sort 
of a sign of a well-regularized model."},{"from":4946.82,"to":4949.21,"location":2,"content":"Okay. I will stop there,"},{"from":4949.21,"to":4953.31,"location":2,"content":"and then we'll come back to the question-answering project on Thursday."}]} \ No newline at end of file