Statistical Machine Learning
Lecture 24
Yuan (Alan) Qi

Outline
• Review of the sum-product algorithm
• The max-sum algorithm, the Viterbi algorithm
• K-means clustering
• K-medoids, mixture of Gaussians, vector quantization, expectation maximization (EM), alternative view of EM

Sum-Product: Examples (1)-(4)

The Junction Tree Algorithm
• Exact inference on general graphs.
• Works by turning the initial graph into a junction tree and then running a sum-product-like algorithm.
• Intractable on graphs with large cliques.

Loopy Belief Propagation
• Sum-product on general graphs.
• Initial unit messages are passed across all links, after which messages are passed around until convergence (which is not guaranteed!).
• Approximate but tractable for large graphs.
• Sometimes works well, sometimes not at all.

The Max-Sum Algorithm (1)
Objective: an efficient algorithm for finding
i. the value x_max that maximizes p(x);
ii. the value of p(x_max).
In general, the maximum of the marginals ≠ the joint maximum.

The Max-Sum Algorithm (2)
Maximizing over a chain (max-product).

The Max-Sum Algorithm (3)
Generalizes to tree-structured factor graphs: push the maximizations as close to the leaf nodes as possible.

The Max-Sum Algorithm (4)
Max-product → max-sum. For numerical reasons, work with ln p(x) instead of p(x); again, use the distributive law, max(a + b, a + c) = a + max(b, c).

The Max-Sum Algorithm (5)
Initialization (leaf nodes): μ_{x→f}(x) = 0 and μ_{f→x}(x) = ln f(x).
Recursion:
μ_{f→x}(x) = max_{x_1, ..., x_M} [ ln f(x, x_1, ..., x_M) + Σ_{m ∈ ne(f)\x} μ_{x_m→f}(x_m) ]
μ_{x→f}(x) = Σ_{l ∈ ne(x)\f} μ_{f_l→x}(x)

The Max-Sum Algorithm (6)
Termination: at the root node,
p_max = max_x [ Σ_{s ∈ ne(x)} μ_{f_s→x}(x) ],  x_max = argmax_x [ Σ_{s ∈ ne(x)} μ_{f_s→x}(x) ].

The Max-Sum Algorithm (7)
Backtracking
The previous updates give us the maximum of the model probability. But how do we find the global configuration that achieves that maximal probability? We need to store, at each maximization, a quantity that tells us how to trace back to the value that maximizes the corresponding sub-problem. This is known as backtracking.

Related Algorithms
Max-sum with backtracking is the same idea as dynamic programming. For hidden Markov models it is known as the Viterbi algorithm.
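To make the max-sum recursion and backtracking concrete, here is a minimal NumPy sketch of Viterbi decoding for a hidden Markov model. It is an illustrative sketch, not the lecture's code: the function name `viterbi`, the argument layout, and the log-space parameterization are my own assumptions.

```python
import numpy as np

def viterbi(log_pi, log_A, log_B, obs):
    """Max-sum (Viterbi) decoding for an HMM, entirely in log space.

    log_pi: (K,)   log initial-state probabilities
    log_A:  (K, K) log transitions, log_A[i, j] = ln p(z_t = j | z_{t-1} = i)
    log_B:  (K, M) log emission probabilities for M discrete symbols
    obs:    sequence of observed symbol indices
    """
    T, K = len(obs), len(log_pi)
    omega = np.empty((T, K))            # max-sum messages (initialization + recursion)
    back = np.zeros((T, K), dtype=int)  # stored maximizers for backtracking

    omega[0] = log_pi + log_B[:, obs[0]]              # initialization at the leaf
    for t in range(1, T):
        scores = omega[t - 1][:, None] + log_A        # (prev state, current state)
        back[t] = scores.argmax(axis=0)               # best predecessor of each state
        omega[t] = scores.max(axis=0) + log_B[:, obs[t]]

    # Termination: max over the final messages, then trace the pointers back.
    path = np.zeros(T, dtype=int)
    path[-1] = int(omega[-1].argmax())
    for t in range(T - 2, -1, -1):
        path[t] = back[t + 1, path[t + 1]]
    return path, float(omega[-1].max())               # x_max and ln p(x_max)
```

Working in log space is exactly the max-product → max-sum move on slide (4): products of small probabilities become sums of logs, which avoids numerical underflow.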
Unsupervised Learning
Supervised learning: learning with examples or labels, e.g., classification and regression.
Unsupervised learning: learning without examples or labels, e.g., clustering, mixture models, PCA, non-negative matrix factorization.

K-means Clustering: Goal
Partition N data points into K clusters.

Cost Function
J = Σ_n Σ_k r_nk ‖x_n − μ_k‖², where r_nk ∈ {0, 1} indicates whether point x_n is assigned to cluster k, and μ_k is the center of cluster k.

Two-Stage Updates
Alternate between optimizing the assignments r_nk with the centers fixed and optimizing the centers μ_k with the assignments fixed (a code sketch of both stages follows the convergence slide below).

Optimizing Cluster Assignment
With the centers fixed, J is minimized by assigning each point to its nearest center: r_nk = 1 if k = argmin_j ‖x_n − μ_j‖², and 0 otherwise.

Optimizing Cluster Centers
With the assignments fixed, setting the derivative of J with respect to μ_k to zero gives the mean of the points assigned to cluster k: μ_k = Σ_n r_nk x_n / Σ_n r_nk.

Convergence of Iterative Updates
Each stage can only decrease J, and there are finitely many assignments, so the updates converge (to a local minimum of J).
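Here is a minimal NumPy sketch of the two-stage updates under the cost function J above. It is illustrative only: the initialization scheme (centers drawn from the data) and the convergence test are my own choices, not from the slides.

```python
import numpy as np

def kmeans(X, K, n_iters=100, seed=0):
    """Two-stage K-means minimizing J = sum_n sum_k r_nk * ||x_n - mu_k||^2."""
    rng = np.random.default_rng(seed)
    mu = X[rng.choice(len(X), size=K, replace=False)]  # init centers at random data points
    for _ in range(n_iters):
        # Stage 1: optimize assignments r_nk with centers fixed
        # (each point goes to its nearest center).
        d2 = ((X[:, None, :] - mu[None, :, :]) ** 2).sum(axis=2)  # (N, K) squared distances
        z = d2.argmin(axis=1)
        # Stage 2: optimize centers mu_k with assignments fixed
        # (each center moves to the mean of its assigned points).
        new_mu = np.array([X[z == k].mean(axis=0) if np.any(z == k) else mu[k]
                           for k in range(K)])
        if np.allclose(new_mu, mu):  # J can no longer decrease; stop
            break
        mu = new_mu
    return z, mu
```

Replacing the squared Euclidean distance with a general dissimilarity, and restricting each center to be one of the data points, gives the K-medoids algorithm mentioned below.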
Example of K-Means Clustering
Stochastic Online Clustering
K-medoids Algorithm

K-Means and Vector Quantization
Code-book vectors for lossy data compression.

Limitations of K-means

Mixture of Gaussians
Marginal distribution: p(x) = Σ_{k=1}^{K} π_k N(x | μ_k, Σ_k).
Introduce latent variables: a one-of-K binary vector z with p(z_k = 1) = π_k, so that p(x) = Σ_z p(z) p(x | z) recovers the marginal distribution above.

Conditional Probability
The responsibility that component k takes for explaining the observation:
γ(z_k) ≡ p(z_k = 1 | x) = π_k N(x | μ_k, Σ_k) / Σ_j π_j N(x | μ_j, Σ_j).

Maximum Likelihood
Maximize the log likelihood function ln p(X | π, μ, Σ) = Σ_n ln Σ_k π_k N(x_n | μ_k, Σ_k). Is there an easy closed-form solution? Discussion: the sum over components sits inside the logarithm, so there is none.

Severe Overfitting by Maximum Likelihood
When a cluster has only one data point, its variance goes to 0 and the likelihood diverges.

Identifiability
For any given maximum likelihood solution, a K-component mixture will have a total of K! equivalent solutions, corresponding to the K! ways of assigning K sets of parameters to K components. Why? If we permute the labels of the components, we obtain an equivalent model. This is a difficulty for parameter interpretation, but irrelevant for density estimation.

Maximum Likelihood Conditions (1)
Setting the derivative of ln p(X | π, μ, Σ) with respect to μ_k to zero:
μ_k = (1/N_k) Σ_n γ(z_nk) x_n, where N_k = Σ_n γ(z_nk).

Maximum Likelihood Conditions (2)
Setting the derivative with respect to Σ_k to zero:
Σ_k = (1/N_k) Σ_n γ(z_nk) (x_n − μ_k)(x_n − μ_k)ᵀ.

Maximum Likelihood Conditions (3)
Lagrange function: ln p(X | π, μ, Σ) + λ (Σ_k π_k − 1).
Setting its derivative to zero and using the normalization constraint, we obtain π_k = N_k / N.

Question: can we use these conditions to construct a useful estimation algorithm?

Expectation Maximization for Mixture of Gaussians
The conditions above are not closed-form solutions, because the responsibilities γ(z_nk) depend on the parameters and vice versa, but we can use them to construct iterative updates:
E step: compute the responsibilities γ(z_nk).
M step: compute new means μ_k, covariances Σ_k, and mixing coefficients π_k.
Loop over the E and M steps until the log likelihood ln p(X | π, μ, Σ) stops increasing.

Example
EM on the Old Faithful data set.

General EM Algorithm

EM as a Lower Bounding Method
Goal: maximize ln p(X | θ).
Define L(q, θ) = Σ_Z q(Z) ln [ p(X, Z | θ) / q(Z) ] and KL(q ‖ p) = −Σ_Z q(Z) ln [ p(Z | X, θ) / q(Z) ].
We have ln p(X | θ) = L(q, θ) + KL(q ‖ p).

Lower Bound
L(q, θ) is a functional of the distribution q(Z). Since KL(q ‖ p) ≥ 0 and ln p(X | θ) = L(q, θ) + KL(q ‖ p), L(q, θ) is a lower bound on the log likelihood function ln p(X | θ).

Illustration of Lower Bound

Lower Bound Perspective of EM
• Expectation step: maximize the functional lower bound L(q, θ) over the distribution q(Z).
• Maximization step: maximize the lower bound over the parameters θ.

Illustration of the E Step
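Combining the E and M steps above, here is a minimal NumPy/SciPy sketch of EM for a mixture of Gaussians. It is an illustrative sketch, not the lecture's code: the function name, the initialization, and the small ridge term added to each covariance (a guard against the single-point singularity discussed under "Severe Overfitting") are my own assumptions.

```python
import numpy as np
from scipy.stats import multivariate_normal

def em_gmm(X, K, n_iters=100, tol=1e-8, seed=0):
    """EM for a mixture of Gaussians; the log likelihood never decreases."""
    rng = np.random.default_rng(seed)
    N, D = X.shape
    pi = np.full(K, 1.0 / K)                      # mixing coefficients pi_k
    mu = X[rng.choice(N, size=K, replace=False)]  # means initialized at data points
    Sigma = np.stack([np.cov(X.T).reshape(D, D) + 1e-6 * np.eye(D) for _ in range(K)])
    log_lik_old = -np.inf
    for _ in range(n_iters):
        # Weighted component densities pi_k * N(x_n | mu_k, Sigma_k), shape (N, K).
        dens = np.stack([pi[k] * multivariate_normal.pdf(X, mu[k], Sigma[k])
                         for k in range(K)], axis=1)
        # Stop when ln p(X | pi, mu, Sigma) stops increasing.
        log_lik = np.log(dens.sum(axis=1)).sum()
        if log_lik - log_lik_old < tol:
            break
        log_lik_old = log_lik
        # E step: responsibilities gamma(z_nk).
        gamma = dens / dens.sum(axis=1, keepdims=True)
        # M step: re-estimate the parameters from the maximum likelihood conditions.
        Nk = gamma.sum(axis=0)
        mu = (gamma.T @ X) / Nk[:, None]
        for k in range(K):
            diff = X - mu[k]
            Sigma[k] = (gamma[:, k, None] * diff).T @ diff / Nk[k]
            Sigma[k] += 1e-6 * np.eye(D)  # ridge against collapsing clusters
        pi = Nk / N
    return pi, mu, Sigma
```

Each pass first maximizes the lower bound L(q, θ) over q (the E step) and then over the parameters (the M step), so the tracked log likelihood is non-decreasing; this is the lower-bound perspective of EM described above.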