%% Text Mining Project
% Worked answers for the exercise sheet, laid out as a runnable MATLAB script.
%
% Helpers called below but NOT defined in this file (they must be on the
% MATLAB path): tfidf, PlotClusters, nmfrule, unique_no_sort.
%
% The two functions written for this exercise (kfreq, show_clusters) are
% placed at the end of the file, because local functions in a script must
% follow all script statements (supported from MATLAB R2016b). On older
% releases, save each one in its own file (kfreq.m, show_clusters.m).

%% 1. (a) Data source
% http://www.cad.zju.edu.cn/home/dengcai/Data/TextData.html
% (or whichever address you found)

%% 1. (b) Load the data set
reut = load('Reuters.mat');
reuters_data = tfidf(reut.fea);   % feature matrix passed through the external tfidf helper
reuters_labels = reut.gnd;        % labels as stored in the .mat file

%% 1. (c) Keep four groups of 100 consecutive rows each
% NOTE(review): the start rows 1, 3714, 5769 and 6090 are presumably the first
% rows of four different classes in reut.gnd -- confirm against reuters_labels.
reuters4 = [reuters_data(1:100,:); ...
            reuters_data(3714:3813,:); ...
            reuters_data(5769:5868,:); ...
            reuters_data(6090:6189,:)];
labels4 = [ones(100,1); 2*ones(100,1); 3*ones(100,1); 4*ones(100,1)];

%% 1. (d) Train / test split: first 75 rows of each group train, last 25 test
% Fix: the fourth group is now sliced from reuters4 like the other three.
% The original sliced reuters_data(301:375,:) / reuters_data(376:400,:),
% i.e. rows of the full data set rather than the fourth selected group,
% while still labelling them as class 4.
reut_train = [reuters4(1:75,:); ...
              reuters4(101:175,:); ...
              reuters4(201:275,:); ...
              reuters4(301:375,:)];
reut_test = [reuters4(76:100,:); ...
             reuters4(176:200,:); ...
             reuters4(276:300,:); ...
             reuters4(376:400,:)];
labels_test = [ones(25,1); 2*ones(25,1); 3*ones(25,1); 4*ones(25,1)];
labels_train = [ones(75,1); 2*ones(75,1); 3*ones(75,1); 4*ones(75,1)];

%% 1. (e) Reduce the training data to 3 columns (kfreq is defined below)
reut_train_r = kfreq(reut_train, 3);

%% 1. (f) Plot the reduced training data with its labels
PlotClusters(reut_train_r, labels_train);

%% 2. (a) Factorise the transposed training matrix with 4 components
% show_clusters (defined below) is the function written for this question.
[W, H] = nmfrule(reut_train', 4);

%% 2. (b) Coefficients of the test data with respect to W
H_test = pinv(W) * reut_test';

%% Functions written for the exercise

function Data_r = kfreq(Data, k)
%KFREQ Keep the k columns of Data that have the largest column sums.
%   Data_r = KFREQ(Data, k) returns Data restricted to the k columns whose
%   sums over the rows are largest. The selected columns keep their
%   original left-to-right order.
    [~, I] = sort(sum(Data, 1), 'descend');   % column indices, largest sum first
    Data_r = Data(:, sort(I(1:k)));           % re-sort indices to keep column order
end

function [U, K] = show_clusters(X)
%SHOW_CLUSTERS Mark the maximum of each row of X and derive a label per row.
%   [U, K] = SHOW_CLUSTERS(X)
%     K - same size as X; 1 where an entry equals its row maximum, 0 elsewhere
%         (every tied entry is marked).
%     U - column vector with one entry per row of X. It first receives the
%         index of the LAST column holding the row maximum, then is
%         renumbered by position in the list returned by unique_no_sort.
    [R, C] = size(X);
    K = X;
    U = zeros(R, 1);
    for i = 1:R
        rowMax = max(X(i,:));   % computed once per row instead of twice per element
        for j = 1:C
            if X(i,j) == rowMax
                K(i,j) = 1;
                U(i) = j;       % later ties overwrite earlier ones
            else
                K(i,j) = 0;
            end
        end
    end

    % NOTE(review): unique_no_sort is an external helper; presumably it returns
    % the distinct values of U as a row vector in order of first appearance --
    % confirm. The loop below reads it as dif_values(1,j), i.e. as a row vector.
    dif_values = unique_no_sort(U);
    nValues = size(dif_values, 2);
    for i = 1:size(U, 1)
        for j = 1:nValues
            if U(i,1) == dif_values(1,j)
                U(i,1) = j;     % replace the column index by its rank in dif_values
                break;
            end
        end
    end
end