00001 import java.io.File;
00002 import java.io.FileNotFoundException;
00003 import java.io.FileWriter;
00004 import java.io.IOException;
00005 import java.io.PrintStream;
00006 import java.util.ArrayList;
00007 import java.util.HashMap;
00008 import java.util.Vector;
00009 import java.util.regex.Matcher;
00010 import java.util.regex.Pattern;
00011 import java.io.FileReader;
00012 import java.io.BufferedReader;
00013
00014 import edu.ksu.cis.bnj.ver3.core.BeliefNode;
00015 import edu.tum.cs.srldb.Database;
00016 import edu.tum.cs.srldb.Link;
00017 import edu.tum.cs.srldb.datadict.DDAttribute;
00018 import edu.tum.cs.srldb.datadict.DDRelation;
00019 import edu.tum.cs.srldb.datadict.DataDictionary.BLNStructure;
00020
00021 public class MSNBCtoBlodDB {
00022
00023
00024 public static void readData(String datadir) throws FileNotFoundException, Exception {
00025
00026 FileWriter fw_blogdb = null;
00027 FileWriter fw_lastCSV = null;
00028
00029 Database db = new Database();
00030
00031 edu.tum.cs.srldb.Object currentPage=null;
00032 Vector<Link> uncommitedLinks = new Vector<Link>();
00033
00034 HashMap<String, String> id2cat = new HashMap<String, String>();
00035 id2cat.put("1","Frontpage");
00036 id2cat.put("2","News");
00037 id2cat.put("3","Tech");
00038 id2cat.put("4","Local");
00039 id2cat.put("5","Opinion");
00040 id2cat.put("6","On-air");
00041 id2cat.put("7","Misc");
00042 id2cat.put("8","Weather");
00043 id2cat.put("9","Msn-news");
00044 id2cat.put("10","Health");
00045 id2cat.put("11","Living");
00046 id2cat.put("12","Business");
00047 id2cat.put("13","Msn-sports");
00048 id2cat.put("14","Sports");
00049 id2cat.put("15","Summary");
00050 id2cat.put("16","Bbs");
00051 id2cat.put("17","Travel");
00052
00053
00054 try
00055 {
00056
00057 File inputFile = new File("data/"+datadir+"/msnbc990928.seq");
00058 BufferedReader input = new BufferedReader(new FileReader(inputFile));
00059 ArrayList<edu.tum.cs.srldb.Object> prevSegmInEpsiode = new ArrayList<edu.tum.cs.srldb.Object>();
00060
00061 String line;
00062 int episode = 0;
00063 int pageCnt=0;
00064
00065
00066 int numSamplesDrawn=0;
00067
00068 int numTrain=1000;
00069 int numTest=100;
00070 String mode="train";
00071
00072
00073 fw_lastCSV = new FileWriter("data/"+datadir+"/"+mode+"/lastClass.csv" );
00074
00075 while (( line = input.readLine()) != null) {
00076
00077 Matcher matcher = Pattern.compile("^[[0-9]* ]+$").matcher(line.trim());
00078 if(!matcher.find()) {continue;}
00079
00080 String[] pages = line.split(" ");
00081 if(pages.length<=4) continue;
00082 if(pages.length>=20) continue;
00083
00084
00085 if(mode.equals("train") && numSamplesDrawn == numTrain) {
00086 mode="test"; numSamplesDrawn=0;
00087 } else if(mode.equals("test") && numSamplesDrawn == numTest) {
00088 return;
00089 }
00090 numSamplesDrawn++;
00091
00092
00093 prevSegmInEpsiode.clear();
00094
00095 pageCnt=0;
00096
00097 fw_blogdb = new FileWriter("data/"+datadir+"/"+mode+"/data" + episode + ".blogdb" );
00098
00099 String prevLabel=null;
00100 for(String page:pages) {
00101
00102
00103 String label = id2cat.get(page);
00104 String pageID="P_" + episode+"_"+ pageCnt++ +"_"+label;
00105
00106 currentPage = new edu.tum.cs.srldb.Object(db, "page", pageID);
00107 currentPage.addAttribute("pageT", label);
00108
00109
00110
00111 if(pageCnt==pages.length-1) {
00112 fw_lastCSV.write(label+",");
00113 prevLabel=label;
00114 } else if(pageCnt==pages.length) {
00115 fw_lastCSV.write(label+",");
00116 if(prevLabel!=null) {
00117 fw_lastCSV.write(((label.equals(prevLabel))?"1":"0")+"\n");
00118 prevLabel=null;
00119 }
00120 }
00121
00122
00123 if(pageCnt==1) {
00124 currentPage.addAttribute("firstPage", "True");
00125 fw_blogdb.write("firstPage("+pageID+")=True;\n");
00126 }
00127 else if(pageCnt==pages.length) {
00128 currentPage.addAttribute("lastPage", "True");
00129 fw_blogdb.write("lastPage("+pageID+")=True;\n");
00130 }
00131
00132
00133 if(mode.equals("train") || pageCnt<pages.length)
00134 fw_blogdb.write("pageT("+pageID+")="+label+";\n");
00135
00136
00137 for(edu.tum.cs.srldb.Object prevPage : prevSegmInEpsiode) {
00138 fw_blogdb.write("precedes("+prevPage.getConstantName() + ","+pageID+")=True;\n");
00139 uncommitedLinks.add(new Link(db, "precedes", prevPage, currentPage));
00140 }
00141
00142
00143
00144 prevSegmInEpsiode.add(currentPage);
00145
00146
00147 if(currentPage!=null) currentPage.commit();
00148
00149 }
00150
00151
00152 for(Link o : uncommitedLinks)
00153 o.commit();
00154 uncommitedLinks.clear();
00155
00157
00158
00159 edu.tum.cs.srldb.datadict.DataDictionary dd = db.getDataDictionary();
00160 db.check();
00161 BLNStructure bs = dd.createBasicBLNStructure();
00162
00163 DDAttribute frametT_rel = dd.getAttribute("pageT");
00164 DDRelation prec_rel = dd.getRelation("precedes");
00165
00166 DDRelation first_rel = dd.getRelation("firstPage");
00167 DDRelation last_rel = dd.getRelation("lastPage");
00168
00169 BeliefNode frameF1 = bs.getNode(frametT_rel);
00170 frameF1.setName("pageT(p1)");
00171 BeliefNode frameF2 = bs.bn.addNode("#pageT(p2)");
00172
00173 BeliefNode _f1f2 = bs.bn.addDecisionNode("!(p1=p2)");
00174 BeliefNode precedes = bs.getNode(prec_rel);
00175 precedes.setName("precedes(p1, p2)");
00176
00177 if(first_rel!=null) {
00178 BeliefNode firstF1 = bs.getNode(first_rel);
00179 firstF1.setName("firstPage(p1)");
00180 BeliefNode firstF2 = bs.bn.addNode("#firstPage(p2)");
00181 bs.bn.bn.connect(firstF1, precedes);
00182 bs.bn.bn.connect(firstF2, precedes);
00183 }
00184 if(last_rel!=null) {
00185 BeliefNode lastF1 = bs.getNode(last_rel);
00186 lastF1.setName("lastPage(p1)");
00187 BeliefNode lastF2 = bs.bn.addNode("#lastPage(p2)");
00188 bs.bn.bn.connect(lastF1, precedes);
00189 bs.bn.bn.connect(lastF2, precedes);
00190 }
00191
00192 BeliefNode f1f2 = bs.bn.addDecisionNode("p1=p2");
00193 BeliefNode precedes2 = bs.bn.addNode("precedes(p1, p2)");
00194 bs.bn.bn.connect(f1f2, precedes2);
00195
00196 bs.bn.bn.connect(_f1f2, precedes);
00197 bs.bn.bn.connect(frameF1, precedes);
00198 bs.bn.bn.connect(frameF2, precedes);
00199
00200
00201 bs.bn.savePMML("data/"+datadir+"/"+mode+ "/data.pmml");
00202
00203 bs.bn.save("data/"+datadir+"/"+mode+ "/data.net");
00204
00205 PrintStream bln = new PrintStream(new File("data/"+datadir+"/"+mode+ "/data.abl"));
00206 dd.writeBasicBLOGModel(bln);
00207 bln.close();
00208
00209
00210 fw_blogdb.close();
00211 episode++;
00212
00213
00214 }
00215 fw_lastCSV.close();
00216
00217 }
00218 catch ( IOException e ) {
00219 System.err.println( "Could not create CSV files" );
00220 }
00221 catch ( Exception e ) {
00222 System.err.println( e.toString() );
00223 }
00224 }
00225
00226 public static void main(String[] args) throws FileNotFoundException, Exception {
00227 readData("msnbc");
00228 }
00229 }