00001 package edu.tum.cs.srldb;
00002
00003 import java.io.FileInputStream;
00004 import java.io.FileOutputStream;
00005 import java.io.IOException;
00006 import java.io.ObjectInputStream;
00007 import java.io.ObjectOutputStream;
00008 import java.io.PrintStream;
00009 import java.io.Serializable;
00010 import java.util.Collection;
00011 import java.util.HashMap;
00012 import java.util.HashSet;
00013 import java.util.Iterator;
00014 import java.util.Vector;
00015
00016 import edu.tum.cs.clustering.BasicClusterer;
00017 import edu.tum.cs.clustering.ClusterNamer;
00018 import edu.tum.cs.clustering.EMClusterer;
00019 import edu.tum.cs.clustering.SimpleClusterer;
00020 import edu.tum.cs.srldb.datadict.AutomaticDataDictionary;
00021 import edu.tum.cs.srldb.datadict.DDAttribute;
00022 import edu.tum.cs.srldb.datadict.DDException;
00023 import edu.tum.cs.srldb.datadict.DataDictionary;
00024 import edu.tum.cs.srldb.datadict.domain.AutomaticDomain;
00025 import edu.tum.cs.srldb.datadict.domain.Domain;
00026 import edu.tum.cs.srldb.datadict.domain.OrderedStringDomain;
00027
00028 public class Database implements Cloneable, Serializable {
00029
00033 private static final long serialVersionUID = 1L;
00034 protected HashSet<Link> links;
00035 protected HashSet<Object> objects;
00036 protected DataDictionary datadict;
00037
/**
 * Creates an empty database (no objects, no links) that uses the given data dictionary.
 *
 * @param dd the data dictionary to associate with this database; stored by reference
 */
public Database(DataDictionary dd) {
    this.datadict = dd;
    this.objects = new HashSet<Object>();
    this.links = new HashSet<Link>();
}
00047
/**
 * Creates an empty database that uses a newly constructed
 * {@link AutomaticDataDictionary} as its data dictionary.
 */
public Database() {
    this(new AutomaticDataDictionary());
}
00054
00064 public static AttributeClustering clusterAttribute(DDAttribute attribute, Collection<Object> objects, BasicClusterer<? extends weka.clusterers.Clusterer> clusterer, ClusterNamer<weka.clusterers.Clusterer> clusterNamer) throws DDException, Exception {
00065 String attrName = attribute.getName();
00066
00067 int instances = 0;
00068 for(Object obj : objects) {
00069 String value = obj.getAttributeValue(attrName);
00070 if(value == null)
00071 continue;
00072 clusterer.addInstance(Double.parseDouble(value));
00073 instances++;
00074 }
00075
00076 clusterer.buildClusterer();
00077
00078 String[] clusterNames = clusterNamer.getNames(clusterer.getWekaClusterer());
00079
00080 if(instances < clusterNames.length) {
00081 System.err.println("Warning: attribute " + attrName + " was discarded because there are too few instances for clustering");
00082 attribute.discard();
00083 return null;
00084 }
00085 if(instances == 0)
00086 throw new Exception("The domain is empty; No instances could be clustered for attribute " + attrName);
00087
00088 AttributeClustering ac = new AttributeClustering();
00089 ac.clusterer = clusterer;
00090 ac.newDomain = new OrderedStringDomain(attribute.getDomain().getName(), clusterNames);
00091 applyClustering(attribute, objects, ac);
00092 return ac;
00093 }
00094
00095 public static void applyClustering(DDAttribute attribute, Collection<Object> objects, AttributeClustering ac) throws NumberFormatException, Exception {
00096
00097 String attrName = attribute.getName();
00098 for(Object obj : objects) {
00099 String value = obj.attribs.get(attrName);
00100 if(value != null) {
00101 int i = ac.clusterer.classify(Double.parseDouble(value));
00102 String svalue = ac.newDomain.getValues()[i];
00103 obj.attribs.put(attrName, svalue);
00104
00105
00106
00107
00108 }
00109 }
00110
00111 attribute.setDomain(ac.newDomain);
00112 }
00113
00114 public void writeMLNDatabase(PrintStream out) throws Exception {
00115 out.println("// *** mln database ***\n");
00116
00117 out.println("// links");
00118 for(Link link : links)
00119 link.MLNprintFacts(out);
00120
00121 Counters cnt = new Counters();
00122 for(Object obj : objects) {
00123 out.println("// " + obj.objType() + " #" + cnt.inc(obj.objType()));
00124 obj.MLNprintFacts(out);
00125 }
00126 }
00127
00128 public void writeBLOGDatabase(PrintStream out) throws DDException {
00129 for(Object obj : objects) {
00130 obj.BLOGprintFacts(out);
00131 }
00132 for(Link link : links) {
00133 link.BLOGprintFacts(out);
00134 }
00135 }
00136
00141 public void writeBasicMLN(PrintStream out) {
00142 datadict.writeBasicMLN(out);
00143 }
00144
00150 public void writeSRLDB(FileOutputStream s) throws IOException {
00151
00152
00153 for(Object o : this.objects)
00154 o.database = this;
00155 for(Link l : this.links)
00156 l.database = this;
00157
00158 this.datadict.cleanUp();
00159
00160 ObjectOutputStream objstream = new ObjectOutputStream(s);
00161 objstream.writeObject(this);
00162 objstream.close();
00163 }
00164
00172 public static Database fromFile(FileInputStream s) throws IOException, ClassNotFoundException {
00173 ObjectInputStream objstream = new ObjectInputStream(s);
00174 java.lang.Object object = objstream.readObject();
00175 objstream.close();
00176 return (Database)object;
00177 }
00178
00184 public void writeProximityDatabase(java.io.PrintStream out) throws Exception {
00185 out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
00186 out.println("<!DOCTYPE PROX3DB SYSTEM \"prox3db.dtd\">");
00187 out.println("<PROX3DB>");
00188
00189 out.println(" <OBJECTS>");
00190 for(Object obj : objects) {
00191 out.println(" <OBJECT ID=\"" + obj.id + "\"/>");
00192 }
00193 out.println(" </OBJECTS>");
00194
00195 out.println(" <LINKS>");
00196 for(Link link : links) {
00197 if(link.getArguments().length != 2)
00198 System.err.println("Warning: non-binary link/relation found - using first two objects only");
00199 Object o1 = ((Object)link.getArguments()[0]);
00200 Object o2 = ((Object)link.getArguments()[1]);
00201 out.println(" <LINK ID=\"" + link.id + "\" O1-ID=\"" + o1.id + "\" O2-ID=\"" + o2.id + "\"/>");
00202 }
00203 out.println(" </LINKS>");
00204
00205 out.println(" <ATTRIBUTES>");
00206
00207 for(DDAttribute attrib : datadict.getAttributes()) {
00208 if(attrib.isDiscarded())
00209 continue;
00210 String attribName = attrib.getName();
00211 System.out.println(" attribute " + attribName);
00212 out.println(" <ATTRIBUTE NAME=\"" + Database.stdAttribName(attribName) + "\" ITEM-TYPE=\"" + (attrib.getOwner().isObject() ? "O" : "L") + "\" DATA-TYPE=\"" + attrib.getType() + "\">");
00213 Iterator iItem = attrib.getOwner().isObject() ? objects.iterator() : links.iterator();
00214 while(iItem.hasNext()) {
00215 Item item = (Item) iItem.next();
00216 if(item.hasAttribute(attribName)) {
00217 out.println(" <ATTR-VALUE ITEM-ID=\"" + item.id + "\">");
00218 out.println(" <COL-VALUE>" + Database.stdAttribStringValue(item.attribs.get(attribName)) + "</COL-VALUE></ATTR-VALUE>");
00219 }
00220 }
00221 out.println(" </ATTRIBUTE>");
00222 }
00223
00224 out.println(" <ATTRIBUTE NAME=\"objtype\" ITEM-TYPE=\"O\" DATA-TYPE=\"str\">");
00225 for(Object obj : objects) {
00226 out.println(" <ATTR-VALUE ITEM-ID=\"" + obj.id + "\">");
00227 out.println(" <COL-VALUE>" + Database.stdAttribStringValue(obj.objType()) + "</COL-VALUE></ATTR-VALUE>");
00228 }
00229 out.println(" </ATTRIBUTE>");
00230
00231 out.println(" <ATTRIBUTE NAME=\"link_tag\" ITEM-TYPE=\"L\" DATA-TYPE=\"str\">");
00232 for(Link link : links) {
00233 out.println(" <ATTR-VALUE ITEM-ID=\"" + link.id + "\">");
00234 out.println(" <COL-VALUE>" + link.getName() + "</COL-VALUE></ATTR-VALUE>");
00235 }
00236 out.println(" </ATTRIBUTE>");
00237 out.println(" </ATTRIBUTES>");
00238
00239 out.println("</PROX3DB>");
00240 }
00241
00247 public static String lowerCaseString(String s) {
00248 char[] c = s.toCharArray();
00249 c[0] = Character.toLowerCase(c[0]);
00250 return new String(c);
00251 }
00252
00258 public static String upperCaseString(String s) {
00259 char[] c = s.toCharArray();
00260 c[0] = Character.toUpperCase(c[0]);
00261 return new String(c);
00262 }
00263
00264 public static String stdAttribName(String attribName) {
00265 return lowerCaseString(attribName);
00266 }
00267
00268 public static String stdPredicateName(String name) {
00269 return lowerCaseString(name);
00270 }
00271
00272 public static String stdDomainName(String domainName) {
00273 return lowerCaseString(domainName);
00274 }
00275
00276 public static String stdAttribStringValue(String strValue) {
00277
00278 char[] value = strValue.toCharArray();
00279 value[0] = Character.toUpperCase(value[0]);
00280
00281 int len = 1;
00282 for(int i = 1; i < value.length;) {
00283 if(value[i] == ' ') {
00284 value[len++] = Character.toUpperCase(value[++i]);
00285 i++;
00286 }
00287 else
00288 value[len++] = value[i++];
00289 }
00290 return new String(value, 0, len);
00291 }
00292
00297 public void check() throws DDException, Exception {
00298
00299 for(Object obj : objects) {
00300 datadict.checkObject(obj);
00301 }
00302
00303 for(Link link : this.links) {
00304 datadict.checkLink(link);
00305 }
00306
00307 datadict.check();
00308 }
00309
00310 public static class AttributeClustering {
00311 public BasicClusterer<?> clusterer;
00312 public Domain<?> newDomain;
00313 }
00314
00320 public HashMap<DDAttribute, AttributeClustering> doClustering(HashMap<DDAttribute, AttributeClustering> clusterers) throws DDException, Exception {
00321 System.out.println("clustering...");
00322 if(clusterers == null)
00323 clusterers = new HashMap<DDAttribute, AttributeClustering>();
00324 for(DDAttribute attrib : this.datadict.getAttributes()) {
00325 if(attrib.isDiscarded())
00326 continue;
00327 if(attrib.requiresClustering()) {
00328 System.out.println(" " + attrib.getName());
00329 AttributeClustering ac;
00330 ac = clusterers.get(attrib);
00331 if(ac != null) {
00332 applyClustering(attrib, objects, ac);
00333 continue;
00334 }
00335 Domain<?> domain = attrib.getDomain();
00336
00337
00338
00339 if(domain instanceof OrderedStringDomain) {
00340 SimpleClusterer c = new SimpleClusterer();
00341 ((SimpleClusterer)c).setNumClusters(domain.getValues().length);
00342 ac = clusterAttribute(attrib, objects, c, new ClusterNamer.Fixed(((OrderedStringDomain)domain).getValues()));
00343 }
00344
00345
00346
00347 else if(domain instanceof AutomaticDomain) {
00348 BasicClusterer<?> c;
00349 Integer numClusters = attrib.getNumClusters();
00350 if(numClusters == null)
00351 c = new EMClusterer();
00352 else {
00353 c = new SimpleClusterer();
00354 ((SimpleClusterer)c).setNumClusters(numClusters);
00355 }
00356 ac = clusterAttribute(attrib, objects, c, new ClusterNamer.SimplePrefix(attrib.getName()));
00357 }
00358 else
00359 throw new DDException("Don't know how to perform clustering for target domain " + " (" + domain.getClass() + ")");
00360 System.out.println(" " + ac.newDomain);
00361 clusterers.put(attrib, ac);
00362 }
00363 }
00364 return clusterers;
00365 }
00366
00367 public HashMap<DDAttribute, AttributeClustering> doClustering() throws DDException, Exception {
00368 return doClustering(null);
00369 }
00370
00371 public Database clone() {
00372 try {
00373 return (Database)super.clone();
00374 }
00375 catch (CloneNotSupportedException e) { return null; }
00376 }
00377
00378 public Collection<Link> getLinks() {
00379 return links;
00380 }
00381
00387 public Vector<Link> getLinks(Object o) {
00388 Vector<Link> v = new Vector<Link>();
00389 for(Link l : this.links) {
00390 for(int i = 0; i < l.arguments.length; i++)
00391 if(l.arguments[i] == o)
00392 v.add(l);
00393 }
00394 return v;
00395 }
00396
00397 public Collection<Object> getObjects() {
00398 return objects;
00399 }
00400
00401 public void addObject(Object obj) throws DDException {
00402 if(objects.add(obj))
00403 this.datadict.onCommitObject(obj);
00404 }
00405
00406 public void addLink(Link l) throws DDException {
00407 if(links.add(l))
00408 this.datadict.onCommitLink(l);
00409 }
00410
00411 public DataDictionary getDataDictionary() {
00412 return datadict;
00413 }
00414
00415
00416 public void setDataDictionary(DataDictionary dd) {
00417 this.datadict = dd;
00418 }
00419
00420 public static class Counters {
00421 protected HashMap<String, Integer> counters;
00422 public Counters() {
00423 counters = new HashMap<String, Integer>();
00424 }
00425 public Integer inc(String name) {
00426 Integer c = counters.get(name);
00427 if(c == null)
00428 counters.put(name, c=new Integer(1));
00429 else
00430 counters.put(name, c=new Integer(c+1));
00431 return c;
00432 }
00433 public String toString() {
00434 return counters.toString();
00435 }
00436 }
00437
00441 public void clear() {
00442 this.objects.clear();
00443 this.links.clear();
00444 }
00445 }