#!/usr/bin/env python import sys import contextlib verbosity = 0 class Graph(object): def __init__(self): self.nodes = [] self.labels = {} self.edges = set() def addVertex(self, id, label): if id != len(self.nodes): raise IndexError("Nodes must be given in order") self.nodes.append(label) self.labels[label] = id def addEdge(self, v, u): if v < 0 or v >= len(self.nodes) or u < 0 or u >= len(self.nodes): raise ValueError("Illegal vertex id's") self.edges.add((v, u)) def toString(self): out = 't\n' for i in range(len(self.nodes)): out += 'v {} {}\n'.format(i, self.nodes[i]) for e in self.edges: out += 'e {0[0]} {0[1]}\n'.format(e) class TransactionData(object): def __init__(self): self.transactions = [] self.items = set() def addTransaction(self, transaction): t = set(transaction) # Make sure the input is a set self.transactions.append(t) self.items.update(t) def supportSet(self, item): if isinstance(item, set) or isinstance(item, frozenset): return [i for i in range(len(self.transactions)) if item.issubset(self.transactions[i])] else: return [i for i in range(len(self.transactions)) if item in self.transactions[i]] def freq(self, item): return len(self.supportSet(item)) def __len__(self): return len(self.transactions) def __str__(self): return "Transaction data with {} transactions over {} items".format(len(self.transactions), len(self.items)) def read_file(filename): data = [] G = None with open(filename, 'r') as f: for line in f: line = line.strip() parts = line.split() if len(parts) == 0: # Empty line continue if parts[0] == 't': # A new graph G = Graph() data.append(G) elif parts[0] == 'v': # A new vertex G.addVertex(int(parts[1]), parts[2]) elif parts[0] == 'e': # A new edge G.addEdge(int(parts[1]), int(parts[2])) elif parts[0] == '#': # A comment continue else: # An error raise ValueError("Unknown input: " + line) return data def graph_to_transaction(graph): t = set() for e in graph.edges: item = (graph.nodes[e[0]], graph.nodes[e[1]]) t.add(item) return t def graph_data_to_transaction_data(graphData): data = TransactionData() for G in graphData: data.addTransaction(graph_to_transaction(G)) return data def apriori(transData, threshold, feasible=None): import itertools if feasible == None: feasible = lambda x,y: True freqItemSets = {} feasibleFreqItemSets = set() candidates = set([frozenset([item]) for item in transData.items if transData.freq(item) > threshold]) feasibleCandidates = candidates while len(candidates) > 0: verb_print("\t{} candidates\t{} feasible candidates\n".format(len(candidates), len(feasibleCandidates)), 3) # Add candidates to the set of freq itemsets freqItemSets.update(zip(candidates, [transData.freq(c) for c in candidates])) # Add feasible candidates to the correct set feasibleFreqItemSets.update(feasibleCandidates) # Iterator over potential candidates candIter = itertools.combinations(candidates, 2) # New candidates newCandidates = set([frozenset(c[0].union(c[1])) for c in candIter if len(c[0].union(c[1])) == len(c[0]) + 1 and transData.freq(c[0].union(c[1])) > threshold]) # New feasible candidates candIter = itertools.combinations(feasibleCandidates, 2) feasibleCandidates = set([frozenset(c[0].union(c[1])) for c in candIter if feasible(c[0], c[1]) and frozenset(c[0].union(c[1])) in newCandidates]) candidates = newCandidates return (freqItemSets, feasibleFreqItemSets) def extract_labels(itemset): labels = set() for item in itemset: labels.update(item) return labels def is_connected(p, q): p_labels = extract_labels(p) q_labels = extract_labels(q) return not p_labels.isdisjoint(q_labels) def find_maximal(itemsets): maximals = set() for itemset in itemsets: subsets = set() is_subset = False for m in maximals: if itemset.issuperset(m): subsets.add(m) elif itemset.issubset(m): is_subset = True if len(subsets) > 0: maximals.difference_update(subsets) maximals.add(itemset) elif not is_subset: maximals.add(itemset) return maximals def verb_print(message, level=0): if verbosity >= level: sys.stderr.write(message) # My "open" to open either stdout or a file # From https://stackoverflow.com/questions/17602878/how-to-handle-both-with-open-and-sys-stdout-nicely @contextlib.contextmanager def smart_open(filename=None): if filename and filename != '-': fh = open(filename, 'w') else: fh = sys.stdout try: yield fh finally: if fh is not sys.stdout: fh.close() def print_output(ffi, fi, file=None): with smart_open(file) as f: for itemset in ffi: freq = fi[itemset] f.write("{}".format(freq)) for item in itemset: f.write(" {}--{}".format(item[0], item[1])) f.write('\n') def main(): import argparse parser = argparse.ArgumentParser(description="A program for mining frequent subgraphs using frequent itemsets") parser.add_argument("file", help="Input file") parser.add_argument("-o", "--output", help="Output file") parser.add_argument("-O", "--maxfis-output", help="File to output all maximal (not-necessarily-feasible) itemsets") parser.add_argument("-t", "--threshold", help="Minimum support threshold", type=float, required=True) parser.add_argument("-v", "--verbosity", help="Set verbosity level", action="count", default=0) args = parser.parse_args() global verbosity verbosity=args.verbosity if args.threshold < 0: verb_print("Invalid threshold value {}".format(parser.threshold)) return verb_print("Reading graph data... ", 1) graphData = read_file(args.file) verb_print("done!\n", 1) verb_print("Read {} graphs\n".format(len(graphData)), 2) if verbosity >= 5: dist_labels = set() for g in graphData: dist_labels.update(g.labels.keys()) verb_print("There are {} distinct vertex labels\n".format(len(dist_labels)), 5) if args.threshold < 1: threshold = int(args.threshold * len(graphData)) else: threshold = int(args.threshold) verb_print("Converting to transaction data... ", 1) transData = graph_data_to_transaction_data(graphData) verb_print("done\n", 1) verb_print("Transaction data has {} transactions and {} items\n".format(len(transData), len(transData.items)), 2) if verbosity >= 5: nnz = sum([len(t) for t in transData.transactions]) verb_print("\tand {} nonzeros\n".format(nnz), 5) verb_print("Calculating Apriori with threshold {} of {}...\n".format(threshold, len(transData)), 1) (fis, ffis) = apriori(transData, threshold, feasible=is_connected) verb_print("done!\n", 1) verb_print("Found {} frequent itemsets, out of which {} are feasible\n".format(len(fis), len(ffis)), 2) maxFfis = find_maximal(ffis) verb_print("\tand {} of the feasible ones are maximal\n".format(len(maxFfis)), 2) print_output(maxFfis, fis, args.output) if args.maxfis_output != None: maxFis = find_maximal(fis.keys()) verb_print("Found {} maximal (not-necessarily-feasible) frequent itemsets\n".format(len(maxFis)), 2) print_output(maxFis, fis, args.maxfis_output) if __name__ == "__main__": main()