# Find the most divisive fingerprint features. # For details see "Optimizing substructure keys" at # http://dalkescientific.com/writings/diary/archive/2012/06/11/optimizing_substructure_keys.html # Written by Andrew Dalke import sys from collections import defaultdict import hashlib import time import inverted_index as inverted_index_lib class LoadUniqueFingerprints(object): def __init__(self): # Map from a feature string like "ccO" to itself (string interning) self.feature_to_id = {} # Map from an integer feature id to a set of record ids self.inverted_indices = defaultdict(set) # Keep track of all of the ids added to the inverted indices # (Ignores duplicate ids) # Don't allow duplicate fingerprints self.seen_fingerprint = set() def get_feature_id(self, feature): # Use the feature string itself as the feature id # This lookup does string interning. try: return self.feature_to_id[feature] except KeyError: self.feature_to_id[feature] = feature return feature def add_record(self, id, feature_ids): # Check for duplicate signatures s = ",".join(feature_ids) signature = hashlib.sha256(s).digest() if signature in self.seen_fingerprint: return self.seen_fingerprint.add(signature) # Make the inverted indices for feature_id in feature_ids: self.inverted_indices[feature_id].add(id) # The data set is avilable from http://dalkescientific.com/inverted_index_benchmark.7z filenames = ["Compound_000000001_000025000.counts.fpc", "Compound_000025001_000050000.counts.fpc", "Compound_000050001_000075000.counts.fpc", "Compound_000075001_000100000.counts.fpc", "Compound_000100001_000125000.counts.fpc", "Compound_000125001_000150000.counts.fpc", "Compound_000150001_000175000.counts.fpc", "Compound_000175001_000200000.counts.fpc", "Compound_000200001_000225000.counts.fpc", "Compound_000225001_000250000.counts.fpc", ] def load_dataset(size_threshold): # removes duplicate fingerprints index_db = LoadUniqueFingerprints() for filename in filenames: print "Loading", filename with open(filename) as input_file: inverted_index_lib.add_dataset(index_db, input_file) inverted_indices = index_db.inverted_indices # remove indices which are too small # Note: the size_threshold here has a somewhat different meaning than in main() # I don't think the distinction is that important. print len(inverted_indices), "features before pruning threshold", size_threshold for feature, feature_set in inverted_indices.items(): if len(feature_set) < size_threshold: del inverted_indices[feature] print len(inverted_indices), "features after pruning" # remove indices which are identical to other indices signatures = defaultdict(list) for feature, feature_set in inverted_indices.iteritems(): s = ",".join(map(str, sorted(feature_set))) signature = hashlib.sha256(s).digest() signatures[signature].append( (len(feature), feature) ) for duplicates in signatures.values(): if len(duplicates) > 1: duplicates.sort() for n, feature in duplicates[1:]: del inverted_indices[feature] print len(inverted_indices), "features after de-duplication" all_ids = set.union(*inverted_indices.values()) return all_ids, inverted_indices def main(): size_threshold = 10 all_ids, inverted_indices = load_dataset(size_threshold) bins = [all_ids] num_records = len(all_ids) bitno = 0 while 1: print "Status:", len(inverted_indices), "features", num_records, "records", \ len(bins), "bins", "largest bin", max(map(len, bins)) most_divisive = None best_score = None redundant_features = [] start_time = time.time() msg = "" # Sort so the largest bins are first. This improves the # chance of doing short-circuit evaluation during scoring # and improves the overall performance. bins.sort(key=len, reverse=True) items = inverted_indices.items() items.sort(reverse=True, key=lambda x: len(x[1])) for featureno, (feature, feature_set) in enumerate(items): if featureno % 100 == 99: msg = "%d/%d" % (featureno+1, len(inverted_indices)) sys.stderr.write("\r" + msg) # How well does this feature partition the data set? is_redundant = True score = 0 early_termination = False if bitno >= 40 and bitno % 20 == 0: # Do a bit of redundant fragment removal every once in a while for bin in bins: num_on = len(bin & feature_set) num_off = len(bin) - num_on if num_on and num_off: is_redundant = False score += (num_on-num_off)**2 else: for bin in bins: if best_score is not None and score > best_score: early_termination = True break num_on = len(bin & feature_set) num_off = len(bin) - num_on if num_on and num_off: is_redundant = False score += (num_on-num_off)**2 if not early_termination: if is_redundant: redundant_features.append(feature) else: if most_divisive is None: most_divisive = (score, feature) best_score = score else: most_divisive = min(most_divisive, (score, feature)) best_score = most_divisive[0] sys.stderr.write("\r" + " "*len(msg) + "\r") end_time = time.time() # Split the bins based on the most divisive feature found print bitno, "Most divisive", most_divisive, "%.1f seconds" % (end_time-start_time) bitno += 1 if most_divisive is None: break n, divisive_feature = most_divisive divisive_feature_set = inverted_indices.pop(divisive_feature) new_bins = [] new_num_records = 0 for bin in bins: on_subset = bin & divisive_feature_set if len(on_subset) >= size_threshold: new_bins.append(on_subset) off_subset = bin - divisive_feature_set if len(off_subset) >= size_threshold: new_bins.append(off_subset) if not new_bins: break bins = new_bins num_records = sum(map(len, bins)) # Redundant features because they can never split any bins. # Remove them from future consideration for redundant_feature in redundant_features: del inverted_indices[redundant_feature] if redundant_features: print "Removed", len(redundant_features), "redundant features" if __name__ == "__main__": main()