// The MIT License (MIT) // Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt // Permission is hereby granted, free of charge, to any person obtaining a copy of // this software and associated documentation files (the "Software"), to deal in // the Software without restriction, including without limitation the rights to // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of // the Software, and to permit persons to whom the Software is furnished to do so, // subject to the following conditions: // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. package bbloom import ( "bytes" "encoding/json" "errors" "math" "sync" "unsafe" ) // helper var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128} func getSize(ui64 uint64) (size uint64, exponent uint64) { if ui64 < uint64(512) { ui64 = uint64(512) } size = uint64(1) for size < ui64 { size <<= 1 exponent++ } return size, exponent } func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) { size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2) locs := math.Ceil(float64(0.69314718056) * size / numEntries) return uint64(size), uint64(locs) } var ErrUsage = errors.New("usage: New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries), float64(ratio_of_false_positives)) i.e. New(float64(1000), float64(0.03))") var ErrInvalidParms = errors.New("One of parameters was outside of allowed range") // New // returns a new bloomfilter func New(params ...float64) (bloomfilter *Bloom, err error) { var entries, locs uint64 if len(params) == 2 { if params[0] < 0 || params[1] < 0 { return nil, ErrInvalidParms } if params[1] < 1 { entries, locs = calcSizeByWrongPositives(math.Max(params[0], 1), params[1]) } else { entries, locs = uint64(params[0]), uint64(params[1]) } } else { return nil, ErrUsage } size, exponent := getSize(uint64(entries)) bloomfilter = &Bloom{ sizeExp: exponent, size: size - 1, setLocs: locs, shift: 64 - exponent, bitset: make([]uint64, size>>6), } return bloomfilter, nil } // NewWithBoolset // takes a []byte slice and number of locs per entry // returns the bloomfilter with a bitset populated according to the input []byte func NewWithBoolset(bs *[]byte, locs uint64) (bloomfilter *Bloom) { bloomfilter, err := New(float64(len(*bs)<<3), float64(locs)) if err != nil { panic(err) // Should never happen } ptr := uintptr(unsafe.Pointer(&bloomfilter.bitset[0])) for _, b := range *bs { *(*uint8)(unsafe.Pointer(ptr)) = b ptr++ } return bloomfilter } // bloomJSONImExport // Im/Export structure used by JSONMarshal / JSONUnmarshal type bloomJSONImExport struct { FilterSet []byte SetLocs uint64 } // // Bloom filter type Bloom struct { Mtx sync.RWMutex bitset []uint64 sizeExp uint64 size uint64 setLocs uint64 shift uint64 content uint64 } func (bl *Bloom) ElementsAdded() uint64 { return bl.content } // <--- http://www.cse.yorku.ca/~oz/hash.html // modified Berkeley DB Hash (32bit) // hash is casted to l, h = 16bit fragments // func (bl Bloom) absdbm(b *[]byte) (l, h uint64) { // hash := uint64(len(*b)) // for _, c := range *b { // hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash // } // h = hash >> bl.shift // l = hash << bl.shift >> bl.shift // return l, h // } // Update: found sipHash of Jean-Philippe Aumasson & Daniel J. Bernstein to be even faster than absdbm() // https://131002.net/siphash/ // siphash was implemented for Go by Dmitry Chestnykh https://github.com/dchest/siphash // Add // set the bit(s) for entry; Adds an entry to the Bloom filter func (bl *Bloom) Add(entry []byte) { bl.content++ l, h := bl.sipHash(entry) for i := uint64(0); i < (*bl).setLocs; i++ { bl.set((h + i*l) & (*bl).size) } } // AddTS // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry func (bl *Bloom) AddTS(entry []byte) { bl.Mtx.Lock() defer bl.Mtx.Unlock() bl.Add(entry[:]) } // Has // check if bit(s) for entry is/are set // returns true if the entry was added to the Bloom Filter func (bl *Bloom) Has(entry []byte) bool { l, h := bl.sipHash(entry) res := true for i := uint64(0); i < bl.setLocs; i++ { res = res && bl.isSet((h+i*l)&bl.size) // Branching here (early escape) is not worth it // This is my conclusion from benchmarks // if !res { // return false // } } return res } // HasTS // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry func (bl *Bloom) HasTS(entry []byte) bool { bl.Mtx.RLock() defer bl.Mtx.RUnlock() return bl.Has(entry[:]) } // AddIfNotHas // Only Add entry if it's not present in the bloomfilter // returns true if entry was added // returns false if entry was allready registered in the bloomfilter func (bl *Bloom) AddIfNotHas(entry []byte) (added bool) { l, h := bl.sipHash(entry) contained := true for i := uint64(0); i < bl.setLocs; i++ { prev := bl.getSet((h + i*l) & bl.size) contained = contained && prev } if !contained { bl.content++ } return !contained } // AddIfNotHasTS // Tread safe: Only Add entry if it's not present in the bloomfilter // returns true if entry was added // returns false if entry was allready registered in the bloomfilter func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) { bl.Mtx.Lock() defer bl.Mtx.Unlock() return bl.AddIfNotHas(entry[:]) } // Clear // resets the Bloom filter func (bl *Bloom) Clear() { bl.Mtx.Lock() defer bl.Mtx.Unlock() for i, _ := range (*bl).bitset { bl.bitset[i] = 0 } bl.content = 0 } // Set // set the bit[idx] of bitsit func (bl *Bloom) set(idx uint64) { ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3)) *(*uint8)(ptr) |= mask[idx%8] } func (bl *Bloom) getSet(idx uint64) bool { ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3)) res := *(*uint8)(ptr)&mask[idx%8] > 0 *(*uint8)(ptr) |= mask[idx%8] return res } // IsSet // check if bit[idx] of bitset is set // returns true/false func (bl *Bloom) isSet(idx uint64) bool { ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3)) return *(*uint8)(ptr)&mask[idx%8] > 0 } // JSONMarshal // returns JSON-object (type bloomJSONImExport) as []byte func (bl *Bloom) JSONMarshal() ([]byte, error) { bl.Mtx.RLock() defer bl.Mtx.RUnlock() bloomImEx := bloomJSONImExport{} bloomImEx.SetLocs = uint64(bl.setLocs) bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3) ptr := uintptr(unsafe.Pointer(&bl.bitset[0])) for i := range bloomImEx.FilterSet { bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(ptr)) ptr++ } data, err := json.Marshal(bloomImEx) return data, err } // JSONUnmarshal // takes JSON-Object (type bloomJSONImExport) as []bytes // returns bloom32 / bloom64 object func JSONUnmarshal(dbData []byte) *Bloom { bloomImEx := bloomJSONImExport{} json.Unmarshal(dbData, &bloomImEx) buf := bytes.NewBuffer(bloomImEx.FilterSet) bs := buf.Bytes() bf := NewWithBoolset(&bs, bloomImEx.SetLocs) return bf } func (bl *Bloom) FillRatio() float64 { count := uint64(0) for _, b := range bl.bitset { count += uint64(popcount(b)) } return float64(count) / float64(bl.size+1) } func popcount(x uint64) uint { const ( m1 = 0x5555555555555555 //binary: 0101... m2 = 0x3333333333333333 //binary: 00110011.. m4 = 0x0f0f0f0f0f0f0f0f //binary: 4 zeros, 4 ones ... h01 = 0x0101010101010101 //the sum of 256 to the power of 0,1,2,3... ) x -= (x >> 1) & m1 //put count of each 2 bits into those 2 bits x = (x & m2) + ((x >> 2) & m2) //put count of each 4 bits into those 4 bits x = (x + (x >> 4)) & m4 //put count of each 8 bits into those 8 bits return uint((x * h01) >> 56) } // // alternative hashFn // func (bl Bloom) fnv64a(b *[]byte) (l, h uint64) { // h64 := fnv.New64a() // h64.Write(*b) // hash := h64.Sum64() // h = hash >> 32 // l = hash << 32 >> 32 // return l, h // } // // // <-- http://partow.net/programming/hashfunctions/index.html // // citation: An algorithm proposed by Donald E. Knuth in The Art Of Computer Programming Volume 3, // // under the topic of sorting and search chapter 6.4. // // modified to fit with boolset-length // func (bl Bloom) DEKHash(b *[]byte) (l, h uint64) { // hash := uint64(len(*b)) // for _, c := range *b { // hash = ((hash << 5) ^ (hash >> bl.shift)) ^ uint64(c) // } // h = hash >> bl.shift // l = hash << bl.sizeExp >> bl.sizeExp // return l, h // }