Giter VIP home page Giter VIP logo

bloomfilter's Introduction

布隆过滤器

什么是布隆过滤器

**布隆过滤器本质上是一个数据结构,它可以用来判断某个元素是否在集合内,具有运行快速,内存占用小的特点。 而高效插入和查询的代价就是,Bloom Filter 是一个基于概率的数据结构:它只能告诉我们一个元素绝对不在集合内或可能在集合内。

A Bloom filter is a data structure designed to tell you, rapidly and memory-efficiently, whether an element is present in a set. The price paid for this efficiency is that a Bloom filter is a probabilistic data structure: it tells us that the element either definitely is not in the set or may be in the set.**

过滤器类型 FilterTypes

类型 Type 特点 Feature
MemoryFilter 基于内存的过滤器,可以通过将字节内容加载到内存后进行操作 Filter based on memory. All the operations would be done in memory.
CachedRedisFilter 基于MemoryFilter, 在使用Write方法时上传数据到Redis Filter based on MemoryFilter, the result will be uploaded to redis server when the Write function is called.
SQLFilter 基于MemoryFilter, 在使用Write方法时上传数据到Sql数据库。 Filter based on MemoryFilter, the result will be uploaded to sql database when the Write function is called.
InteractiveRedisFilter 交互式的Redis布隆过滤器,使用PushExists方法时,都将实时与Redis服务器通讯。 Interactive filter which will update the redis BitMap through SetBit and GetBit by Push and Exists function.

使用方法 Usage

MemoryFilter

func TestMemoryFilter(t *testing.T) {
	// Initial a memory filter
	memFilter := bloomfilter.NewMemoryFilter(make([]byte, 10240), bloomfilter.DefaultHash...)
	// Push 2000-3000 numbers to the filter.
	// 把2000-3000的数字压入过滤器
	fillNums(memFilter, 2000, 3000)
	// Check whether 2500-3000 and 3001-3500 exist in the filter or not.
	// 查看2500-3000,以及3001-3500是否存在于过滤器中
	for i := 2500; i < 3500; i++ {
		t.Logf("%d, %t", i, memFilter.Exists([]byte(strconv.Itoa(i))))
	}
}

CachedRedisFilter

var options = &redis.Options{
	Addr:     "192.168.20.101:6379",
	Username: "",
	Password: "",
	DB:       0,
}

var key = "test"

func TestRedisCachedFilter(t *testing.T) {
	cli := redis.NewClient(options)
	cachedFilter, err := bloomfilter.NewRedisFilter(context.TODO(), cli, bloomfilter.RedisFilterType_Cached, key, 10240, bloomfilter.DefaultHash...)
	if err != nil {
		t.Fatal(err)
	}
	t.Log("误判率,False positive rate:", bloomfilter.GetFlasePositiveRate(10240 * 8, 3, 2)) // 5.364385288193686e-11
	fillNums(cachedFilter, 250, 300)
	t.Log(cachedFilter.Exists([]byte(strconv.Itoa(290)))) // true
	t.Log(cachedFilter.Exists([]byte(strconv.Itoa(299)))) // true
	t.Log(cachedFilter.Exists([]byte(strconv.Itoa(350)))) // false
	// must use write to save the data to redis.
	// 必须使用write方法将数据全部提交到redis服务器
	cachedFilter.Write()
}

SQLFilter

var sqlDSN = "user:password@tcp(ip:port)/database?charset=utf8mb4&parseTime=True&loc=Local"

func TestSqlFilter(t *testing.T) {
	// Init gorm.DB
	db, err := gorm.Open(mysql.Open(sqlDSN))
	if err != nil {
		t.Fatal(err)
	}
	// Init SQLFilter
	sqlFilter, err := bloomfilter.SqlFilter(db, "test", 1000, bloomfilter.DefaultHash...)
	if err != nil {
		t.Fatal(err)
	}
	// Push 250-300 numbers to the filter.
	// 把250-300的数字压入过滤器
	fillNums(sqlFilter, 250, 300)
	sqlFilter.Write()
}

func TestSqlFilterExist(t *testing.T) {
	db, err := gorm.Open(mysql.Open(sqlDSN))
	if err != nil {
		t.Fatal(err)
	}
	sqlFilter, err := bloomfilter.SqlFilter(db, "test", 1000, bloomfilter.DefaultHash...)
	if err != nil {
		t.Fatal(err)
	}
	// 280-300 should exist in filter, and 301-320 doesn't.
	// 280-300应该在过滤器中,而301-320不应该在。
	for i:=280;i<320;i++{
		t.Logf("%d: %t", i, sqlFilter.Exists([]byte(strconv.Itoa(i))))
	}
}

InteractiveRedisFilter

func TestInteractiveFilter(t *testing.T) {
	cli := redis.NewClient(options)
	interactiveFilter, err := bloomfilter.NewRedisFilter(context.TODO(), cli, bloomfilter.RedisFilterType_Interactive, key, 10240, bloomfilter.DefaultHash...)
	if err != nil {
		t.Fatal(err)
	}
	fillNums(interactiveFilter, 250, 300)

	anotherFilter, _ := bloomfilter.NewRedisFilter(context.TODO(), cli, bloomfilter.RedisFilterType_Interactive, key, 10240, bloomfilter.DefaultHash...)

	t.Log(interactiveFilter.Exists([]byte(strconv.Itoa(290)))) // true
	t.Log(anotherFilter.Exists([]byte(strconv.Itoa(290))))     // true

	t.Log(interactiveFilter.Exists([]byte(strconv.Itoa(299)))) // true
	t.Log(anotherFilter.Exists([]byte(strconv.Itoa(299))))     // true

	t.Log(interactiveFilter.Exists([]byte(strconv.Itoa(320)))) // false
	t.Log(anotherFilter.Exists([]byte(strconv.Itoa(320))) )    // false
}

计算误差 Calculate the false positive rate

func TestMemFalsePositiveRate(t *testing.T) {
	memFilter := bloomfilter.NewMemoryFilter(make([]byte, 10240), bloomfilter.DefaultHash...).(*memory.Filter)
	testFalsePositiveRate(t, memFilter, 10240 * 8, 1000, 3, 100000000)
	//=== RUN   TestMemFalsePositiveRate
	//    common.go:37: 理论误判率 Theoretical false positive rate: 0.0018230817954481005
	//    common.go:49: 实际误判率 Real false positive rate:0.00047424474244742447
	//--- PASS: TestMemFalsePositiveRate (7.43s)
	//PASS
}

计算信息熵,用来测试哈希函数是否是均匀分布的。 Calculate the Information Entropy to test if the hash function is uniformly distributed.

func TestIsHashFuncUniformlyDistributed(t *testing.T) {
	for _, f := range bloomfilter.DefaultHash{
		t.Log(bloomfilter.CalculateInformationEntropy(f))
	}
	//不满足的hash,已舍弃
	t.Log("abandoned hash function:")
	t.Log(bloomfilter.CalculateInformationEntropy(func() hash.Hash64{return crc64.New(crc64.MakeTable(crc64.ISO))}))
}

联系方式 Contact

讲解视频 https://www.bilibili.com/video/BV1bJ411t7A8/ 邮箱 [mailto:[email protected]]

End

bloomfilter's People

Contributors

hugh2632 avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar  avatar

bloomfilter's Issues

结果不准确

基于内存的模式添加500万数据,拿任何数据,碰撞得出的结果都是存在。

func TestMemoryFilter(t *testing.T) {
	bf := bloomfilter.NewMemoryFilter(make([]byte, 10240), bloomfilter.DefaultHash[:5]...)
	bf.Clear()

	//file, err := os.Open("./1.txt")
	file, err := os.Open("/Users/pandy/go/src/go-sdk/data/domain.txt")
	defer file.Close()
	if err != nil {
		fmt.Printf("Error: %s\n", err)
		return
	}

	reader := bufio.NewReader(file)
	countWrite := 0
	for {
		a, _, c := reader.ReadLine()
		if c == io.EOF {
			break
		}

		bf.Push(a)
		countWrite++
	}

	bf.Write()
	t.Log("write num:", countWrite)

	// 入数据之后 进行碰撞检测
	countRead := 0
	//file2, err := os.Open("/Users/pandy/go/src/go-sdk/data/domain_read.txt")
	file2, err := os.Open("./2.txt")
	defer file.Close()
	if err != nil {
		fmt.Printf("Error: %s\n", err)
		return
	}

	reader2 := bufio.NewReader(file2)
	for {
		a, _, c := reader2.ReadLine()
		if c == io.EOF {
			break
		}

		if bf.Exists(a) {
			countRead++
		} else {
			t.Log("not hash", string(a))
		}
	}

	t.Log("hashed num:", countRead)

	bf.Clear()
	bf.Close()
}

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.