Giter VIP home page Giter VIP logo

Comments (4)

muyannian avatar muyannian commented on May 9, 2024

这种问题貌似前人都已经实现了,看这里
http://www.searchtb.com/2011/07/doclist-compress.html?spm=0.0.0.0.LBo3qP
http://www.searchtb.com/2013/04/pfordelta.html

from mdrill.

muyannian avatar muyannian commented on May 9, 2024

发现lucene在4.0的版本里提供了这种功能(我们决定采用这种方式,测试了下解压缩速度很理想)
参考地址 http://svn.apache.org/repos/asf/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/util/pfor2/PForDelta.java

from mdrill.

muyannian avatar muyannian commented on May 9, 2024

已发布

from mdrill.

muyannian avatar muyannian commented on May 9, 2024

��针对不同的数据列 采用不同的压缩算法,常规列使用PForDelta,像男女,日期采用RepeatCompress
的实现代码如下RepeatCompress

package com.alimama.mdrill.buffer;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;

public class RepeatCompress {

public static class RepeatCompressRtn{
    public int[] bytes;
    public int index;
}

public static void main(String[] args) {
    long t1=System.currentTimeMillis();
    int[] test=new int[10240];
    for(int j=0;j<test.length;j++)
    {
        test[j]=j/100;

    }

    for(int i=0;i<1;i++)
    {

        RepeatCompressRtn compress2=compress(test, test.length);

        int[] compress=new int[compress2.index];
        for(int k=0;k<compress.length;k++)
        {
            compress[k]=compress2.bytes[k];
        }

// System.out.println(Arrays.toString(test));
// System.out.println(Arrays.toString(compress));
int[] result=decompress(compress);

        System.out.println(Arrays.equals(result, test)+","+compress.length+","+test.length+","+compress.length*100d/test.length);
    }
    System.out.println(System.currentTimeMillis()-t1);

// long t1=System.currentTimeMillis();
//
// for(int k=0;k<10;k++)
// {
// int[] test=new int[10240];
// for(int j=0;j<test.length;j++)
// {
// test[j]=(int) (100*Math.random());
//
// }
// RepeatCompressRtn rtn=groupNumEncode(test, test.length);
// int[] compress=new int[rtn.index];
// for(int i=0;i<compress.length;i++)
// {
// compress[i]=rtn.bytes[i];
// }
//
//
// int[] result=groupNumDecode(compress);
// }
//
// System.out.println(System.currentTimeMillis()-t1);

}

public static class groupCount implements Comparable<groupCount>{
    int cnt;
    @Override
    public String toString() {
        return "[cnt=" + cnt + ", val=" + val + "]";
    }
    int val;
    @Override
    public int compareTo(groupCount o) {
        return (this.cnt<o.cnt ? 1 : (this.cnt==o.cnt ? 0 : -1));
    }
}

private static RepeatCompressRtn groupNumEncode(final int[] inBlock, int blockSize)
{
    HashMap<Integer,groupCount> groupCount=new HashMap<Integer,groupCount>(blockSize);
     for(int i=0;i<blockSize;i++)
      {
          int val=inBlock[i];
          groupCount g=groupCount.get(val);
          if(g==null)
          {
              g=new groupCount();
              g.cnt=0;
              g.val=val;
              groupCount.put(val,g);
          }
          g.cnt++;
      }

     ArrayList<groupCount> sortlist=new ArrayList<RepeatCompress.groupCount>();
     sortlist.addAll(groupCount.values());
     Collections.sort(sortlist );

        HashMap<Integer,Integer> repeat=new HashMap<Integer,Integer>(blockSize);

     int[] groupNum=new int[blockSize<<1];
        int groupNumIndex=1;
        for(groupCount g:sortlist)
        {
              int groupNumval=groupNumIndex++;
              repeat.put(g.val, groupNumval);
              groupNum[groupNumval]=g.val;
        }




    RepeatCompressRtn rtn=new RepeatCompressRtn();
    rtn.bytes=new int[blockSize<<2];
    rtn.index=0;
     for(int i=0;i<blockSize;i++)
      {
          int val=inBlock[i];
          Integer g=repeat.get(val);
          rtn.bytes[rtn.index++]=g;
      }


     for(int i=1;i<groupNumIndex;i++)
     {
          rtn.bytes[rtn.index++]=groupNum[i];
     }
      rtn.bytes[rtn.index++]=groupNumIndex-1;

      return rtn;
}

private static int[] groupNumDecode(final int[] inBlock)
{
    int lastNum=inBlock.length-1;
    int[] groupNum=new int[inBlock[lastNum]+1];

    int start=lastNum-inBlock[lastNum];
    for(int i=start;i<lastNum;i++)
    {
        groupNum[i-start+1]=inBlock[i];

    }


    int[] rtn=new int[start];

    for(int i=0;i<start;i++)
    {
        rtn[i]=groupNum[inBlock[i]];
    }
    return rtn;
}


  public static RepeatCompressRtn compress(final int[] inBlock, int blockSize)
  {
      RepeatCompressRtn group=groupNumEncode(inBlock, blockSize);

// RepeatCompressRtn group=new RepeatCompressRtn();
// group.bytes=inBlock;
// group.index=blockSize;

      RepeatCompressRtn rtn=new RepeatCompressRtn();
      rtn.bytes=new int[group.index<<2];
      rtn.index=0;
      int last=Integer.MIN_VALUE;
      int cnt=0;
      for(int i=0;i<group.index;i++)
      {
          int val=group.bytes[i];
          if(last!=val)
          {
              rtn.bytes[rtn.index++]=val<<1;

// System.out.println(i+"@"+val+"@"+Arrays.toString(rtn.bytes));

              last=val;
              cnt=1;
          }else{
              cnt++;
              if(cnt==2)
              {

// System.out.println(i+"@"+"22>"+val+"@"+Arrays.toString(rtn.bytes));
rtn.bytes[rtn.index-1]=(val<<1)+1;
rtn.bytes[rtn.index++]=2;
// System.out.println(i+"@"+"44>"+val+"@"+Arrays.toString(rtn.bytes));

              }else{

// System.out.println(i+"@"+"33"+val+"@"+Arrays.toString(rtn.bytes));

                  rtn.bytes[rtn.index-1]++;
              }
          }
      }
      rtn.bytes[rtn.index++]=group.index;
      return rtn;
  }

  public static int[] decompress(final int[] compress)
  {
      int groupsize=compress.length-1;
      int[] rtn=new int[compress[groupsize]];
      int index=0;
      for(int i=0;i<groupsize;)
      {
          int val=compress[i++];
          int num=val>>1;
          int type=val-(num<<1);
          int cnt=1;
          if(type==1)
          {
              cnt=compress[i++];
          }
          for(int j=0;j<cnt;j++)
          {
              rtn[index++]=num; 
          }
      }

      return groupNumDecode(rtn);
  }

}

from mdrill.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.