Giter VIP home page Giter VIP logo

hdt-node's Introduction

HDT for Node.js

npm version Build Status Dependency Status devDependency Status

HDT (Header Dictionary Triples) is a compressed format for RDF triples.
The hdt npm package for Node.js brings fast access to HDT files through C bindings.

Usage

Importing the library

Install the library by adding hdt to your package.json or executing

$ npm install hdt

Then require the library.

const hdt = require('hdt');

Opening and closing an HDT document

Open an HDT document with hdt.fromFile, which takes a filename as argument and returns the HDT document in a promise. Close the document with close.

hdt.fromFile('./test/test.hdt').then(function(hdtDocument) {
  // Don't forget to close the document when you're done
  return hdtDocument.close();
});

Searching for triples matching a pattern

Search for triples with searchTriples, which takes subject, predicate, object, and options arguments. Subject, predicate, and object can be IRIs or literals, represented as simple strings. If any of these parameters is null or a variable, it is considered a wildcard. Optionally, an offset and limit can be passed in an options object, selecting only the specified subset.

The promise returns an object with an array of triples, the total number of expected triples for the pattern, and whether the total count is an estimate or exact.

var doc;
hdt.fromFile('./test/test.hdt')
  .then(function(hdtDocument) {
    doc = hdtDocument;
    return doc.searchTriples('http://example.org/s1', null, null, { offset: 0, limit: 10 })
  })
  .then(function(result) {
    console.log('Approximately ' + result.totalCount + ' triples match the pattern.');
    result.triples.forEach(function (triple) { console.log(triple); });
    return doc.close();
  });

Counting triples matching a pattern

Retrieve an estimate of the total number of triples matching a pattern with countTriples, which takes subject, predicate, and object arguments.

var doc;
hdt.fromFile('./test/test.hdt')
  .then(function(hdtDocument) {
    doc = hdtDocument;
    return doc.countTriples('http://example.org/s1', null, null);
  })
  .then(function(result) {
    console.log('Approximately ' + result.totalCount + ' triples match the pattern.');
    return doc.close()
  });

Search terms starting with a prefix

Find terms (literals and IRIs) that start with a given prefix.

hdtDocument.searchTerms({ prefix: 'http://example.org/', limit: 100, position: 'object' })
  .then(function(suggestions) {
    console.log('Found ' + suggestions.length + ' suggestions');
    return hdtDocument.close();
  });

Fetching unique predicates for a subject and/or an object

Find all unique predicates for a given subject argument.

hdtDocument.searchTerms({ subject: 'http://example.org/s1', limit: 10, position: 'predicate' })
  .then(function(terms) {
    console.log('Found ' + terms.length + ' unique predicates');
    return hdtDocument.close();
  });

Find all unique predicates for a given object argument.

hdtDocument.searchTerms({ object: 'http://example.org/o1', limit: 10, position: 'predicate' })
  .then(function(terms) {
    console.log('Found ' + terms.length + ' unique predicates');
    return hdtDocument.close();
  });

Find all unique predicates for given subject and object arguments.

hdtDocument.searchTerms({ subject: 'http://example.org/s1', object: 'http://example.org/o1', limit: 10, position: 'predicate' })
  .then(function(terms) {
    console.log('Found ' + terms.length + ' unique predicates');
    return hdtDocument.close();
  });

Searching literals containing a substring

In an HDT file that was generated with an FM index, you can search for literals that contain a certain substring.

var doc;
hdt.fromFile('./test/test.hdt')
  .then(function(hdtDocument) {
    doc = hdtDocument;
    return doc.searchLiterals('b', { offset: 0, limit: 5 });
  })
  .then(function(result) {
    console.log('Approximately ' + result.totalCount + ' literals contain the pattern.');
    result.literals.forEach(function (literal) { console.log(literal); });
    return doc.close();
  });

Reading the header

HDT supports reading the header as string using document.readHeader(). The example below reads the header as string, and parses the header using the N3.js library.

var N3 = require('n3');
var doc;
var parser = N3.Parser();
hdt.fromFile('./test/test.hdt')
  .then(function(hdtDocument) {
    doc = hdtDocument;
    return doc.readHeader();
  })
  .then(function(header) {
    var triples = [];
    return new Promise(function(resolve, reject) {
      parser.parse(header, function(error, triple) {
        if (error) return reject(error);
        if (triple) return triples.push(triple);
        resolve(triples);
      });
    });
  })
  .then(function(triples) {
    console.log('Read triples from header:\n', triples);
  })
  .catch(function(e) {
    console.error(e);
  })

Changing the header

To replace header information of an HDT, use document.changeHeader(header, toFile), which returns an HDT document of the output file. The example below serializes an N3 triples object into an N-Triples string, and stores it in the header.

var N3 = require('n3');
var doc;
var outputFile = './out.hdt';

hdt.fromFile('./test/test.hdt')
 .then(function(hdtDocument) {
   doc = hdtDocument;
   return new Promise(function(resolve, reject) {
     var writer = N3.Writer({format: 'N-Triples'});
     writer.addTriple('http://example.org/cartoons#Tom',
                      'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
                      'http://example.org/cartoons#Cat');
     writer.end(function(error, triples) {
       if (error) return reject(error);
       resolve(triples);
     });
   });
 })
 .then(function(triples) {
      return doc.changeHeader(triples, outputFile);
  })
  .then(function(createdDocument) {
    return createdDocument.readHeader();
  })
  .then(function(result) {
    console.log('Wrote ' + result + ' to ' + outputFile);
  })
  .catch(function(e) {
    console.error(e);
  });

Standalone utility

The standalone utility hdt allows you to query HDT files from the command line.
To install system-wide, execute:

sudo npm install -g hdt

Specify queries as follows:

hdt dataset.hdt --query '?s ?p ?o' --offset 200 --limit 100 --format turtle

Replace any of the query variables by an IRI or literal to match specific patterns.

Build manually

To build the module from source, follow these instructions:

git clone https://github.com/RubenVerborgh/HDT-Node.git hdt
cd hdt
git submodule init
git submodule update
npm install
npm test

If you make changes to the source, do the following to rebuild:

node-gyp build && npm test

License

The Node.js bindings for HDT are written by Ruben Verborgh.

This code is copyrighted by Ruben Verborgh and released under the GNU Lesser General Public License. It uses the HDT C++ Library, released under the same license.

hdt-node's People

Contributors

dinikolop avatar jaw111 avatar laurensrietveld avatar rubensworks avatar rubenverborgh avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

hdt-node's Issues

Fetching distinct predicates, given a subject

As a follow-up of issue #25 and related PR #26, we are creating another PR which contains an enhancement of searchTerms method. Now, it will be able to fetch a list of distinct predicates given a subject.

Some examples of the proposed functionality are:

hdtDocument.searchTerms({ object: 'http://example.org/o1', limit: 10, position: 'predicate' }) // This is already possible
hdtDocument.searchTerms({ subject: 'http://example.org/s1', limit: 10, position: 'predicate' })
hdtDocument.searchTerms({ object: 'http://example.org/o1', subject: 'http://example.org/s1', limit: 10, position: 'predicate' }) 

User will still not be allowed to pass filter while passing object or/and subject as arguments.
In the future, we can add support for more combinations, e.g. fetching distinct subjects given an object, etc.

@RubenVerborgh what do you think about it?

Stream implementation

Have you ever considered a stream implementation, instead of a callback? Any interest in that direction?

Fetch distinct predicates

We're writing a PR now for fetching the distinct predicates for a given object. Essentially:

SELECT DISTINCT ?p WHERE {?s ?p <o>} LIMIT 10

Before we submit the PR, we should probably agree on the JS API interface.

Our suggestion:

  • Use the searchTerms function
  • It's currently used to do prefix-based search, and takes as argument: {limit: 1, position:'subject', prefix:'http'}
  • We suggest adjusting searchTerms to support the following:
searchTerms({
  position:'predicate', 
  limit: 10,
  filter: {
    position: 'object',
    value: 'some-value'
  }
})

Our current implementation will only cover fetching the distinct predicates for a particular object. This might change in the future.

For simplicity, I suggest not to allow the user to pass a prefix and a filter argument at the same time

Does that sound like a plan?

Read/write the HDT header

hdt-cpp allows for both writing and reading the header. Let's add that to the HDT-Node api as well

.hdt.index files are incompatible with other HDT library versions

When I use this module in Node, I can't open the HDT file in HDT-it! afterwards. Also, if I first open the HDT file in HDT-it!, and I try to open it in Node afterwards, it gives an error: [Error: Trying to read a LOGArray but data is not LogArray].

Is it because this module and HDT-it use different versions of the library? Is it possible to solve this somehow?

Error: process.stdout cannot be closed.

When executing hdt /data/dbpedia/data_3_9.hdt --query '?s ?p ?o' --offset 200 --limit 100 --format turtle , the following error occurs:

...
    <http://dbpedia.org/property/title> "Feel Good Hit of the Fall"@en, "Hammerhead"@en, "Intensify"@en.
events.js:141
      throw er; // Unhandled 'error' event
      ^

Error: process.stdout cannot be closed.
    at WriteStream.stdout.destroy.stdout.destroySoon (node.js:631:20)
    at WriteStream.onSocketFinish (net.js:196:17)
    at emitNone (events.js:67:13)
    at WriteStream.emit (events.js:166:7)
    at finishMaybe (_stream_writable.js:468:14)
    at afterWrite (_stream_writable.js:347:3)
    at doNTCallbackMany (node.js:461:18)
    at process._tickCallback (node.js:359:17)

Add RDFJS support (and update N3.js)

What's your take on this?

I guess this would mean changing the C++ code to return a different object structure and change addTriples to addQuads in bin/hdt, but not much more.

Develop vs master branch

Seems the master branch is now ahead of the develop branch. Is this intended?
Reason I'm asking is we're going to submit a PR for a bug we found, and are wondering what the target branch should be

Point submodule to hdt-cpp develop branch

What about pointing the hdt-node develop branch to the hdt-cpp one? We'd have to modify the node-gyp bindings as the hdt-cpp dir structure changed as well.

Benefits:

  • The hdt-cpp develop branch will include an offsetting fix (part of rdfhdt/hdt-cpp#147)
  • We can easily release a new HDT-node version as soon as hdt-cpp is released (see rdfhdt/hdt-cpp#150)

Prefix search for literals/strings

Using HDT-it! browser, there's a prefix search for literals, when typing in the fields. It could be nice to allow in this module as well. Something along the lines of:

hdtDocument.findLiteralsByPrefix(
  '"nice ',
  { offset: 0, limit: 10 },
  function(error, literals, totalCount) {
    console.log('Found approx. %s literals matching prefix pattern: %s', totalCount, literals.join(' '));
  }
);

Do you think it's possible to implement?

Current version does not install

There is a problem with the current version: the deps/libcds/includes directory is not part of the published npm package.
This omission is caused by this .gitignore file: https://github.com/rdfhdt/hdt-cpp/blob/develop/libcds/include/.gitignore

We'll submit a PR to hdt-cpp removing the gitignore and release a patch version. I'll leave it up to you to release the HDT-node patch version.

The reason we missed this issue is that we were testing against the HDT-Node git repo, which doesn't exclude files that npm publish would exclude. We could consider adding npm pack to the travis build, and trying to install the output of that command, but not sure whether that's worth the effort.

Literal datatype notation in query

In HDT-Node, the query syntax for a datatyped-literal is "val"^^http://datatype. In the HDT-CLI, the syntax is "val"^^<http://datatype>.
I get the motivation for HDT-Node to remove the angular brackets, as regular IRIs don't have these as well and you'd like to present a consistent API to the user.
I get the impression though that this representation only makes it more confusing (at least it does for me), for the following reasons:

  • The HDT cli does require angular brackets
  • Parsing the literal using common libs is not possible, it's not standards compliant
  • The inclusion of quotes in a literal implies that we're seeing a standard serialized term. (I know that these quotes are required to be able to differentiate between IRIs and literals. Still, it's confusing ;) )

Any thoughts?

Generate HDT file

What do you think about binding the HDT generation procedures of the C++ code as well? Part of the roadmap, or do you consider it out of the scope of this lib?

Updating submodule to new hdt-cpp release

Shall we update the release to the newest release of hdt-cpp?
I've tested an updated hdt-cpp dep locally in hdt-node, and all works fine.

(could submit a pr for this, but guess if you update the submodule directly that saves both of us time)

Inconsistency while searching literal terms and triples with literals

When somebody wants to run searchTerms for a literal prefix that includes (part of) its datatype, they should specify the bracket(s). However, removing the bracket(s) will not return suggestions.

On the other hand, calling searchTriples for a literal with its bracketed datatype, you won't get back results. Calling with no brackets is the only way to get results:

Returns results?
doc.searchTerms({prefix: '"a"^^<ab', position: 'object'}) Yes
doc.searchTerms({prefix: '"a"^^ab', position: 'object'}) No
doc.searchTriples(null, null, '"a"^^<abc>') No
doc.searchTriples(null, null, '"a"^^abc') Yes

I think there should be results in all cases in order to avoid this inconsistency. What do you think @RubenVerborgh ?
I can push at #37 , since I noticed during applying requested changes there.

(Bug) Install hdt 1.5.0 on Raspbian

Hello !
I'm trying to install ldf-server on a Raspberry Pi with Raspbian and I reached this error when installing the hdt lib.
npm install -g ldf-server or npm install -g hdt reach the same error.
I checked my gcc or g++ install and everything seems good.

This happens with:

  • node v9.3.0 (current)
  • gcc version 4.6.3 (Debian 4.6.3-14+rpi1)
  • g++ (Debian 4.6.3-14+rpi1) 4.6.3

Raspbian version:

  • Distributor ID: Debian
  • Description: Debian GNU/Linux 7.11 (wheezy)
  • Release: 7.11
  • Codename: wheezy

Tested with the previous raspbian version with:

  • node: 6.9.5, 6.11.4, 7.4.0, 8.9.3 and 9.3.0
  • gcc/g++/cc/c++: 4.6, 4.7, 4.8

I think this is a problem with Raspbian itself .

Stack trace:
capture du 2017-12-14 14 10 50

This issue is more a warning than a bug issue, but if anyone has a solution other than upgrading the Raspbian version it would be appreciated ! 😃

Edit 12/14/2017, 16:35:

After updating my rasp to the Stretch version:

  • install nvm
  • install node LTS and current
  • verifying latest version of gcc/g++
  • trying: npm install -g ldf-server or npm install -g hdt or npm install -g hdt@1.5.0
    Still failing on the same error.
    capture du 2017-12-14 16 34 57

Is this error a bug for anyone else ?

Unable to build on Windows

Hi, I'm having a problem with building this package on a Windows-like system. Though I am not an expert, I guess it's because of using bash commands in binding.gyp.
To be more precise - error from console:

'ls' is not recognized as an internal or external command

I was able to build it in Docker (node:latest), which I am using now as a dev environment, but at first I spent some time trying to figure out what was going on.

A Limit of 0 returns all results

In C++, the logical negation of 0 is 1 (i.e. !0 evaluates to true). This causes a problem with Limit set to 0 (no triples should ever be returned) on https://github.com/RubenVerborgh/HDT-Node/blob/master/lib/HdtDocument.cc#L165, where instead, the complete iterator is added to the vector. This causes the countTriples() method, which sets limit = 0, to be extremely inefficient.

The main problem here is that 'no limit' is also supplied as 0. A possible solution is to encode 'no limit' as -1 or infinity.

Concurrency protection

Doing more than one operation at the same time (because of async) can quickly blow up the memory.
For instance, when doing a series of lookups in a for loop with searchTriples, this can more or less only be throttled using await.

Could we have a more graceful handling/exit, probably by:

  • queuing operations so that only a limited number (one?) of CPP operations can be active simultaneously;
  • reject promises when the queue is too large;
  • something else?

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.