rare-technologies / smart_open
Utils for streaming large files (S3, HDFS, gzip, bz2...)
License: MIT License
It's not exactly related to smart_open, but it currently suffers from this issue as well: opening any key in a bucket whose name contains dots generates the error below:
smart_open.smart_open('s3://bucket.name.with.dots/full/path/to/key')
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in smart_open(uri, mode, **kw)
120 elif parsed_uri.scheme in ("s3", "s3n"):
121 s3_connection = boto.connect_s3(aws_access_key_id=parsed_uri.access_id, aws_secret_access_key=parsed_uri.access_secret)
--> 122 bucket = s3_connection.get_bucket(parsed_uri.bucket_id)
123 if mode in ('r', 'rb'):
124 key = bucket.get_key(parsed_uri.key_id)
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/s3/connection.pyc in get_bucket(self, bucket_name, validate, headers)
500 """
501 if validate:
--> 502 return self.head_bucket(bucket_name, headers=headers)
503 else:
504 return self.bucket_class(self, bucket_name)
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/s3/connection.pyc in head_bucket(self, bucket_name, headers)
519 :returns: A <Bucket> object
520 """
--> 521 response = self.make_request('HEAD', bucket_name, headers=headers)
522 body = response.read()
523 if response.status == 200:
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/s3/connection.pyc in make_request(self, method, bucket, key, headers, data, query_args, sender, override_num_retries, retry_handler)
662 data, host, auth_path, sender,
663 override_num_retries=override_num_retries,
--> 664 retry_handler=retry_handler
665 )
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/connection.pyc in make_request(self, method, path, headers, data, host, auth_path, sender, override_num_retries, params, retry_handler)
1069 params, headers, data, host)
1070 return self._mexe(http_request, sender, override_num_retries,
-> 1071 retry_handler=retry_handler)
1072
1073 def close(self):
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/connection.pyc in _mexe(self, request, sender, override_num_retries, retry_handler)
941 else:
942 connection.request(request.method, request.path,
--> 943 request.body, request.headers)
944 response = connection.getresponse()
945 boto.log.debug('Response headers: %s' % response.getheaders())
/usr/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
1050 def request(self, method, url, body=None, headers={}):
1051 """Send a complete request to the server."""
-> 1052 self._send_request(method, url, body, headers)
1053
1054 def _set_content_length(self, body, method):
/usr/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
1090 for hdr, value in headers.iteritems():
1091 self.putheader(hdr, value)
-> 1092 self.endheaders(body)
1093
1094 def getresponse(self, buffering=False):
/usr/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1046 else:
1047 raise CannotSendHeader()
-> 1048 self._send_output(message_body)
1049
1050 def request(self, method, url, body=None, headers={}):
/usr/lib/python2.7/httplib.pyc in _send_output(self, message_body)
890 msg += message_body
891 message_body = None
--> 892 self.send(msg)
893 if message_body is not None:
894 #message_body was not a string (i.e. it is a file) and
/usr/lib/python2.7/httplib.pyc in send(self, data)
852 if self.sock is None:
853 if self.auto_open:
--> 854 self.connect()
855 else:
856 raise NotConnected()
/usr/lib/python2.7/httplib.pyc in connect(self)
1271
1272 self.sock = self._context.wrap_socket(self.sock,
-> 1273 server_hostname=server_hostname)
1274
1275 __all__.append("HTTPSConnection")
/usr/lib/python2.7/ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
350 suppress_ragged_eofs=suppress_ragged_eofs,
351 server_hostname=server_hostname,
--> 352 _context=self)
353
354 def set_npn_protocols(self, npn_protocols):
/usr/lib/python2.7/ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
577 # non-blocking
578 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 579 self.do_handshake()
580
581 except (OSError, ValueError):
/usr/lib/python2.7/ssl.pyc in do_handshake(self, block)
814 raise ValueError("check_hostname needs server_hostname "
815 "argument")
--> 816 match_hostname(self.getpeercert(), self.server_hostname)
817
818 def _real_connect(self, addr, connect_ex):
/usr/lib/python2.7/ssl.pyc in match_hostname(cert, hostname)
269 raise CertificateError("hostname %r "
270 "doesn't match either of %s"
--> 271 % (hostname, ', '.join(map(repr, dnsnames))))
272 elif len(dnsnames) == 1:
273 raise CertificateError("hostname %r "
CertificateError: hostname 'bucket.with.dots' doesn't match either of '*.s3.amazonaws.com', 's3.amazonaws.com'
In our projects we worked around this just by setting the calling_format; I think we could do the same in smart_open:
from boto.s3.connection import S3Connection, OrdinaryCallingFormat

return S3Connection(
    AWS_ACCESS_KEY,
    AWS_SECRET_ACCESS_KEY,
    validate_certs=True,
    # necessary to work around the extra security introduced:
    # OrdinaryCallingFormat uses path-style addressing, which avoids the
    # wildcard-certificate mismatch for dotted bucket names
    calling_format=OrdinaryCallingFormat(),
)
Check if boto3 is production-ready and migrate all code from boto to boto3.
with smart_open.smart_open('s3://balihoo.fulfillment.dev/retain_7_0/test.txt', 'wb') as f:
    f.write('test')
results in:
ssl.CertificateError: hostname 'balihoo.fulfillment.dev.s3.amazonaws.com' doesn't match either of '*.s3.amazonaws.com', 's3.amazonaws.com'
I tried to fix this with the 'host' parameter, but I can't get it to write the file.
Hi-
In LICENSE you have an MIT license, but in the trove classifier for licenses, you have "Public Domain". In fact, those are separate licensing regimes.
Instead of License :: Public Domain in your setup.py, you probably want License :: OSI Approved :: MIT License, so the metadata in/from PyPI is correct.
The boto S3 connection allows proxy, proxy_port, proxy_user and proxy_pass to be set explicitly. It would be nice to have a way to propagate those options from smart_open to the opening of the connection.
Maybe the best way to go about this is to allow the boto connection object to be passed to smart_open, instead of a new one being created each time.
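For illustration, a sketch of the second idea: the proxy keyword arguments are boto's real connect_s3 parameters, while the s3_connection keyword passed to smart_open is hypothetical, not an existing API.

import boto
import smart_open

# proxy, proxy_port, proxy_user and proxy_pass are real boto parameters;
# the s3_connection keyword below is hypothetical, not smart_open's API.
conn = boto.connect_s3(
    aws_access_key_id='...',
    aws_secret_access_key='...',
    proxy='proxy.example.com',
    proxy_port=8080,
    proxy_user='user',
    proxy_pass='secret',
)
with smart_open.smart_open('s3://bucket/key', s3_connection=conn) as fin:
    data = fin.read()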
smart_open is not working with buckets that contain . in the name, such as com.ziky90.project.
Specifically, I am getting an error log that looks as follows.
Traceback (most recent call last):
File "/home/ubuntu/bucket_accessor/model_loader.py", line 70, in <module>
model = TModel.load(model_path)
File "/usr/local/lib/python2.7/dist-packages/gensim/utils.py", line 252, in load
obj = unpickle(fname)
File "/usr/local/lib/python2.7/dist-packages/gensim/utils.py", line 906, in unpickle
with smart_open(fname) as f:
File "/usr/local/lib/python2.7/dist-packages/smart_open/smart_open_lib.py", line 93, in smart_open
return S3OpenRead(parsed_uri)
File "/usr/local/lib/python2.7/dist-packages/smart_open/smart_open_lib.py", line 191, in __init__
self.read_key = s3_connection.get_bucket(parsed_uri.bucket_id).lookup(parsed_uri.key_id)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 502, in get_bucket
return self.head_bucket(bucket_name, headers=headers)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 521, in head_bucket
response = self.make_request('HEAD', bucket_name, headers=headers)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 664, in make_request
retry_handler=retry_handler
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 1071, in make_request
retry_handler=retry_handler)
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 943, in _mexe
request.body, request.headers)
File "/usr/lib/python2.7/httplib.py", line 1001, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 1035, in _send_request
self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 997, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 850, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 812, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 1212, in connect
server_hostname=server_hostname)
File "/usr/lib/python2.7/ssl.py", line 350, in wrap_socket
_context=self)
File "/usr/lib/python2.7/ssl.py", line 566, in __init__
self.do_handshake()
File "/usr/lib/python2.7/ssl.py", line 796, in do_handshake
match_hostname(self.getpeercert(), self.server_hostname)
File "/usr/lib/python2.7/ssl.py", line 269, in match_hostname
% (hostname, ', '.join(map(repr, dnsnames))))
CertificateError: hostname 'com.ziky90.project.s3.amazonaws.com' doesn't match either of '*.s3.amazonaws.com', 's3.amazonaws.com'
NOTE: (I use smart_open from gensim)
It seems to me that the bug is related to boto/boto#2836, and I am wondering if, for example, replacing boto with boto3 would help?
It'd be nice if there were a smart_open(..., parents=True) parameter which (for local files) would create parent directories if necessary, much like mkdir --parents.
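A minimal sketch of the local-file case, using a hypothetical helper (this is not smart_open's API):

import os

def _open_with_parents(path, mode):
    # Hypothetical helper: create missing parent directories before
    # opening a local file for writing or appending.
    if 'w' in mode or 'a' in mode:
        parent = os.path.dirname(path)
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)
    return open(path, mode)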
This new check has removed the ability to write gzipped files to S3.
It looks like native gzipping is being added to smart_open, and that's why this check was put in place. However, until the new write functionality is added, this check should be removed in order to allow users to write their own compressed streams.
When the system environment variable LC_ALL=C is set, I cannot install smart_open. The problem is in the dependency httpretty, since setup.py requires the version httpretty==0.8.6, which is known not to work with LC_ALL=C. The error I get is this:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 133: ordinal not in range(128)
httpretty fixed this error in version 0.8.8, so I am wondering if it would be possible to relax the requirement to httpretty>=0.8.6?
I actually discovered this when trying to install gensim, which also did not work since it requires smart_open.
Was there a reason to pin requests==2.8.1 instead of the much friendlier "<=2.8.1"?
That version of requests pulls in a broken version of urllib3 that cannot do SSL properly, so we have had to pin "requests<=2.7.0". Since you pinned to that specific version of requests, I have to also say:
'gensim <= 0.12.1', # so it doesn't pull in smart-open newest that pins requests to 2.8.1
'smart-open < 1.3.1', # yuck.
'requests', # <= 2.7.0', # pinned until this is released: https://github.com/shazow/urllib3/issues/717
If you used "requests<=2.8.1" then everything would work fine... unless you have some need of 2.8.1 specifically in smart-open?
smart_open works very well for local gzipped files and for files on S3, but is it not possible to stream gzipped files on S3?
I tried, and it doesn't seem to decompress while streaming; is there a reason for it not to work, or is it just not implemented?
thanks
When trying to use smart_open in App Engine, I get the following error:
from subprocess import _args_from_interpreter_flags
ImportError: cannot import name _args_from_interpreter_flags
Loosely translated via Google, this means there is no multiprocessing on App Engine; see
Any ideas how to tackle this? App Engine is a very interesting platform for smart_open due to its lack of local disk access and limited memory.
Could you add git tags for the releases?
If I try to use smart_open to seek/read parts of an S3 file, I get NotImplementedError: seek other than offset=0 not implemented yet.
Arbitrary seeking, especially when the seek is specified relative to the beginning of the file (seek(..., whence=0)), should be possible through the Range HTTP header:
>>> import boto
>>> s3 = boto.connect_s3()
>>> bucket = s3.lookup('bucket')
>>> key = bucket.lookup('key')
>>> parts = key.get_contents_as_string(headers={'Range' : 'bytes=12-24'})
seek could establish a pointer to the starting byte, and subsequent reads would define the end.
Are there any technical limitations or design restrictions that would prevent this?
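A minimal sketch of the idea; the class is illustrative, not smart_open's implementation:

import boto

class RangeSeekReader(object):
    # Hypothetical reader: emulate absolute seek/read on an S3 key by
    # fetching byte ranges via the HTTP Range header.
    def __init__(self, key):
        self.key = key
        self.pos = 0

    def seek(self, offset, whence=0):
        assert whence == 0, 'this sketch only handles absolute seeks'
        self.pos = offset

    def read(self, size):
        headers = {'Range': 'bytes=%d-%d' % (self.pos, self.pos + size - 1)}
        data = self.key.get_contents_as_string(headers=headers)
        self.pos += len(data)
        return data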
Here https://github.com/RaRe-Technologies/smart_open/blob/master/smart_open/smart_open_lib.py#L626 and here https://github.com/RaRe-Technologies/smart_open/blob/master/smart_open/smart_open_lib.py#L630 we should be passing in the filename, not the file object.
I have a few questions:
@ziky90 What are examples of when smart_open comes in handy for HDFS and WebHDFS?
Someone invited me to talk about smart_open for 20 minutes. I have created the code comparisons for S3 below and would like to expand them to other filesystems. What pains has smart_open solved for you with HDFS or WebHDFS?
Trying to open a file on S3 with a @ in the filename or prefix raises a RuntimeError:
uri = 's3://bucketname/docs/[email protected]/test.pdf'
f = smart_open(uri, 'rb')
RuntimeError Traceback (most recent call last)
<ipython-input-60-6156dc49e12c> in <module>()
----> 1 f = smart_open(uri, 'rb')
/data/environs/nlp/local/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in smart_open(uri, mode, **kw)
120 # this method just routes the request to classes handling the specific storage
121 # schemes, depending on the URI protocol in `uri`
--> 122 parsed_uri = ParseUri(uri)
123
124 if parsed_uri.scheme in ("file", ):
/data/environs/nlp/local/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in __init__(self, uri, default_scheme)
251 # Bucket names can contain lowercase letters, numbers, and hyphens.
252 # Each label must start and end with a lowercase letter or a number.
--> 253 raise RuntimeError("invalid S3 URI: %s" % uri)
254 elif self.scheme == 'file':
255 self.uri_path = parsed_uri.netloc + parsed_uri.path
RuntimeError: invalid S3 URI: s3://bucketname/docs/[email protected]/test.pdf
Missing tests for the disable multiprocessing functionality. How to create them is discussed in #39
If we do .seek(0) on an open S3 file instance and call readline on it afterwards, it just keeps returning the empty string '', and the iterator is consumed at that point. Please find the code snippet link below:
https://dpaste.de/VXTi
EDIT: Updated with code snippet link to dpaste
Since this uses an old version of boto, it will not load instance role credentials, making it unusable in a typical AWS production environment.
I just wanted to take a moment here and thank all the people who spent the time and made Smart Open possible.
Guys, you rock!
Is it possible to get the last_modified time of the file when using the s3_iter_bucket function?
If not, would you consider adding a flag so that the modified time is returned as part of the tuple?
Thanks
Currently only binary-mode read is supported, which would be OK if the stream you returned worked with TextIOWrapper -- but it doesn't, at least not S3OpenRead. TextIOWrapper expects readable, writable, seekable, closed etc. methods. If you make S3OpenRead inherit from (or quack like) IOBase, that should fix it.
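A sketch of that adapter approach, assuming an S3OpenRead-like object exposing a plain read(size) method:

import io

class S3RawAdapter(io.RawIOBase):
    # Hypothetical adapter: expose an S3OpenRead-like object through the
    # io interface that TextIOWrapper expects.
    def __init__(self, inner):
        self.inner = inner

    def readable(self):
        return True

    def readinto(self, b):
        data = self.inner.read(len(b))
        b[:len(data)] = data
        return len(data)

# usage:
# text = io.TextIOWrapper(io.BufferedReader(S3RawAdapter(reader)), encoding='utf-8')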
Facing the following error while installing smart_open 1.3.4 from source.
On extracting the bz2file source (tar.gz) and placing the folder's path in PYTHONPATH, I expected bz2file to be visible to other Python scripts. Using version bz2file-0.98 with Python 2.6.6.
PS: This machine has no connectivity to the internet, hence the download errors.
>> python libs/smart_open-1.3.4/setup.py test
running test
Checking .pth file support in .
/usr/bin/python -E -c pass
Searching for bz2file
Reading http://pypi.python.org/simple/bz2file/
Download error: [Errno 101] Network is unreachable -- Some packages may not be found!
Reading http://pypi.python.org/simple/bz2file/
Download error: [Errno 101] Network is unreachable -- Some packages may not be found!
Couldn't find index page for 'bz2file' (maybe misspelled?)
Scanning index of all packages (this may take a while)
Reading http://pypi.python.org/simple/
Download error: [Errno 101] Network is unreachable -- Some packages may not be found!
No local packages or download links found for bz2file
error: Could not find suitable distribution for Requirement.parse('bz2file')
I was trying to read a compressed file from S3 on EC2, and it did not print any lines:
for myKey, content in smart_open.s3_iter_bucket(email_bucket):
    with smart_open.smart_open(myKey) as data:
        for line in data:
            print line
Then I ran:
data = smart_open.smart_open(myKey)
for line in data:
    print line
I think this is caused by zlib. Any insight on how to troubleshoot this?
I am receiving an error each time I try to upload a large stream (not just a file):
[2016-08-01 20:13:48,253: ERROR/Worker-4] encountered error while terminating multipart upload; attempting cancel
Traceback (most recent call last):
File "/opt/mypath/myapp/apps/lib/awsutils/s3stream.py", line 21, in bucket_put_stream
fout.write(buffer)
File "/opt/mypath/venv/lib/python3.4/site-packages/smart_open/smart_open_lib.py", line 487, in write
self.mp.upload_part_from_file(BytesIO(buff), part_num=self.parts + 1)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/multipart.py", line 260, in upload_part_from_file
query_args=query_args, size=size)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/key.py", line 1293, in set_contents_from_file
chunked_transfer=chunked_transfer, size=size)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/key.py", line 750, in send_file
chunked_transfer=chunked_transfer, size=size)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/key.py", line 951, in _send_file_internal
query_args=query_args
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/connection.py", line 668, in make_request
retry_handler=retry_handler
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/connection.py", line 1071, in make_request
retry_handler=retry_handler)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/connection.py", line 1030, in _mexe
raise ex
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/connection.py", line 940, in _mexe
request.body, request.headers)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/key.py", line 844, in sender
http_conn.send(chunk)
File "/usr/lib/python3.4/http/client.py", line 888, in send
self.sock.sendall(data)
File "/usr/lib/python3.4/ssl.py", line 741, in sendall
v = self.send(data[count:])
File "/usr/lib/python3.4/ssl.py", line 702, in send
v = self._sslobj.write(data)
ConnectionResetError: [Errno 104] Connection reset by peer
The error is thrown by boto, so the issue might be related to boto/boto#2207. I tried all the workarounds, but no luck.
My code looks like this:
def bucket_put_stream(stream, bucket, key):
    awskey = AWS_SETTINGS['AWS']['ACCESS_KEY_ID']
    awssecret = AWS_SETTINGS['AWS']['SECRET_ACCESS_KEY']
    size = 0
    with smart_open.smart_open('s3://%s:%s@%s/%s' % (awskey, awssecret, bucket, key), 'wb') as fout:
        for data in stream:
            fout.write(data)
            size += len(data)
    return size
A small file of 17 KB was uploaded without issues, so the problem shouldn't be permissions.
The fact that smart_open is streaming is great and nice to use.
However, sometimes we still need the whole file.
Would there be a way to cache the file content, to make sure we don't re-read the whole file every time?
S3 files in our case never change, for example; passing around the smart_open read object, they would be re-read completely every time, right?
Something like smart_open.smart_open('s3://bucket/key', cached=True), for example, could be a possible API for this?
Do you think it's feasible/desirable?
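A rough sketch of what a cached read could do under the hood; the helper below is purely illustrative, and the cached=True keyword above is only a proposal:

import shutil
import tempfile

import smart_open

def open_cached(uri):
    # Hypothetical helper: spool the remote stream into a local
    # temporary file once, then return a seekable, re-readable handle.
    tmp = tempfile.TemporaryFile()
    with smart_open.smart_open(uri) as fin:
        shutil.copyfileobj(fin, tmp)
    tmp.seek(0)
    return tmp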
This would be pretty useful to copy from generic shell accounts to S3 and vice versa.
My idea is to add the possibility to read from WebHDFS.
It would probably need to be implemented as a wrapper on top of the WebHDFS REST API:
https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
Based on observations of the API:
Would this addition be appreciated in smart_open? I can try to work on a PR, hopefully in the near future.
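For reference, a minimal sketch of such a wrapper using the documented OPEN operation; the function and its parameters are illustrative, not a committed design:

import requests

def webhdfs_read(host, port, path, user=None):
    # OPEN is a documented WebHDFS operation; this generator streams
    # the file body in chunks.
    url = 'http://%s:%s/webhdfs/v1%s' % (host, port, path)
    params = {'op': 'OPEN'}
    if user:
        params['user.name'] = user
    response = requests.get(url, params=params, stream=True)
    response.raise_for_status()
    for chunk in response.iter_content(chunk_size=16 * 1024):
        yield chunk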
Creating a file with a filename containing the # char (e.g. aa#aa) and trying to open it with smart_open.smart_open('aa#aa') fails.
In [3]: open('aa#aa')
Out[3]: <open file 'aa#aa', mode 'r' at 0x7f4ec5b72ed0>
vs
In [2]: smart_open('aa#aa')
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
<ipython-input-2-e0a7775bdb92> in <module>()
----> 1 s('aa#aa')
/usr/local/lib/python2.7/dist-packages/smart_open-1.2.1-py2.7.egg/smart_open/smart_open_lib.pyc in smart_open(uri, mode)
87 # local files -- both read & write supported
88 # compression, if any, is determined by the filename extension (.gz, .bz2)
---> 89 return file_smart_open(parsed_uri.uri_path, mode)
90
91 if mode in ('r', 'rb'):
/usr/local/lib/python2.7/dist-packages/smart_open-1.2.1-py2.7.egg/smart_open/smart_open_lib.pyc in file_smart_open(fname, mode)
299 return make_closing(GzipFile)(fname, mode)
300
--> 301 return open(fname, mode)
302
303
IOError: [Errno 2] No such file or directory: 'aa'
S3OpenRead always returns data as bytes in Python 3, which breaks things expecting str. S3OpenRead needs to respect 'r' mode instead of returning everything as bytes.
For some weird reason I can't seem to run tests with "python setup.py test"; the error is below.
I'm getting this with both Python 2.7.10 and Python 3.4. Any idea why?
$> python setup.py test
running test
running egg_info
writing requirements to smart_open.egg-info/requires.txt
writing smart_open.egg-info/PKG-INFO
writing top-level names to smart_open.egg-info/top_level.txt
writing dependency_links to smart_open.egg-info/dependency_links.txt
reading manifest file 'smart_open.egg-info/SOURCES.txt'
writing manifest file 'smart_open.egg-info/SOURCES.txt'
running build_ext
Traceback (most recent call last):
File "setup.py", line 66, in <module>
'Topic :: Database :: Front-Ends',
File "/usr/lib/python2.7/distutils/core.py", line 151, in setup
dist.run_commands()
File "/usr/lib/python2.7/distutils/dist.py", line 953, in run_commands
self.run_command(cmd)
File "/usr/lib/python2.7/distutils/dist.py", line 972, in run_command
cmd_obj.run()
File "/home/andrea/.virtualenvs/smart_open/local/lib/python2.7/site-packages/setuptools/command/test.py", line 142, in run
self.with_project_on_sys_path(self.run_tests)
File "/home/andrea/.virtualenvs/smart_open/local/lib/python2.7/site-packages/setuptools/command/test.py", line 122, in with_project_on_sys_path
func()
File "/home/andrea/.virtualenvs/smart_open/local/lib/python2.7/site-packages/setuptools/command/test.py", line 163, in run_tests
testRunner=self._resolve_as_ep(self.test_runner),
File "/usr/lib/python2.7/unittest/main.py", line 94, in __init__
self.parseArgs(argv)
File "/usr/lib/python2.7/unittest/main.py", line 149, in parseArgs
self.createTests()
File "/usr/lib/python2.7/unittest/main.py", line 158, in createTests
self.module)
File "/usr/lib/python2.7/unittest/loader.py", line 130, in loadTestsFromNames
suites = [self.loadTestsFromName(name, module) for name in names]
File "/usr/lib/python2.7/unittest/loader.py", line 103, in loadTestsFromName
return self.loadTestsFromModule(obj)
File "/home/andrea/.virtualenvs/smart_open/local/lib/python2.7/site-packages/setuptools/command/test.py", line 37, in loadTestsFromModule
tests.append(self.loadTestsFromName(submodule))
File "/usr/lib/python2.7/unittest/loader.py", line 100, in loadTestsFromName
parent, obj = obj, getattr(obj, part)
AttributeError: 'module' object has no attribute 'test_smart_open'
If I use pytest, even though I have installed all the dependencies, I get:
________________________________ ERROR collecting smart_open/tests/test_smart_open.py _________________________________
../../.local/lib/python2.7/site-packages/py/_path/local.py:650: in pyimport
__import__(modname)
smart_open/__init__.py:1: in <module>
from .smart_open_lib import *
smart_open/smart_open_lib.py:34: in <module>
from boto.compat import BytesIO, urlsplit, six
E ImportError: No module named boto.compat
=============================================== 1 error in 0.09 seconds =
Currently smart_open is limiting file modes to ('r', 'rb', 'w', 'wb') here.
I don't think there is any reason for this -- remove the restriction and allow whatever mode is supported by the underlying storage (such as wb+).
It would be useful if S3 file-like objects also implemented the "flush" method (which is available on local file-like objects).
This would make it easier to use them as drop-in replacements for file-like objects.
This could be a no-op or could genuinely force a flush of data.
What do you think?
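A sketch of the no-op variant on a hypothetical writer class (a real implementation could instead force the current multipart buffer out):

class S3WriterWithFlush(object):
    # Hypothetical writer: buffering/multipart logic would live in write().
    def write(self, data):
        pass

    def flush(self):
        # A no-op keeps drop-in compatibility with callers that invoke
        # flush() unconditionally.
        pass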
WebHDFS allows passing additional parameters such as user.name through the query part of the URI.
At the moment the library simply never uses the query part, making it impossible to use those options when accessing files.
On line 249 I have noticed a minor bug in reading from HDFS.
The command used there is hadoop fs -cat, which, based on the documentation http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/FileSystemShell.html, reads any file mounted into Hadoop (HDFS, local FS, HFTP FS, S3 FS, and others).
To correctly meet what smart_open claims to do, hdfs dfs -cat should be used instead (see the sketch below):
http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HDFSCommands.html#dfs
I can do a PR with the fix if this is really a bug.
If this is expected behaviour, then I would change the documentation and would not limit the file scheme to only HDFS.
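A sketch of the proposed change, assuming the file is read by shelling out to the CLI as the line reference above suggests:

import subprocess

hdfs_path = '/user/me/file.txt'  # illustrative path

# before: subprocess.Popen(['hadoop', 'fs', '-cat', hdfs_path], stdout=subprocess.PIPE)
process = subprocess.Popen(['hdfs', 'dfs', '-cat', hdfs_path], stdout=subprocess.PIPE)
for line in process.stdout:
    pass  # stream the file line by line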
Doing a smart_open('https://url/file.xml.gz') passes the ReadStream to the GzipFile during make_compression.
The file object passed here is the HttpReadStream:
https://github.com/RaRe-Technologies/smart_open/blob/master/smart_open/smart_open_lib.py#L632
The read stream doesn't have the tell() method that the gzip library expects.
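One possible workaround is a thin wrapper that counts consumed bytes; this is a sketch, not smart_open's code:

class TellWrapper(object):
    # Hypothetical wrapper: track the number of bytes read so that
    # GzipFile can call tell() on a non-seekable HTTP stream.
    def __init__(self, inner):
        self.inner = inner
        self._pos = 0

    def read(self, size=-1):
        data = self.inner.read(size)
        self._pos += len(data)
        return data

    def tell(self):
        return self._pos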
I have a pre-signed S3 URL for a gzipped xml file.
https://bucket.s3.amazonaws.com/file.xml.gz?AWSAccessKeyId=asdf&Expires=2493144554&Signature=qasdf
The fname.endswith('.gz') check won't evaluate to True because of the URL parameters:
https://github.com/RaRe-Technologies/smart_open/blob/master/smart_open/smart_open_lib.py#L759
I suggest removing all query parameters when finding the file name.
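A sketch of that suggestion; the helper name is mine, not the library's:

from urlparse import urlsplit  # urllib.parse on Python 3

def filename_for_extension_check(uri):
    # Hypothetical helper: drop the query string (and fragment) so that
    # extension sniffing sees only the path.
    return urlsplit(uri).path

# filename_for_extension_check('https://bucket.s3.amazonaws.com/file.xml.gz?AWSAccessKeyId=...')
# -> '/file.xml.gz', which now correctly ends with '.gz'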
Another feature that I would like to have in smart_open is support for directly reading/writing data to/from FTP.
Anyone else interested in this? Does this make sense?
Large bz2 files are often processed with pbzip2 and end up being multistream files. Python 2.7 does not handle them yet (Python 3 does). bz2file is a backport of that feature (by its author) to Python 2.7:
https://github.com/nvawda/bz2file
https://pypi.python.org/pypi/bz2file/0.98
I got bitten by that, so I wanted to share it...
/HTH
The Travis CI tests sometimes hang, and this may be indicative of a bug in the s3_iter_bucket routine, which uses multiprocessing.
The failing tests mess up unrelated PRs (they need manual test restarts, like in #61), so we want this fixed ASAP.
The missing readline in S3OpenRead is related to, and explained in more detail in, piskvorky/gensim#421.
It reads them, but the data remains compressed, thus defeating line iteration.
This would be useful for extracting the header from a CSV.
Windows-style URIs for the local filesystem, such as c:\users\whatever, trip up urlsplit() -- it detects c as the scheme.
And then smart_open fails with "unknown URI scheme", of course.
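One possible guard, sketched here (not necessarily the fix chosen in #82):

import re

def looks_like_windows_path(uri):
    # Hypothetical check: a single drive letter followed by ':' and a
    # path separator is a local Windows path, not a URI scheme.
    return re.match(r'^[A-Za-z]:[\\/]', uri) is not None

# looks_like_windows_path(r'c:\users\whatever')  -> True, treat as local file
# looks_like_windows_path('s3://bucket/key')     -> False, parse as URI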
Implement the solution described by mpenkov in #82 (comment)
I ran into a bug while trying to stream a fairly large line of data from a gzipped S3 file. Basically:
if len(self.unused_buffer) > size:
    return self.read_from_buffer()
is an incorrect call of read_from_buffer, which needs to take the argument size. I think the correct fix is as simple as:
if len(self.unused_buffer) > size:
    return self.read_from_buffer(size)
This seems to work in my particular case. I'm not 100% sure this is the generally correct solution, which is why I'm posting this issue rather than making a PR.
Trying to write a non-trivial amount, but less than the chunk size (say 500 KB), fails with Connection reset by peer:
DEBUG:boto:establishing HTTPS connection: host=test-analytics.s3.amazonaws.com, kwargs={'port': 443, 'timeout': 70}
DEBUG:boto:Token: None
DEBUG:boto:StringToSign:
PUT
application/octet-stream
Thu, 11 Aug 2016 03:35:38 GMT
/test-analytics/mykey.txt?partNumber=1&uploadId=h_wZktx5teVtch7Ss69B0gaMDKYyHuzJ92CBGLDmNrav_AZ4uFKlJXZwI8Bpmg8t8_zLS70bJNuQGBtJj880XbbPF19od0WMuM8OBGVlrlhO9mhuYF9cmAM2253Vl9LZ
DEBUG:boto:Signature:
AWS **************************************
DEBUG:boto:Final headers: {'Content-Length': '3400000', 'Content-MD5': 'GvkoeM9sWF4kgm51pkdlQw==', 'Expect': '100-Continue', 'Date': 'Thu, 11 Aug 2016 03:35:38 GMT', 'User-Agent': 'Boto/2.42.0 Python/2.7.11+ Linux/4.4.0-31-generic', 'Content-Type': 'application/octet-stream', 'Authorization': u'************************'}
DEBUG:boto:encountered error exception, reconnecting
import logging
logging.basicConfig(level=logging.DEBUG)
import smart_open

with smart_open.smart_open('s3://my-bucket/mykey.txt', 'wb') as fout:
    for n in range(0, 100000):
        for line in ['first line', 'second line', 'third line']:
            fout.write(line + '\n')
Whilst the example code works fine:
import logging
logging.basicConfig(level=logging.DEBUG)
import smart_open

with smart_open.smart_open('s3://my-bucket/mykey.txt', 'wb') as fout:
    for line in ['first line', 'second line', 'third line']:
        fout.write(line + '\n')