rare-technologies / smart_open
Utils for streaming large files (S3, HDFS, gzip, bz2...)
License: MIT License
It's not exactly related to smart_open, but it currently suffers from this issue as well: opening any key in a bucket whose name contains dots generates the error below:
smart_open.smart_open('s3://bucket.name.with.dots/full/path/to/key')
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in smart_open(uri, mode, **kw)
120 elif parsed_uri.scheme in ("s3", "s3n"):
121 s3_connection = boto.connect_s3(aws_access_key_id=parsed_uri.access_id, aws_secret_access_key=parsed_uri.access_secret)
--> 122 bucket = s3_connection.get_bucket(parsed_uri.bucket_id)
123 if mode in ('r', 'rb'):
124 key = bucket.get_key(parsed_uri.key_id)
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/s3/connection.pyc in get_bucket(self, bucket_name, validate, headers)
500 """
501 if validate:
--> 502 return self.head_bucket(bucket_name, headers=headers)
503 else:
504 return self.bucket_class(self, bucket_name)
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/s3/connection.pyc in head_bucket(self, bucket_name, headers)
519 :returns: A <Bucket> object
520 """
--> 521 response = self.make_request('HEAD', bucket_name, headers=headers)
522 body = response.read()
523 if response.status == 200:
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/s3/connection.pyc in make_request(self, method, bucket, key, headers, data, query_args, sender, override_num_retries, retry_handler)
662 data, host, auth_path, sender,
663 override_num_retries=override_num_retries,
--> 664 retry_handler=retry_handler
665 )
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/connection.pyc in make_request(self, method, path, headers, data, host, auth_path, sender, override_num_retries, params, retry_handler)
1069 params, headers, data, host)
1070 return self._mexe(http_request, sender, override_num_retries,
-> 1071 retry_handler=retry_handler)
1072
1073 def close(self):
/home/andrea/.virtualenvs/cookie-sync-processing/lib/python2.7/site-packages/boto/connection.pyc in _mexe(self, request, sender, override_num_retries, retry_handler)
941 else:
942 connection.request(request.method, request.path,
--> 943 request.body, request.headers)
944 response = connection.getresponse()
945 boto.log.debug('Response headers: %s' % response.getheaders())
/usr/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
1050 def request(self, method, url, body=None, headers={}):
1051 """Send a complete request to the server."""
-> 1052 self._send_request(method, url, body, headers)
1053
1054 def _set_content_length(self, body, method):
/usr/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
1090 for hdr, value in headers.iteritems():
1091 self.putheader(hdr, value)
-> 1092 self.endheaders(body)
1093
1094 def getresponse(self, buffering=False):
/usr/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1046 else:
1047 raise CannotSendHeader()
-> 1048 self._send_output(message_body)
1049
1050 def request(self, method, url, body=None, headers={}):
/usr/lib/python2.7/httplib.pyc in _send_output(self, message_body)
890 msg += message_body
891 message_body = None
--> 892 self.send(msg)
893 if message_body is not None:
894 #message_body was not a string (i.e. it is a file) and
/usr/lib/python2.7/httplib.pyc in send(self, data)
852 if self.sock is None:
853 if self.auto_open:
--> 854 self.connect()
855 else:
856 raise NotConnected()
/usr/lib/python2.7/httplib.pyc in connect(self)
1271
1272 self.sock = self._context.wrap_socket(self.sock,
-> 1273 server_hostname=server_hostname)
1274
1275 __all__.append("HTTPSConnection")
/usr/lib/python2.7/ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
350 suppress_ragged_eofs=suppress_ragged_eofs,
351 server_hostname=server_hostname,
--> 352 _context=self)
353
354 def set_npn_protocols(self, npn_protocols):
/usr/lib/python2.7/ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
577 # non-blocking
578 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 579 self.do_handshake()
580
581 except (OSError, ValueError):
/usr/lib/python2.7/ssl.pyc in do_handshake(self, block)
814 raise ValueError("check_hostname needs server_hostname "
815 "argument")
--> 816 match_hostname(self.getpeercert(), self.server_hostname)
817
818 def _real_connect(self, addr, connect_ex):
/usr/lib/python2.7/ssl.pyc in match_hostname(cert, hostname)
269 raise CertificateError("hostname %r "
270 "doesn't match either of %s"
--> 271 % (hostname, ', '.join(map(repr, dnsnames))))
272 elif len(dnsnames) == 1:
273 raise CertificateError("hostname %r "
CertificateError: hostname 'bucket.with.dots' doesn't match either of '*.s3.amazonaws.com', 's3.amazonaws.com'
In our projects we worked around this just by setting the calling_format; I think we could do the same in smart_open:
from boto.s3.connection import S3Connection, OrdinaryCallingFormat

return S3Connection(
    AWS_ACCESS_KEY,
    AWS_SECRET_ACCESS_KEY,
    validate_certs=True,
    # necessary to work around the extra security introduced:
    # OrdinaryCallingFormat uses path-style addressing, which avoids the
    # wildcard-certificate mismatch for dotted bucket names
    calling_format=OrdinaryCallingFormat(),
)
Check if boto3 is production-ready and migrate all code from boto to boto3.
with smart_open.smart_open('s3://balihoo.fulfillment.dev/retain_7_0/test.txt', 'wb') as f:
    f.write('test')
results in:
ssl.CertificateError: hostname 'balihoo.fulfillment.dev.s3.amazonaws.com' doesn't match either of '*.s3.amazonaws.com', 's3.amazonaws.com'
I tried to fix this with the 'host' parameter, but I can't get it to write the file.
Hi-
In LICENSE you have an MIT license, but in the trove classifier for licenses, you have "Public Domain". In fact, those are separate licensing regimes.
Instead of License :: Public Domain in your setup.py, you probably want License :: OSI Approved :: MIT License, so the metadata in/from PyPI is correct.
The boto S3 connection allows proxy, proxy_port, proxy_user and proxy_pass to be set explicitly. It would be nice to have a way to propagate those options from smart_open to the opening of the connection.
Maybe the best way to go about this is to allow the boto connection object to be passed to smart_open, instead of a new one being created each time.
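For illustration, a sketch of the second idea: the proxy keyword arguments are boto's real connect_s3 parameters, while the s3_connection keyword passed to smart_open is hypothetical, not an existing API.

import boto
import smart_open

# proxy, proxy_port, proxy_user and proxy_pass are real boto parameters;
# the s3_connection keyword below is hypothetical, not smart_open's API.
conn = boto.connect_s3(
    aws_access_key_id='...',
    aws_secret_access_key='...',
    proxy='proxy.example.com',
    proxy_port=8080,
    proxy_user='user',
    proxy_pass='secret',
)
with smart_open.smart_open('s3://bucket/key', s3_connection=conn) as fin:
    data = fin.read()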
smart_open is not working with buckets that contain . in the name, such as com.ziky90.project.
Specifically, I am getting an error log that looks as follows.
Traceback (most recent call last):
File "/home/ubuntu/bucket_accessor/model_loader.py", line 70, in <module>
model = TModel.load(model_path)
File "/usr/local/lib/python2.7/dist-packages/gensim/utils.py", line 252, in load
obj = unpickle(fname)
File "/usr/local/lib/python2.7/dist-packages/gensim/utils.py", line 906, in unpickle
with smart_open(fname) as f:
File "/usr/local/lib/python2.7/dist-packages/smart_open/smart_open_lib.py", line 93, in smart_open
return S3OpenRead(parsed_uri)
File "/usr/local/lib/python2.7/dist-packages/smart_open/smart_open_lib.py", line 191, in __init__
self.read_key = s3_connection.get_bucket(parsed_uri.bucket_id).lookup(parsed_uri.key_id)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 502, in get_bucket
return self.head_bucket(bucket_name, headers=headers)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 521, in head_bucket
response = self.make_request('HEAD', bucket_name, headers=headers)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 664, in make_request
retry_handler=retry_handler
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 1071, in make_request
retry_handler=retry_handler)
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 943, in _mexe
request.body, request.headers)
File "/usr/lib/python2.7/httplib.py", line 1001, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 1035, in _send_request
self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 997, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 850, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 812, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 1212, in connect
server_hostname=server_hostname)
File "/usr/lib/python2.7/ssl.py", line 350, in wrap_socket
_context=self)
File "/usr/lib/python2.7/ssl.py", line 566, in __init__
self.do_handshake()
File "/usr/lib/python2.7/ssl.py", line 796, in do_handshake
match_hostname(self.getpeercert(), self.server_hostname)
File "/usr/lib/python2.7/ssl.py", line 269, in match_hostname
% (hostname, ', '.join(map(repr, dnsnames))))
CertificateError: hostname 'com.ziky90.project.s3.amazonaws.com' doesn't match either of '*.s3.amazonaws.com', 's3.amazonaws.com'
NOTE: (I use smart_open from gensim)
It seems to me that the bug is related to boto/boto#2836, and I am wondering if, for example, replacing boto with boto3 would help?
It'd be nice if there were a smart_open(..., parents=True) parameter which (for local files) would create parent directories if necessary, much like mkdir --parents.
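A minimal sketch of the local-file case, using a hypothetical helper (this is not smart_open's API):

import os

def _open_with_parents(path, mode):
    # Hypothetical helper: create missing parent directories before
    # opening a local file for writing or appending.
    if 'w' in mode or 'a' in mode:
        parent = os.path.dirname(path)
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)
    return open(path, mode)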
This new check has removed the ability to write gzipped files to S3.
It looks like native gzipping is being added to smart_open, and that's why this check was put in place. However, until the new write functionality is added, this check should be removed in order to allow users to write their own compressed streams.
When the system environment variable LC_ALL=C is set, I cannot install smart_open. The problem is in the dependency httpretty, since setup.py requires the version httpretty==0.8.6, which is known not to work with LC_ALL=C. The error I get is this:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 133: ordinal not in range(128)
httpretty fixed this error in version 0.8.8, so I am wondering if it would be possible to relax the requirement to httpretty>=0.8.6?
I actually discovered this when trying to install gensim, which also did not work since it requires smart_open.
Was there a reason to pin requests==2.8.1 instead of the much friendlier "<=2.8.1"?
That version of requests pulls in a broken version of urllib3 that cannot do SSL properly, so we have had to pin "requests<=2.7.0". Since you pinned to that specific version of requests, I have to also say:
'gensim <= 0.12.1', # so it doesn't pull in smart-open newest that pins requests to 2.8.1
'smart-open < 1.3.1', # yuck.
'requests', # <= 2.7.0', # pinned until this is released: https://github.com/shazow/urllib3/issues/717
If you used "requests<=2.8.1" then everything would work fine... unless you have some need of 2.8.1 specifically in smart-open?
smart_open works very well for local gzipped files and for files on S3, but is it not possible to stream gzipped files on S3?
I tried, and it doesn't seem to decompress while streaming; is there a reason for it not to work, or is it just not implemented?
thanks
When trying to use smart_open in App Engine, I get the following error:
from subprocess import _args_from_interpreter_flags
ImportError: cannot import name _args_from_interpreter_flags
Loosely translated via Google, this means there is no multiprocessing on App Engine; see
Any ideas how to tackle this? App Engine is a very interesting platform for smart_open due to its lack of local disk access and limited memory.
Could you add git tags for the releases?
If I try to use smart_open to seek/read parts of an S3 file, I get NotImplementedError: seek other than offset=0 not implemented yet.
Arbitrary seeking, especially when the seek is specified relative to the beginning of the file (seek(..., whence=0)), should be possible through the Range HTTP header:
>>> import boto
>>> s3 = boto.connect_s3()
>>> bucket = s3.lookup('bucket')
>>> key = bucket.lookup('key')
>>> parts = key.get_contents_as_string(headers={'Range' : 'bytes=12-24'})
seek could establish a pointer to the starting byte, and subsequent reads would define the end.
Are there any technical limitations or design restrictions that would prevent this?
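A minimal sketch of the idea; the class is illustrative, not smart_open's implementation:

import boto

class RangeSeekReader(object):
    # Hypothetical reader: emulate absolute seek/read on an S3 key by
    # fetching byte ranges via the HTTP Range header.
    def __init__(self, key):
        self.key = key
        self.pos = 0

    def seek(self, offset, whence=0):
        assert whence == 0, 'this sketch only handles absolute seeks'
        self.pos = offset

    def read(self, size):
        headers = {'Range': 'bytes=%d-%d' % (self.pos, self.pos + size - 1)}
        data = self.key.get_contents_as_string(headers=headers)
        self.pos += len(data)
        return data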
Here https://github.com/RaRe-Technologies/smart_open/blob/master/smart_open/smart_open_lib.py#L626 and here https://github.com/RaRe-Technologies/smart_open/blob/master/smart_open/smart_open_lib.py#L630 we should be passing in the filename, not the file object.
I have a few questions:
@ziky90 What are examples of when smart_open comes in handy for HDFS and WebHDFS?
Someone invited me to talk about smart_open for 20 minutes. I have created the code comparisons for S3 below and would like to expand them to other filesystems. What pains has smart_open solved for you with HDFS or WebHDFS?
Trying to open a file on S3 with a @ in the filename or prefix raises a RuntimeError:
uri = 's3://bucketname/docs/[email protected]/test.pdf'
f = smart_open(uri, 'rb')
RuntimeError Traceback (most recent call last)
<ipython-input-60-6156dc49e12c> in <module>()
----> 1 f = smart_open(uri, 'rb')
/data/environs/nlp/local/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in smart_open(uri, mode, **kw)
120 # this method just routes the request to classes handling the specific storage
121 # schemes, depending on the URI protocol in `uri`
--> 122 parsed_uri = ParseUri(uri)
123
124 if parsed_uri.scheme in ("file", ):
/data/environs/nlp/local/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in __init__(self, uri, default_scheme)
251 # Bucket names can contain lowercase letters, numbers, and hyphens.
252 # Each label must start and end with a lowercase letter or a number.
--> 253 raise RuntimeError("invalid S3 URI: %s" % uri)
254 elif self.scheme == 'file':
255 self.uri_path = parsed_uri.netloc + parsed_uri.path
RuntimeError: invalid S3 URI: s3://bucketname/docs/[email protected]/test.pdf
Missing tests for the disable multiprocessing functionality. How to create them is discussed in #39
If we do .seek(0) on an open S3 file instance and call readline on it afterwards, it just keeps returning the empty string '', and the iterator is consumed at that point. Please find the code snippet link below:
https://dpaste.de/VXTi
EDIT: Updated with code snippet link to dpaste
Since this uses an old version of boto, it will not load instance role credentials, making it unusable in a typical AWS production environment.
I just wanted to take a moment here and thank all the people who spent the time and made Smart Open possible.
Guys, you rock!
Is it possible to get the last_modified time of the file when using the s3_iter_bucket function?
If not, would you consider adding a flag so that the modified time is returned as part of the tuple?
Thanks
Currently only binary-mode read is supported, which would be OK if the stream you returned worked with TextIOWrapper -- but it doesn't, at least not S3OpenRead. TextIOWrapper expects readable, writable, seekable, closed etc. methods. If you make S3OpenRead inherit from (or quack like) IOBase, that should fix it.
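A sketch of that adapter approach, assuming an S3OpenRead-like object exposing a plain read(size) method:

import io

class S3RawAdapter(io.RawIOBase):
    # Hypothetical adapter: expose an S3OpenRead-like object through the
    # io interface that TextIOWrapper expects.
    def __init__(self, inner):
        self.inner = inner

    def readable(self):
        return True

    def readinto(self, b):
        data = self.inner.read(len(b))
        b[:len(data)] = data
        return len(data)

# usage:
# text = io.TextIOWrapper(io.BufferedReader(S3RawAdapter(reader)), encoding='utf-8')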
Facing the following error while installing smart_open 1.3.4 from source.
On extracting the bz2file source (tar.gz) and placing the folder's path in PYTHONPATH, I expected bz2file to be visible to other Python scripts. Using version bz2file-0.98 with Python 2.6.6.
PS: This machine has no connectivity to the internet, hence the download errors.
>> python libs/smart_open-1.3.4/setup.py test
running test
Checking .pth file support in .
/usr/bin/python -E -c pass
Searching for bz2file
Reading http://pypi.python.org/simple/bz2file/
Download error: [Errno 101] Network is unreachable -- Some packages may not be found!
Reading http://pypi.python.org/simple/bz2file/
Download error: [Errno 101] Network is unreachable -- Some packages may not be found!
Couldn't find index page for 'bz2file' (maybe misspelled?)
Scanning index of all packages (this may take a while)
Reading http://pypi.python.org/simple/
Download error: [Errno 101] Network is unreachable -- Some packages may not be found!
No local packages or download links found for bz2file
error: Could not find suitable distribution for Requirement.parse('bz2file')
I was trying to read a compressed file from S3 on EC2, and it did not print any lines:
for myKey, content in smart_open.s3_iter_bucket(email_bucket):
    with smart_open.smart_open(myKey) as data:
        for line in data:
            print line
Then I ran:
data = smart_open.smart_open(myKey)
for line in data:
    print line
I think this is caused by zlib. Any insight on how to troubleshoot this?
I am receiving an error each time I try to upload a large stream (not just a file):
[2016-08-01 20:13:48,253: ERROR/Worker-4] encountered error while terminating multipart upload; attempting cancel
Traceback (most recent call last):
File "/opt/mypath/myapp/apps/lib/awsutils/s3stream.py", line 21, in bucket_put_stream
fout.write(buffer)
File "/opt/mypath/venv/lib/python3.4/site-packages/smart_open/smart_open_lib.py", line 487, in write
self.mp.upload_part_from_file(BytesIO(buff), part_num=self.parts + 1)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/multipart.py", line 260, in upload_part_from_file
query_args=query_args, size=size)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/key.py", line 1293, in set_contents_from_file
chunked_transfer=chunked_transfer, size=size)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/key.py", line 750, in send_file
chunked_transfer=chunked_transfer, size=size)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/key.py", line 951, in _send_file_internal
query_args=query_args
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/connection.py", line 668, in make_request
retry_handler=retry_handler
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/connection.py", line 1071, in make_request
retry_handler=retry_handler)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/connection.py", line 1030, in _mexe
raise ex
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/connection.py", line 940, in _mexe
request.body, request.headers)
File "/opt/mypath/venv/lib/python3.4/site-packages/boto/s3/key.py", line 844, in sender
http_conn.send(chunk)
File "/usr/lib/python3.4/http/client.py", line 888, in send
self.sock.sendall(data)
File "/usr/lib/python3.4/ssl.py", line 741, in sendall
v = self.send(data[count:])
File "/usr/lib/python3.4/ssl.py", line 702, in send
v = self._sslobj.write(data)
ConnectionResetError: [Errno 104] Connection reset by peer
The error is thrown by boto, so the issue might be related to boto/boto#2207. I tried all the workarounds, but no luck.
My code looks like this:
def bucket_put_stream(stream, bucket, key):
    awskey = AWS_SETTINGS['AWS']['ACCESS_KEY_ID']
    awssecret = AWS_SETTINGS['AWS']['SECRET_ACCESS_KEY']
    size = 0
    with smart_open.smart_open('s3://%s:%s@%s/%s' % (awskey, awssecret, bucket, key), 'wb') as fout:
        for data in stream:
            fout.write(data)
            size += len(data)
    return size
A small file of 17 KB was uploaded without issues, so the problem shouldn't be permissions.
The fact that smart_open is streaming is great and nice to use.
However, sometimes we still need the whole file.
Would there be a way to cache the file content, to make sure we don't re-read the whole file every time?
S3 files in our case never change, for example; passing around the smart_open read object, they would be re-read completely every time, right?
Something like smart_open.smart_open('s3://bucket/key', cached=True), for example, could be a possible API for this?
Do you think it's feasible/desirable?
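A rough sketch of what a cached read could do under the hood; the helper below is purely illustrative, and the cached=True keyword above is only a proposal:

import shutil
import tempfile

import smart_open

def open_cached(uri):
    # Hypothetical helper: spool the remote stream into a local
    # temporary file once, then return a seekable, re-readable handle.
    tmp = tempfile.TemporaryFile()
    with smart_open.smart_open(uri) as fin:
        shutil.copyfileobj(fin, tmp)
    tmp.seek(0)
    return tmp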
This would be pretty useful to copy from generic shell accounts to S3 and vice versa.
My idea is to add the possibility to read from WebHDFS.
It would probably need to be implemented as a wrapper on top of the WebHDFS REST API:
https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
Based on observations of the API:
Would this addition be appreciated in smart_open? I can try to work on a PR, hopefully in the near future.
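For reference, a minimal sketch of such a wrapper using the documented OPEN operation; the function and its parameters are illustrative, not a committed design:

import requests

def webhdfs_read(host, port, path, user=None):
    # OPEN is a documented WebHDFS operation; this generator streams
    # the file body in chunks.
    url = 'http://%s:%s/webhdfs/v1%s' % (host, port, path)
    params = {'op': 'OPEN'}
    if user:
        params['user.name'] = user
    response = requests.get(url, params=params, stream=True)
    response.raise_for_status()
    for chunk in response.iter_content(chunk_size=16 * 1024):
        yield chunk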
Creating a file with a filename containing the # char (e.g. aa#aa) and trying to open it with smart_open.smart_open('aa#aa') fails.
In [3]: open('aa#aa')
Out[3]: <open file 'aa#aa', mode 'r' at 0x7f4ec5b72ed0>
vs
In [2]: smart_open('aa#aa')
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
<ipython-input-2-e0a7775bdb92> in <module>()
----> 1 s('aa#aa')
/usr/local/lib/python2.7/dist-packages/smart_open-1.2.1-py2.7.egg/smart_open/smart_open_lib.pyc in smart_open(uri, mode)
87 # local files -- both read & write supported
88 # compression, if any, is determined by the filename extension (.gz, .bz2)
---> 89 return file_smart_open(parsed_uri.uri_path, mode)
90
91 if mode in ('r', 'rb'):
/usr/local/lib/python2.7/dist-packages/smart_open-1.2.1-py2.7.egg/smart_open/smart_open_lib.pyc in file_smart_open(fname, mode)
299 return make_closing(GzipFile)(fname, mode)
300
--> 301 return open(fname, mode)
302
303
IOError: [Errno 2] No such file or directory: 'aa'
S3OpenRead always returns data as bytes in Python 3, which breaks things expecting str. S3OpenRead needs to respect 'r' mode instead of returning everything as bytes.
For some weird reason I can't seem to run tests with "python setup.py test"; the error is below.
I'm getting this with both Python 2.7.10 and Python 3.4. Any idea why?
$> python setup.py test
running test
running egg_info
writing requirements to smart_open.egg-info/requires.txt
writing smart_open.egg-info/PKG-INFO
writing top-level names to smart_open.egg-info/top_level.txt
writing dependency_links to smart_open.egg-info/dependency_links.txt
reading manifest file 'smart_open.egg-info/SOURCES.txt'
writing manifest file 'smart_open.egg-info/SOURCES.txt'
running build_ext
Traceback (most recent call last):
File "setup.py", line 66, in <module>
'Topic :: Database :: Front-Ends',
File "/usr/lib/python2.7/distutils/core.py", line 151, in setup
dist.run_commands()
File "/usr/lib/python2.7/distutils/dist.py", line 953, in run_commands
self.run_command(cmd)
File "/usr/lib/python2.7/distutils/dist.py", line 972, in run_command
cmd_obj.run()
File "/home/andrea/.virtualenvs/smart_open/local/lib/python2.7/site-packages/setuptools/command/test.py", line 142, in run
self.with_project_on_sys_path(self.run_tests)
File "/home/andrea/.virtualenvs/smart_open/local/lib/python2.7/site-packages/setuptools/command/test.py", line 122, in with_project_on_sys_path
func()
File "/home/andrea/.virtualenvs/smart_open/local/lib/python2.7/site-packages/setuptools/command/test.py", line 163, in run_tests
testRunner=self._resolve_as_ep(self.test_runner),
File "/usr/lib/python2.7/unittest/main.py", line 94, in __init__
self.parseArgs(argv)
File "/usr/lib/python2.7/unittest/main.py", line 149, in parseArgs
self.createTests()
File "/usr/lib/python2.7/unittest/main.py", line 158, in createTests
self.module)
File "/usr/lib/python2.7/unittest/loader.py", line 130, in loadTestsFromNames
suites = [self.loadTestsFromName(name, module) for name in names]
File "/usr/lib/python2.7/unittest/loader.py", line 103, in loadTestsFromName
return self.loadTestsFromModule(obj)
File "/home/andrea/.virtualenvs/smart_open/local/lib/python2.7/site-packages/setuptools/command/test.py", line 37, in loadTestsFromModule
tests.append(self.loadTestsFromName(submodule))
File "/usr/lib/python2.7/unittest/loader.py", line 100, in loadTestsFromName
parent, obj = obj, getattr(obj, part)
AttributeError: 'module' object has no attribute 'test_smart_open'
If I use pytest, even though I have installed all the dependencies, I get:
________________________________ ERROR collecting smart_open/tests/test_smart_open.py _________________________________
../../.local/lib/python2.7/site-packages/py/_path/local.py:650: in pyimport
__import__(modname)
smart_open/__init__.py:1: in <module>
from .smart_open_lib import *
smart_open/smart_open_lib.py:34: in <module>
from boto.compat import BytesIO, urlsplit, six
E ImportError: No module named boto.compat
=============================================== 1 error in 0.09 seconds =
Currently smart_open is limiting file modes to ('r', 'rb', 'w', 'wb') here.
I don't think there is any reason for this -- remove the restriction and allow whatever mode is supported by the underlying storage (such as wb+).
It would be useful if S3 file-like objects also implemented the "flush" method (which is available on local file-like objects).
This would make it easier to use them as drop-in replacements for file-like objects.
This could be a no-op or could genuinely force a flush of data.
What do you think?
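A sketch of the no-op variant on a hypothetical writer class (a real implementation could instead force the current multipart buffer out):

class S3WriterWithFlush(object):
    # Hypothetical writer: buffering/multipart logic would live in write().
    def write(self, data):
        pass

    def flush(self):
        # A no-op keeps drop-in compatibility with callers that invoke
        # flush() unconditionally.
        pass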
WebHDFS allows passing additional parameters such as user.name through the query part of the URI.
At the moment the library simply never uses the query part, making it impossible to use those options when accessing files.
On line 249 I have noticed a minor bug in reading from HDFS.
The command used there is hadoop fs -cat, which, based on the documentation http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/FileSystemShell.html, reads any file mounted into Hadoop (HDFS, local FS, HFTP FS, S3 FS, and others).
To correctly meet what smart_open claims to do, hdfs dfs -cat should be used instead (see the sketch below):
http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HDFSCommands.html#dfs
I can do a PR with the fix if this is really a bug.
If this is expected behaviour, then I would change the documentation and would not limit the file scheme to only HDFS.
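A sketch of the proposed change, assuming the file is read by shelling out to the CLI as the line reference above suggests:

import subprocess

hdfs_path = '/user/me/file.txt'  # illustrative path

# before: subprocess.Popen(['hadoop', 'fs', '-cat', hdfs_path], stdout=subprocess.PIPE)
process = subprocess.Popen(['hdfs', 'dfs', '-cat', hdfs_path], stdout=subprocess.PIPE)
for line in process.stdout:
    pass  # stream the file line by line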
Doing a smart_open('https://url/file.xml.gz') passes the ReadStream to the GzipFile during make_compression.
The file object passed here is the HttpReadStream:
https://github.com/RaRe-Technologies/smart_open/blob/master/smart_open/smart_open_lib.py#L632
The read stream doesn't have the tell() method that the gzip library expects.
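One possible workaround is a thin wrapper that counts consumed bytes; this is a sketch, not smart_open's code:

class TellWrapper(object):
    # Hypothetical wrapper: track the number of bytes read so that
    # GzipFile can call tell() on a non-seekable HTTP stream.
    def __init__(self, inner):
        self.inner = inner
        self._pos = 0

    def read(self, size=-1):
        data = self.inner.read(size)
        self._pos += len(data)
        return data

    def tell(self):
        return self._pos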
I have a pre-signed S3 URL for a gzipped xml file.
https://bucket.s3.amazonaws.com/file.xml.gz?AWSAccessKeyId=asdf&Expires=2493144554&Signature=qasdf
The fname.endswith('.gz') check won't evaluate to True because of the URL parameters:
https://github.com/RaRe-Technologies/smart_open/blob/master/smart_open/smart_open_lib.py#L759
I suggest removing all query parameters when finding the file name.
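A sketch of that suggestion; the helper name is mine, not the library's:

from urlparse import urlsplit  # urllib.parse on Python 3

def filename_for_extension_check(uri):
    # Hypothetical helper: drop the query string (and fragment) so that
    # extension sniffing sees only the path.
    return urlsplit(uri).path

# filename_for_extension_check('https://bucket.s3.amazonaws.com/file.xml.gz?AWSAccessKeyId=...')
# -> '/file.xml.gz', which now correctly ends with '.gz'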
Another feature that I would like to have in smart_open is support for directly reading/writing data to/from FTP.
Anyone else interested in this? Does this make sense?
Large bz2 files are often processed with pbzip2 and end up being multistream files. Python 2.7 does not handle them yet (Python 3 does). bz2file is a backport of that feature (by its author) to Python 2.7:
https://github.com/nvawda/bz2file
https://pypi.python.org/pypi/bz2file/0.98
I got bitten by that, so I wanted to share it...
/HTH
The Travis CI tests sometimes hang, and this may be indicative of a bug in the s3_iter_bucket routine, which uses multiprocessing.
The failing tests mess up unrelated PRs (they need manual test restarts, like in #61), so we want this fixed ASAP.
The missing readline in S3OpenRead is related to, and explained in more detail in, piskvorky/gensim#421.
It reads them, but the data remains compressed, thus defeating line iteration.
This would be useful for extracting the header from a CSV.
Windows-style URIs for the local filesystem, such as c:\users\whatever, trip up urlsplit() -- it detects c as the scheme.
And then smart_open fails with "unknown URI scheme", of course.
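One possible guard, sketched here (not necessarily the fix chosen in #82):

import re

def looks_like_windows_path(uri):
    # Hypothetical check: a single drive letter followed by ':' and a
    # path separator is a local Windows path, not a URI scheme.
    return re.match(r'^[A-Za-z]:[\\/]', uri) is not None

# looks_like_windows_path(r'c:\users\whatever')  -> True, treat as local file
# looks_like_windows_path('s3://bucket/key')     -> False, parse as URI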
Implement the solution described by mpenkov in #82 (comment)
I ran into a bug while trying to stream a fairly large line of data from a gzipped S3 file. Basically:
if len(self.unused_buffer) > size:
    return self.read_from_buffer()
is an incorrect call of read_from_buffer, which needs to take the argument size. I think the correct fix is as simple as:
if len(self.unused_buffer) > size:
    return self.read_from_buffer(size)
This seems to work in my particular case. I'm not 100% sure this is the generally correct solution, which is why I'm posting this issue rather than making a PR.
Trying to write a non-trivial amount, but less than the chunk size (say 500 KB), fails with Connection reset by peer:
DEBUG:boto:establishing HTTPS connection: host=test-analytics.s3.amazonaws.com, kwargs={'port': 443, 'timeout': 70}
DEBUG:boto:Token: None
DEBUG:boto:StringToSign:
PUT
application/octet-stream
Thu, 11 Aug 2016 03:35:38 GMT
/test-analytics/mykey.txt?partNumber=1&uploadId=h_wZktx5teVtch7Ss69B0gaMDKYyHuzJ92CBGLDmNrav_AZ4uFKlJXZwI8Bpmg8t8_zLS70bJNuQGBtJj880XbbPF19od0WMuM8OBGVlrlhO9mhuYF9cmAM2253Vl9LZ
DEBUG:boto:Signature:
AWS **************************************
DEBUG:boto:Final headers: {'Content-Length': '3400000', 'Content-MD5': 'GvkoeM9sWF4kgm51pkdlQw==', 'Expect': '100-Continue', 'Date': 'Thu, 11 Aug 2016 03:35:38 GMT', 'User-Agent': 'Boto/2.42.0 Python/2.7.11+ Linux/4.4.0-31-generic', 'Content-Type': 'application/octet-stream', 'Authorization': u'************************'}
DEBUG:boto:encountered error exception, reconnecting
import logging
logging.basicConfig(level=logging.DEBUG)
import smart_open

with smart_open.smart_open('s3://my-bucket/mykey.txt', 'wb') as fout:
    for n in range(0, 100000):
        for line in ['first line', 'second line', 'third line']:
            fout.write(line + '\n')
Whilst the example code works fine:
import logging
logging.basicConfig(level=logging.DEBUG)
import smart_open

with smart_open.smart_open('s3://my-bucket/mykey.txt', 'wb') as fout:
    for line in ['first line', 'second line', 'third line']:
        fout.write(line + '\n')