gumblex / htmllisting-parser Goto Github PK
View Code? Open in Web Editor NEW
Python parser for Apache/nginx-style HTML directory listing
License: MIT License
Python parser for Apache/nginx-style HTML directory listing
License: MIT License
$ rehttpfs http://127.0.0.1:8080/ /mnt/http_mount
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.8/http/client.py", line 1322, in getresponse
response.begin()
File "/usr/lib/python3.8/http/client.py", line 303, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.8/http/client.py", line 264, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
socket.timeout: timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/usr/lib/python3.8/site-packages/urllib3/util/retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/lib/python3.8/site-packages/urllib3/packages/six.py", line 735, in reraise
raise value
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 423, in _make_request
self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 330, in _raise_timeout
raise ReadTimeoutError(
urllib3.exceptions.ReadTimeoutError: HTTPConnectionPool(host='127.0.0.1', port=8080): Read timed out. (read timeout=30)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/fuse.py", line 734, in _wrapper
return func(*args, **kwargs) or 0
File "/usr/lib/python3.8/site-packages/fuse.py", line 995, in access
return self.operations('access', path.decode(self.encoding), amode)
File "/usr/lib/python3.8/site-packages/fuse.py", line 1251, in __call__
ret = getattr(self, op)(path, *args)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 344, in access
obj = self._getpath(path)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 290, in _getpath
return self._getdirobj(path, refresh)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 330, in _getdirobj
objmap = dirobj.read()
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 223, in read
req = SESSION.get(self.url, timeout=CONFIG['timeout'])
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 543, in get
return self.request('GET', url, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 529, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPConnectionPool(host='127.0.0.1', port=8080): Read timed out. (read timeout=30)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/fuse.py", line 737, in _wrapper
if e.errno > 0:
TypeError: '>' not supported between instances of 'NoneType' and 'int'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "_ctypes/callbacks.c", line 237, in 'calling callback function'
File "/usr/lib/python3.8/site-packages/fuse.py", line 756, in _wrapper
self.__critical_exception = e
NameError: name 'self' is not defined
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.8/http/client.py", line 1322, in getresponse
response.begin()
File "/usr/lib/python3.8/http/client.py", line 303, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.8/http/client.py", line 264, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
socket.timeout: timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/usr/lib/python3.8/site-packages/urllib3/util/retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/lib/python3.8/site-packages/urllib3/packages/six.py", line 735, in reraise
raise value
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 423, in _make_request
self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 330, in _raise_timeout
raise ReadTimeoutError(
urllib3.exceptions.ReadTimeoutError: HTTPConnectionPool(host='127.0.0.1', port=8080): Read timed out. (read timeout=30)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/fuse.py", line 734, in _wrapper
return func(*args, **kwargs) or 0
File "/usr/lib/python3.8/site-packages/fuse.py", line 774, in getattr
return self.fgetattr(path, buf, None)
File "/usr/lib/python3.8/site-packages/fuse.py", line 1027, in fgetattr
attrs = self.operations('getattr', self._decode_optional_path(path), fh)
File "/usr/lib/python3.8/site-packages/fuse.py", line 1251, in __call__
ret = getattr(self, op)(path, *args)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 353, in getattr
obj = self._getpath(path)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 292, in _getpath
return self._getfileobj(path, refresh)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 312, in _getfileobj
fileobj.get_stat()
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 121, in get_stat
req = SESSION.head(self.url, timeout=CONFIG[
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 565, in head
return self.request('HEAD', url, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 529, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPConnectionPool(host='127.0.0.1', port=8080): Read timed out. (read timeout=30)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/fuse.py", line 737, in _wrapper
if e.errno > 0:
TypeError: '>' not supported between instances of 'NoneType' and 'int'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "_ctypes/callbacks.c", line 237, in 'calling callback function'
File "/usr/lib/python3.8/site-packages/fuse.py", line 756, in _wrapper
self.__critical_exception = e
NameError: name 'self' is not defined
fuse: bad error value: -1610617808
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.8/http/client.py", line 1322, in getresponse
response.begin()
File "/usr/lib/python3.8/http/client.py", line 303, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.8/http/client.py", line 264, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
socket.timeout: timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/usr/lib/python3.8/site-packages/urllib3/util/retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/lib/python3.8/site-packages/urllib3/packages/six.py", line 735, in reraise
raise value
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 423, in _make_request
self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 330, in _raise_timeout
raise ReadTimeoutError(
urllib3.exceptions.ReadTimeoutError: HTTPConnectionPool(host='127.0.0.1', port=8080): Read timed out. (read timeout=30)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/fuse.py", line 734, in _wrapper
return func(*args, **kwargs) or 0
File "/usr/lib/python3.8/site-packages/fuse.py", line 774, in getattr
return self.fgetattr(path, buf, None)
File "/usr/lib/python3.8/site-packages/fuse.py", line 1027, in fgetattr
attrs = self.operations('getattr', self._decode_optional_path(path), fh)
File "/usr/lib/python3.8/site-packages/fuse.py", line 1251, in __call__
ret = getattr(self, op)(path, *args)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 353, in getattr
obj = self._getpath(path)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 292, in _getpath
return self._getfileobj(path, refresh)
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 312, in _getfileobj
fileobj.get_stat()
File "/usr/lib/python3.8/site-packages/htmllistparse/rehttpfs.py", line 121, in get_stat
req = SESSION.head(self.url, timeout=CONFIG[
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 565, in head
return self.request('HEAD', url, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 529, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPConnectionPool(host='127.0.0.1', port=8080): Read timed out. (read timeout=30)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/fuse.py", line 737, in _wrapper
if e.errno > 0:
TypeError: '>' not supported between instances of 'NoneType' and 'int'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "_ctypes/callbacks.c", line 237, in 'calling callback function'
File "/usr/lib/python3.8/site-packages/fuse.py", line 756, in _wrapper
self.__critical_exception = e
NameError: name 'self' is not defined
fuse: bad error value: -1527763920
Hei,
Just a comment that could be added to the documentation.
I'm in Finland and reading data from a US server, so my locale is set to Finnish.
This leads to problems, because when reading a listing from a US server,
time.strptime expects Finnish month names, so when it receives a timestamp like
"12-May-2015 17:46"
it is not recognized, because the month is in English and not in Finnish.
Solution: before calling htmllistparse.fetch_listing,
put this in your code:
locale.setlocale(
locale.LC_ALL, "C"
) # to get file time formatting right for htmllistparse
And thanks for a wonderful library!
Terveisin, Markus
The first entry in the listing is missing:
import htmllistparse
import bs4
# as served by Apache server in the wild (reports as Apache/2.4.1)
html = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /data/MYDATA</title>
</head>
<body>
<h1>Index of /data/MYDATA</h1>
<pre><img src="/icons/blank.gif" alt="Icon "> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr><img src="/icons/folder.gif" alt="[DIR]"> <a href="ancillary/">ancillary/</a> 2023-03-08 12:55 -
<img src="/icons/folder.gif" alt="[DIR]"> <a href="archive/">archive/</a> 2023-03-08 12:56 -
<img src="/icons/folder.gif" alt="[DIR]"> <a href="fltdata/">fltdata/</a> 2023-03-08 12:56 -
<img src="/icons/folder.gif" alt="[DIR]"> <a href="products/">products/</a> 2023-03-08 12:56 -
<hr></pre>
</body></html>
"""
soup = bs4.BeautifulSoup(html, 'html5lib')
cwd, listing = htmllistparse.parse(soup)
print(f"extracted {len(listing)} items:")
for f in listing:
print(f" {f.name}")
output:
extracted 3 items:
archive/
fltdata/
products/
The ancillary/ entry, which is on the same line as the headers, is missing.
Recently I started to use your lib in my s3viewer (I've credited you in the code).
It would be nice to list in the README other open-source projects that are using your lib.
Currently, you're just publishing releases on pypi.org. It would be nice if you could use the release feature here in Github, too. This way I can easily subscribe to new releases.
I can't find any help on how to use this library.
Are there any examples posted anywhere?
I am not a Python guru; I need simple examples, please.
setup.py imports htmllistparse, which imports bs4, but that isn't installed yet when we run setup.py.
$ pip install git+https://github.com/gumblex/htmllisting-parser.git
Collecting git+https://github.com/gumblex/htmllisting-parser.git (from -r requirements.txt (line 1))
Cloning https://github.com/gumblex/htmllisting-parser.git to /tmp/pip-spp1jrqk-build
Complete output from command python setup.py egg_info:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/tmp/pip-spp1jrqk-build/setup.py", line 12, in <module>
import htmllistparse
File "/tmp/pip-spp1jrqk-build/htmllistparse/__init__.py", line 1, in <module>
from .htmllistparse import *
File "/tmp/pip-spp1jrqk-build/htmllistparse/htmllistparse.py", line 10, in <module>
import bs4
ImportError: No module named 'bs4'
For those who don't need the mounting part of the library, can the fusepy dependency be made optional?
This works pretty well. I'm having to run parse myself though because I need to set a custom timeout longer than 30s. Can you add an argument to fetch_listing to set a custom http read timeout?
您好,我最近想复现一下文言文翻译的SMT,自己做了一个平行语料数据集,但是感觉质量不太高,以及数量也比较少,看到您之前做过文言文的SMT,所以想求一份您文言文翻译项目中的85w平行语料数据集。实在找不到您的邮箱,只能来这里求一份数据了。若能不吝赐数据,我将不胜感激!!!
When the -o option is not specified when running rehttpfs, the following error occurs:
Traceback (most recent call last):
File ".local/share/virtualenvs/gh-3UaJuPaC/bin/rehttpfs", line 11, in <module>
sys.exit(main())
File ".local/share/virtualenvs/gh-3UaJuPaC/lib/python3.6/site-packages/htmllistparse/rehttpfs.py", line 397, in main
**convert_fuse_options(args.o)
File ".local/share/virtualenvs/gh-3UaJuPaC/lib/python3.6/site-packages/htmllistparse/rehttpfs.py", line 49, in convert_fuse_options
for opt in options.split(','):
AttributeError: 'NoneType' object has no attribute 'split'
Python 3.2.3 (default, Nov 17 2016, 01:04:00)
[GCC 4.6.3] on linux2
Type "help", "copyright", "credits" or "license" for more information.
import htmllistparse
cwd, listing = htmllistparse.fetch_listing("my url")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "htmllistparse.py", line 237, in fetch_listing
req = requests.get(url, timeout=30)
File "/usr/local/lib/python3.2/dist-packages/requests/api.py", line 55, in get
return request('get', url, **kwargs)
File "/usr/local/lib/python3.2/dist-packages/requests/api.py", line 44, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python3.2/dist-packages/requests/sessions.py", line 422, in request
prep = self.prepare_request(req)
File "/usr/local/lib/python3.2/dist-packages/requests/sessions.py", line 360, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/usr/local/lib/python3.2/dist-packages/requests/models.py", line 294, in prepare
self.prepare_url(url, params)
File "/usr/local/lib/python3.2/dist-packages/requests/models.py", line 328, in prepare_url
url = str(url)
TypeError: 'tuple' object is not callable
I would appreciate it if you could shed some light on this. Thanks in advance.
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.