# Copyright 2019, Damian Johnson and The Tor Project
# See LICENSE for licensing information

"""
|
|
Descriptor archives are available from `CollecTor
|
|
<https://metrics.torproject.org/collector.html>`_. If you need Tor's topology
|
|
at a prior point in time this is the place to go!
|
|
|
|
With CollecTor you can either read descriptors directly...
|
|
|
|
.. literalinclude:: /_static/example/collector_reading.py
|
|
:language: python
|
|
|
|
... or download the descriptors to disk and read them later.
|
|
|
|
.. literalinclude:: /_static/example/collector_caching.py
|
|
:language: python
|
|
|
|
::
|
|
|
|
get_instance - Provides a singleton CollecTor used for...
|
|
|- get_server_descriptors - published server descriptors
|
|
|- get_extrainfo_descriptors - published extrainfo descriptors
|
|
|- get_microdescriptors - published microdescriptors
|
|
|- get_consensus - published router status entries
|
|
|
|
|
|- get_key_certificates - authority key certificates
|
|
|- get_bandwidth_files - bandwidth authority heuristics
|
|
+- get_exit_lists - TorDNSEL exit list
|
|
|
|
File - Individual file residing within CollecTor
|
|
|- read - provides descriptors from this file
|
|
+- download - download this file to disk
|
|
|
|
CollecTor - Downloader for descriptors from CollecTor
|
|
|- get_server_descriptors - published server descriptors
|
|
|- get_extrainfo_descriptors - published extrainfo descriptors
|
|
|- get_microdescriptors - published microdescriptors
|
|
|- get_consensus - published router status entries
|
|
|
|
|
|- get_key_certificates - authority key certificates
|
|
|- get_bandwidth_files - bandwidth authority heuristics
|
|
|- get_exit_lists - TorDNSEL exit list
|
|
|
|
|
|- index - metadata for content available from CollecTor
|
|
+- files - files available from CollecTor
|
|
|
|
.. versionadded:: 1.8.0
|
|
"""
|
|
|
|
import base64
import binascii
import datetime
import hashlib
import json
import os
import re
import shutil
import tempfile
import time

import stem.descriptor
import stem.util.connection
import stem.util.str_tools

from stem.descriptor import Compression, DocumentHandler

COLLECTOR_URL = 'https://collector.torproject.org/'
REFRESH_INDEX_RATE = 3600  # get new index if cached copy is an hour old
SINGLETON_COLLECTOR = None

YEAR_DATE = re.compile('-(\\d{4})-(\\d{2})\\.')
SEC_DATE = re.compile('(\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}-\\d{2})')

# distant future date so we can sort files without a timestamp at the end

FUTURE = datetime.datetime(9999, 1, 1)


def get_instance():
  """
  Provides the singleton :class:`~stem.descriptor.collector.CollecTor`
  used for this module's shorthand functions.

  :returns: singleton :class:`~stem.descriptor.collector.CollecTor` instance
  """

  global SINGLETON_COLLECTOR

  if SINGLETON_COLLECTOR is None:
    SINGLETON_COLLECTOR = CollecTor()

  return SINGLETON_COLLECTOR


def get_server_descriptors(start = None, end = None, cache_to = None, bridge = False, timeout = None, retries = 3):
  """
  Shorthand for
  :func:`~stem.descriptor.collector.CollecTor.get_server_descriptors`
  on our singleton instance.
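
  For example, a brief sketch of fetching a day of relay server descriptors
  (the time range below is illustrative)...

  ::

    import datetime
    import stem.descriptor.collector

    yesterday = datetime.datetime.utcnow() - datetime.timedelta(days = 1)

    for desc in stem.descriptor.collector.get_server_descriptors(start = yesterday):
      print('%s published at %s' % (desc.nickname, desc.published))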
"""
|
|
|
|
for desc in get_instance().get_server_descriptors(start, end, cache_to, bridge, timeout, retries):
|
|
yield desc
|
|
|
|
|
|
def get_extrainfo_descriptors(start = None, end = None, cache_to = None, bridge = False, timeout = None, retries = 3):
  """
  Shorthand for
  :func:`~stem.descriptor.collector.CollecTor.get_extrainfo_descriptors`
  on our singleton instance.
  """

  for desc in get_instance().get_extrainfo_descriptors(start, end, cache_to, bridge, timeout, retries):
    yield desc


def get_microdescriptors(start = None, end = None, cache_to = None, timeout = None, retries = 3):
  """
  Shorthand for
  :func:`~stem.descriptor.collector.CollecTor.get_microdescriptors`
  on our singleton instance.
  """

  for desc in get_instance().get_microdescriptors(start, end, cache_to, timeout, retries):
    yield desc


def get_consensus(start = None, end = None, cache_to = None, document_handler = DocumentHandler.ENTRIES, version = 3, microdescriptor = False, bridge = False, timeout = None, retries = 3):
  """
  Shorthand for
  :func:`~stem.descriptor.collector.CollecTor.get_consensus`
  on our singleton instance.
  """

  for desc in get_instance().get_consensus(start, end, cache_to, document_handler, version, microdescriptor, bridge, timeout, retries):
    yield desc


def get_key_certificates(start = None, end = None, cache_to = None, timeout = None, retries = 3):
  """
  Shorthand for
  :func:`~stem.descriptor.collector.CollecTor.get_key_certificates`
  on our singleton instance.
  """

  for desc in get_instance().get_key_certificates(start, end, cache_to, timeout, retries):
    yield desc


def get_bandwidth_files(start = None, end = None, cache_to = None, timeout = None, retries = 3):
  """
  Shorthand for
  :func:`~stem.descriptor.collector.CollecTor.get_bandwidth_files`
  on our singleton instance.
  """

  for desc in get_instance().get_bandwidth_files(start, end, cache_to, timeout, retries):
    yield desc


def get_exit_lists(start = None, end = None, cache_to = None, timeout = None, retries = 3):
  """
  Shorthand for
  :func:`~stem.descriptor.collector.CollecTor.get_exit_lists`
  on our singleton instance.
  """

  for desc in get_instance().get_exit_lists(start, end, cache_to, timeout, retries):
    yield desc


class File(object):
  """
  File within CollecTor.

  :var str path: file path within collector
  :var tuple types: descriptor types contained within this file
  :var stem.descriptor.Compression compression: file compression, **None** if
    this cannot be determined
  :var int size: size of the file
  :var str sha256: file's sha256 checksum

  :var datetime start: first publication within the file, **None** if this
    cannot be determined
  :var datetime end: last publication within the file, **None** if this cannot
    be determined
  :var datetime last_modified: when the file was last modified
  """

  def __init__(self, path, types, size, sha256, first_published, last_published, last_modified):
    self.path = path
    self.types = tuple(types) if types else ()
    self.compression = File._guess_compression(path)
    self.size = size
    self.sha256 = sha256
    self.last_modified = datetime.datetime.strptime(last_modified, '%Y-%m-%d %H:%M')
    self._downloaded_to = None  # location we last downloaded to

    # Most descriptor types have publication time fields, but microdescriptors
    # don't because these files lack timestamps to parse.

    if first_published and last_published:
      self.start = datetime.datetime.strptime(first_published, '%Y-%m-%d %H:%M')
      self.end = datetime.datetime.strptime(last_published, '%Y-%m-%d %H:%M')
    else:
      self.start, self.end = File._guess_time_range(path)

  def read(self, directory = None, descriptor_type = None, start = None, end = None, document_handler = DocumentHandler.ENTRIES, timeout = None, retries = 3):
    """
    Provides descriptors from this archive. Descriptors are downloaded or read
    from disk as follows...

    * If this file has already been downloaded through
      :func:`~stem.descriptor.collector.File.download` these descriptors are
      read from disk.

    * If a **directory** argument is provided and the file is already present
      these descriptors are read from disk.

    * If a **directory** argument is provided and the file is not present the
      file is downloaded to this location then read.

    * If the file has not been downloaded and no **directory** argument is
      provided then the file is downloaded to a temporary directory that's
      deleted after it is read.
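
    For example, a rough sketch of reading an archive while caching it to
    disk (the cache directory below is illustrative)...

    ::

      import stem.descriptor.collector

      collector = stem.descriptor.collector.CollecTor()

      # pick the first server descriptor archive CollecTor offers

      archive = collector.files('server-descriptor')[0]

      for desc in archive.read('~/descriptor_cache'):
        print(desc.fingerprint)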

    :param str directory: destination to download into
    :param str descriptor_type: `descriptor type
      <https://metrics.torproject.org/collector.html#data-formats>`_, this is
      guessed if not provided
    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with
    :param stem.descriptor.__init__.DocumentHandler document_handler: method in
      which to parse a :class:`~stem.descriptor.networkstatus.NetworkStatusDocument`
    :param int timeout: timeout when connection becomes idle, no timeout
      applied if **None**
    :param int retries: maximum attempts to impose

    :returns: iterator for :class:`~stem.descriptor.__init__.Descriptor`
      instances in the file

    :raises:
      * **ValueError** if unable to determine the descriptor type
      * **TypeError** if we cannot parse this descriptor type
      * :class:`~stem.DownloadFailed` if the download fails
    """

    if descriptor_type is None:
      # If the archive contains multiple descriptor types the caller must
      # provide a 'descriptor_type' argument so we can disambiguate. However,
      # if only the version number varies we can simply pick one.

      base_types = set([t.split(' ')[0] for t in self.types])

      if not self.types:
        raise ValueError("Unable to determine this file's descriptor type")
      elif len(base_types) > 1:
        raise ValueError("Unable to disambiguate file's descriptor type from among %s" % ', '.join(self.types))
      else:
        descriptor_type = self.types[0]

    if directory is None:
      if self._downloaded_to and os.path.exists(self._downloaded_to):
        directory = os.path.dirname(self._downloaded_to)
      else:
        # TODO: The following can be replaced with simpler usage of
        # tempfile.TemporaryDirectory when we drop python 2.x support.

        tmp_directory = tempfile.mkdtemp()

        for desc in self.read(tmp_directory, descriptor_type, start, end, document_handler, timeout, retries):
          yield desc

        shutil.rmtree(tmp_directory)

        return

    path = self.download(directory, True, timeout, retries)

    # Archives can contain multiple descriptor types, so we parse everything
    # and filter to what we're after.

    for desc in stem.descriptor.parse_file(path, document_handler = document_handler):
      if descriptor_type is None or descriptor_type.startswith(desc.type_annotation().name):
        # TODO: This filters by the 'published' attribute, which server and
        # extrainfo descriptors provide, but other descriptor types may use
        # other attribute names.

        published = getattr(desc, 'published', None)

        if published:
          if start and published < start:
            continue
          elif end and published > end:
            continue

        yield desc

  def download(self, directory, decompress = True, timeout = None, retries = 3, overwrite = False):
    """
    Downloads this file to the given location. If a file with a matching
    checksum already exists there this is a no-op.
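
    For example, a minimal sketch that caches an archive to disk (the
    destination directory is illustrative)...

    ::

      import stem.descriptor.collector

      collector = stem.descriptor.collector.CollecTor()
      archive = collector.files('server-descriptor')[0]

      path = archive.download('~/descriptor_cache')
      print('saved to %s' % path)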

    :param str directory: destination to download into
    :param bool decompress: decompress written file
    :param int timeout: timeout when connection becomes idle, no timeout
      applied if **None**
    :param int retries: maximum attempts to impose
    :param bool overwrite: if this file exists but mismatches CollecTor's
      checksum then overwrites if **True**, otherwise raises an exception

    :returns: **str** with the path we downloaded to

    :raises:
      * :class:`~stem.DownloadFailed` if the download fails
      * **IOError** if a mismatching file exists and **overwrite** is **False**
    """

    filename = self.path.split('/')[-1]

    if self.compression != Compression.PLAINTEXT and decompress:
      filename = filename.rsplit('.', 1)[0]

    directory = os.path.expanduser(directory)

    path = os.path.join(directory, filename)

    if not os.path.exists(directory):
      os.makedirs(directory)

    # check if this file already exists with the correct checksum

    if os.path.exists(path):
      # open in binary mode so hashing works under python 3, and decode the
      # hexlified digest so we compare strings with strings

      with open(path, 'rb') as prior_file:
        expected_hash = binascii.hexlify(base64.b64decode(self.sha256)).decode('utf-8')
        actual_hash = hashlib.sha256(prior_file.read()).hexdigest()

        if expected_hash == actual_hash:
          return path  # nothing to do, we already have the file
        elif not overwrite:
          raise IOError("%s already exists but mismatches CollecTor's checksum (expected: %s, actual: %s)" % (path, expected_hash, actual_hash))

    response = stem.util.connection.download(COLLECTOR_URL + self.path, timeout, retries)

    if decompress:
      response = self.compression.decompress(response)

    with open(path, 'wb') as output_file:
      output_file.write(response)

    self._downloaded_to = path
    return path

  @staticmethod
  def _guess_compression(path):
    """
    Determine file compression from CollecTor's filename.
    """

    for compression in (Compression.LZMA, Compression.BZ2, Compression.GZIP):
      if path.endswith(compression.extension):
        return compression

    return Compression.PLAINTEXT

  @staticmethod
  def _guess_time_range(path):
    """
    Attempt to determine the (start, end) time range from CollecTor's filename.
    This provides (None, None) if this cannot be determined.
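
    For example, a couple of illustrative mappings (paths abbreviated from
    CollecTor's layout)...

    ::

      # 'server-descriptors-2019-05.tar.xz'
      #   => (datetime(2019, 5, 1), datetime(2019, 6, 1))
      #
      # 'recent' files such as '2019-05-01-00-02-00' use second granularity
      #   => (datetime(2019, 5, 1, 0, 2), datetime(2019, 5, 1, 1, 2))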
"""
|
|
|
|
year_match = YEAR_DATE.search(path)
|
|
|
|
if year_match:
|
|
year, month = map(int, year_match.groups())
|
|
start = datetime.datetime(year, month, 1)
|
|
|
|
if month < 12:
|
|
return (start, datetime.datetime(year, month + 1, 1))
|
|
else:
|
|
return (start, datetime.datetime(year + 1, 1, 1))
|
|
|
|
sec_match = SEC_DATE.search(path)
|
|
|
|
if sec_match:
|
|
# Descriptors in the 'recent/*' section have filenames with second level
|
|
# granularity. Not quite sure why, but since consensus documents are
|
|
# published hourly we'll use that as the delta here.
|
|
|
|
start = datetime.datetime.strptime(sec_match.group(1), '%Y-%m-%d-%H-%M-%S')
|
|
return (start, start + datetime.timedelta(seconds = 3600))
|
|
|
|
return (None, None)
|
|
|
|
|
|
class CollecTor(object):
  """
  Downloader for descriptors from CollecTor. The contents of CollecTor are
  provided in `an index <https://collector.torproject.org/index/index.json>`_
  that's fetched as required.

  :var int retries: number of times to attempt the request if downloading it
    fails
  :var float timeout: duration before we'll time out our request
  """

  def __init__(self, retries = 2, timeout = None):
    self.retries = retries
    self.timeout = timeout

    self._cached_index = None
    self._cached_files = None
    self._cached_index_at = 0

  def get_server_descriptors(self, start = None, end = None, cache_to = None, bridge = False, timeout = None, retries = 3):
    """
    Provides server descriptors published during the given time range, sorted
    oldest to newest.

    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with
    :param str cache_to: directory to cache archives into, if an archive is
      available here it is not downloaded
    :param bool bridge: standard descriptors if **False**, bridge if **True**
    :param int timeout: timeout for downloading each individual archive when
      the connection becomes idle, no timeout applied if **None**
    :param int retries: maximum attempts to impose on a per-archive basis

    :returns: **iterator** of
      :class:`~stem.descriptor.server_descriptor.ServerDescriptor` for the
      given time range

    :raises: :class:`~stem.DownloadFailed` if the download fails
    """

    desc_type = 'server-descriptor' if not bridge else 'bridge-server-descriptor'

    for f in self.files(desc_type, start, end):
      for desc in f.read(cache_to, desc_type, start, end, timeout = timeout, retries = retries):
        yield desc

  def get_extrainfo_descriptors(self, start = None, end = None, cache_to = None, bridge = False, timeout = None, retries = 3):
    """
    Provides extrainfo descriptors published during the given time range,
    sorted oldest to newest.

    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with
    :param str cache_to: directory to cache archives into, if an archive is
      available here it is not downloaded
    :param bool bridge: standard descriptors if **False**, bridge if **True**
    :param int timeout: timeout for downloading each individual archive when
      the connection becomes idle, no timeout applied if **None**
    :param int retries: maximum attempts to impose on a per-archive basis

    :returns: **iterator** of
      :class:`~stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor`
      for the given time range

    :raises: :class:`~stem.DownloadFailed` if the download fails
    """

    desc_type = 'extra-info' if not bridge else 'bridge-extra-info'

    for f in self.files(desc_type, start, end):
      for desc in f.read(cache_to, desc_type, start, end, timeout = timeout, retries = retries):
        yield desc

  def get_microdescriptors(self, start = None, end = None, cache_to = None, timeout = None, retries = 3):
    """
    Provides microdescriptors estimated to be published during the given time
    range, sorted oldest to newest. Unlike server/extrainfo descriptors,
    microdescriptors change very infrequently...

    ::

      "Microdescriptors are expected to be relatively static and only change
      about once per week." -dir-spec section 3.3

    CollecTor archives only contain microdescriptors that *change*, so hourly
    tarballs often contain very few. Microdescriptors also do not contain
    their publication timestamp, so this is estimated.
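
    For example, a rough sketch of reading the past month's microdescriptors
    (the time range is illustrative)...

    ::

      import datetime
      import stem.descriptor.collector

      collector = stem.descriptor.collector.CollecTor()
      last_month = datetime.datetime.utcnow() - datetime.timedelta(days = 30)

      for desc in collector.get_microdescriptors(start = last_month):
        print(desc.onion_key)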

    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with
    :param str cache_to: directory to cache archives into, if an archive is
      available here it is not downloaded
    :param int timeout: timeout for downloading each individual archive when
      the connection becomes idle, no timeout applied if **None**
    :param int retries: maximum attempts to impose on a per-archive basis

    :returns: **iterator** of
      :class:`~stem.descriptor.microdescriptor.Microdescriptor`
      for the given time range

    :raises: :class:`~stem.DownloadFailed` if the download fails
    """

    for f in self.files('microdescriptor', start, end):
      for desc in f.read(cache_to, 'microdescriptor', start, end, timeout = timeout, retries = retries):
        yield desc

  def get_consensus(self, start = None, end = None, cache_to = None, document_handler = DocumentHandler.ENTRIES, version = 3, microdescriptor = False, bridge = False, timeout = None, retries = 3):
    """
    Provides consensus router status entries published during the given time
    range, sorted oldest to newest.
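
    For example, a hedged sketch of reading yesterday's microdescriptor
    consensus entries (the time range is illustrative)...

    ::

      import datetime
      import stem.descriptor.collector

      collector = stem.descriptor.collector.CollecTor()
      yesterday = datetime.datetime.utcnow() - datetime.timedelta(days = 1)

      for entry in collector.get_consensus(start = yesterday, microdescriptor = True):
        print(entry.fingerprint)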

    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with
    :param str cache_to: directory to cache archives into, if an archive is
      available here it is not downloaded
    :param stem.descriptor.__init__.DocumentHandler document_handler: method in
      which to parse a :class:`~stem.descriptor.networkstatus.NetworkStatusDocument`
    :param int version: consensus variant to retrieve (versions 2 or 3)
    :param bool microdescriptor: provides the microdescriptor consensus if
      **True**, standard consensus otherwise
    :param bool bridge: standard descriptors if **False**, bridge if **True**
    :param int timeout: timeout for downloading each individual archive when
      the connection becomes idle, no timeout applied if **None**
    :param int retries: maximum attempts to impose on a per-archive basis

    :returns: **iterator** of
      :class:`~stem.descriptor.router_status_entry.RouterStatusEntry`
      for the given time range

    :raises: :class:`~stem.DownloadFailed` if the download fails
    """

    if version == 3 and not microdescriptor and not bridge:
      desc_type = 'network-status-consensus-3'
    elif version == 3 and microdescriptor and not bridge:
      desc_type = 'network-status-microdesc-consensus-3'
    elif version == 2 and not microdescriptor and not bridge:
      desc_type = 'network-status-2'
    elif bridge:
      desc_type = 'bridge-network-status'
    else:
      if microdescriptor and version != 3:
        raise ValueError('Only v3 microdescriptors are available (not version %s)' % version)
      else:
        raise ValueError('Only v2 and v3 router status entries are available (not version %s)' % version)

    for f in self.files(desc_type, start, end):
      for desc in f.read(cache_to, desc_type, start, end, document_handler, timeout = timeout, retries = retries):
        yield desc

  def get_key_certificates(self, start = None, end = None, cache_to = None, timeout = None, retries = 3):
    """
    Directory authority key certificates for the given time range, sorted
    oldest to newest.

    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with
    :param str cache_to: directory to cache archives into, if an archive is
      available here it is not downloaded
    :param int timeout: timeout for downloading each individual archive when
      the connection becomes idle, no timeout applied if **None**
    :param int retries: maximum attempts to impose on a per-archive basis

    :returns: **iterator** of
      :class:`~stem.descriptor.networkstatus.KeyCertificate`
      for the given time range

    :raises: :class:`~stem.DownloadFailed` if the download fails
    """

    for f in self.files('dir-key-certificate-3', start, end):
      for desc in f.read(cache_to, 'dir-key-certificate-3', start, end, timeout = timeout, retries = retries):
        yield desc

  def get_bandwidth_files(self, start = None, end = None, cache_to = None, timeout = None, retries = 3):
    """
    Bandwidth authority heuristics for the given time range, sorted oldest to
    newest.

    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with
    :param str cache_to: directory to cache archives into, if an archive is
      available here it is not downloaded
    :param int timeout: timeout for downloading each individual archive when
      the connection becomes idle, no timeout applied if **None**
    :param int retries: maximum attempts to impose on a per-archive basis

    :returns: **iterator** of
      :class:`~stem.descriptor.bandwidth_file.BandwidthFile`
      for the given time range

    :raises: :class:`~stem.DownloadFailed` if the download fails
    """

    for f in self.files('bandwidth-file', start, end):
      for desc in f.read(cache_to, 'bandwidth-file', start, end, timeout = timeout, retries = retries):
        yield desc

  def get_exit_lists(self, start = None, end = None, cache_to = None, timeout = None, retries = 3):
    """
    `TorDNSEL exit lists <https://www.torproject.org/projects/tordnsel.html.en>`_
    for the given time range, sorted oldest to newest.

    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with
    :param str cache_to: directory to cache archives into, if an archive is
      available here it is not downloaded
    :param int timeout: timeout for downloading each individual archive when
      the connection becomes idle, no timeout applied if **None**
    :param int retries: maximum attempts to impose on a per-archive basis

    :returns: **iterator** of
      :class:`~stem.descriptor.tordnsel.TorDNSEL`
      for the given time range

    :raises: :class:`~stem.DownloadFailed` if the download fails
    """

    for f in self.files('tordnsel', start, end):
      for desc in f.read(cache_to, 'tordnsel', start, end, timeout = timeout, retries = retries):
        yield desc

  def index(self, compression = 'best'):
    """
    Provides the archives available in CollecTor.
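
    For example, a small sketch that peeks at the index metadata (assuming
    the index retains its published 'index_created' field)...

    ::

      import stem.descriptor.collector

      collector = stem.descriptor.collector.CollecTor()
      index = collector.index()

      print('index created: %s' % index['index_created'])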

    :param descriptor.Compression compression: compression type to
      download from, if undefined we'll use the best decompression available

    :returns: **dict** with the archive contents

    :raises:
      If unable to retrieve the index this provides...

      * **ValueError** if the json is malformed
      * **IOError** if unable to decompress
      * :class:`~stem.DownloadFailed` if the download fails
    """

    if not self._cached_index or time.time() - self._cached_index_at >= REFRESH_INDEX_RATE:
      if compression == 'best':
        for option in (Compression.LZMA, Compression.BZ2, Compression.GZIP, Compression.PLAINTEXT):
          if option.available:
            compression = option
            break
      elif compression is None:
        compression = Compression.PLAINTEXT

      extension = compression.extension if compression != Compression.PLAINTEXT else ''
      url = COLLECTOR_URL + 'index/index.json' + extension
      response = compression.decompress(stem.util.connection.download(url, self.timeout, self.retries))

      self._cached_index = json.loads(stem.util.str_tools._to_unicode(response))
      self._cached_index_at = time.time()

    return self._cached_index

  def files(self, descriptor_type = None, start = None, end = None):
    """
    Provides files CollecTor presently has, sorted oldest to newest.
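
    For example, a brief sketch listing the most recent exit list archives
    ('tordnsel' is the exit list's descriptor type)...

    ::

      import stem.descriptor.collector

      collector = stem.descriptor.collector.CollecTor()

      for f in collector.files('tordnsel')[-3:]:
        print('%s (%s bytes)' % (f.path, f.size))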

    :param str descriptor_type: descriptor type or prefix to retrieve
    :param datetime.datetime start: publication time to begin with
    :param datetime.datetime end: publication time to end with

    :returns: **list** of :class:`~stem.descriptor.collector.File`

    :raises:
      If unable to retrieve the index this provides...

      * **ValueError** if the json is malformed
      * **IOError** if unable to decompress
      * :class:`~stem.DownloadFailed` if the download fails
    """

    if not self._cached_files or time.time() - self._cached_index_at >= REFRESH_INDEX_RATE:
      self._cached_files = sorted(CollecTor._files(self.index(), []), key = lambda x: x.start if x.start else FUTURE)

    matches = []

    for f in self._cached_files:
      if start and (f.end is None or f.end < start):
        continue  # only contains descriptors before the time range
      elif end and (f.start is None or f.start > end):
        continue  # only contains descriptors after the time range

      if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in f.types]):
        matches.append(f)

    return matches

  @staticmethod
  def _files(val, path):
    """
    Recursively provides files within the index.

    :param dict val: index hash
    :param list path: path we've traversed into

    :returns: **list** of :class:`~stem.descriptor.collector.File`
    """

    if not isinstance(val, dict):
      return []  # leaf node without any files

    files = []

    for k, v in val.items():
      if k == 'files':
        for attr in v:
          file_path = '/'.join(path + [attr.get('path')])
          files.append(File(file_path, attr.get('types'), attr.get('size'), attr.get('sha256'), attr.get('first_published'), attr.get('last_published'), attr.get('last_modified')))
      elif k == 'directories':
        for attr in v:
          files.extend(CollecTor._files(attr, path + [attr.get('path')]))

    return files