[tor-commits] [stem/master] Parse BandwidthFile as streams
atagar at torproject.org
atagar at torproject.org
Mon Jan 21 01:52:11 UTC 2019
commit f7575c6a1fb8b755a744c20318195c659a85d060
Author: Damian Johnson <atagar at torproject.org>
Date: Sun Jan 20 17:09:22 2019 -0800
Parse BandwidthFile as streams
String parsing was a great spot to start, but it is highly memory inefficient.
Parsing internally created multiple copies of our bandwidth file content as it
was processed, rather than working from a single copy of the bytes.
---
stem/descriptor/bandwidth_file.py | 50 ++++++++++++++++++++-------------------
1 file changed, 26 insertions(+), 24 deletions(-)
diff --git a/stem/descriptor/bandwidth_file.py b/stem/descriptor/bandwidth_file.py
index 9c87385c..2e9fc216 100644
--- a/stem/descriptor/bandwidth_file.py
+++ b/stem/descriptor/bandwidth_file.py
@@ -15,6 +15,7 @@ Parsing for Bandwidth Authority metrics as described in Tor's
"""
import datetime
+import io
import time
import stem.util.str_tools
@@ -91,16 +92,17 @@ def _parse_file(descriptor_file, validate = False, **kwargs):
def _parse_header(descriptor, entries):
header = {}
- lines = str(descriptor).split('\n')
+ content = io.BytesIO(descriptor.get_bytes())
- # skip the first line, which should be the timestamp
+ content.readline() # skip the first line, which should be the timestamp
- if lines and lines[0].isdigit():
- lines = lines[1:]
+ while True:
+ line = content.readline().strip()
- for line in lines:
- if line == HEADER_DIV:
- break
+ if not line:
+ break # end of the content
+ elif line == HEADER_DIV:
+ break # end of header
elif line.startswith('node_id='):
break # version 1.0 measurement
@@ -117,7 +119,7 @@ def _parse_header(descriptor, entries):
def _parse_timestamp(descriptor, entries):
- first_line = str(descriptor).split('\n', 1)[0]
+ first_line = io.BytesIO(descriptor.get_bytes()).readline().strip()
if first_line.isdigit():
descriptor.timestamp = datetime.datetime.utcfromtimestamp(int(first_line))
@@ -129,30 +131,30 @@ def _parse_body(descriptor, entries):
# In version 1.0.0 the body is everything after the first line. Otherwise
# it's everything after the header's divider.
- div = '\n' if descriptor.version == '1.0.0' else HEADER_DIV
+ content = io.BytesIO(descriptor.get_bytes())
- if div in str(descriptor):
- body = str(descriptor).split(div, 1)[1].strip()
+ if descriptor.version == '1.0.0':
+ content.readline() # skip the first line
else:
- body = ''
+ while content.readline().strip() != HEADER_DIV:
+ pass # skip the header
measurements = {}
- if body:
- for line in body.split('\n'):
- attr = dict(_mappings_for('measurement', line))
+ for line in content.readlines():
+ attr = dict(_mappings_for('measurement', line.strip()))
- if 'node_id' not in attr:
- raise ValueError("Every meaurement must include 'node_id': %s" % line)
- elif attr['node_id'] in measurements:
- # Relay is listed multiple times. This is a bug for the bandwidth
- # authority that made this descriptor, but according to the spec
- # should be ignored by parsers.
+ if 'node_id' not in attr:
+ raise ValueError("Every meaurement must include 'node_id': %s" % line.strip())
+ elif attr['node_id'] in measurements:
+ # Relay is listed multiple times. This is a bug for the bandwidth
+ # authority that made this descriptor, but according to the spec
+ # should be ignored by parsers.
- continue
+ continue
- fingerprint = attr['node_id'].lstrip('$') # bwauths prefix fingerprints with '$'
- measurements[fingerprint] = attr
+ fingerprint = attr['node_id'].lstrip('$') # bwauths prefix fingerprints with '$'
+ measurements[fingerprint] = attr
descriptor.measurements = measurements
More information about the tor-commits
mailing list