Skip to content

Commit

Permalink
Merge pull request #6 from faradayio/collapse_duplicate_suffix
Browse files Browse the repository at this point in the history
Collapse duplicate suffixes  (fixes #2)
  • Loading branch information
emk authored Mar 27, 2020
2 parents e794032 + b092b6a commit 8f9a656
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 16 deletions.
2 changes: 0 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ script:
# Require clippy to pass without warnings. This also fails for regular Rust
# warnings.
- cargo clippy -- -D warnings
# Run our security, license and duplicate dependency version detector.
- cargo deny check
before_deploy:
#- cargo doc
- ./build-release geocode-csv "${TRAVIS_TAG}-${TRAVIS_OS_NAME}"
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ FROM ekidd/rust-musl-builder:stable-openssl11
ADD . ./
RUN sudo chown -R rust:rust .

CMD cargo build --release
CMD cargo deny check && cargo build --release
39 changes: 33 additions & 6 deletions src/addresses.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,43 @@ impl ColumnKeyOrKeys<usize> {
) -> Result<Cow<'a, str>> {
match self {
ColumnKeyOrKeys::Key(key) => Ok(Cow::Borrowed(&record[*key])),
ColumnKeyOrKeys::Keys(keys) => Ok(Cow::Owned(
keys.iter()
.map(|key| &record[*key])
.collect::<Vec<_>>()
.join(" "),
)),
ColumnKeyOrKeys::Keys(keys) => {
// Allocate an empty string with some reserved space so we maybe don't
// need to reallocate it every time we append.
let mut extracted = String::with_capacity(40);
for key in keys {
let s = &record[*key];
if extracted.is_empty() {
extracted.push_str(s);
} else if extracted.ends_with(s) {
// Already there, so ignore it. This appears in a lot of
// real-world databases, for some reason.
} else {
extracted.push_str(" ");
extracted.push_str(s);
}
}
Ok(Cow::Owned(extracted))
}
}
}
}

#[test]
fn extract_collapses_duplicate_suffixes() {
// This seems really arbitrary, but it consistently appears in many
// real-world databases.
//
// I wonder if the equivalent "prefix" case is common?
use std::iter::FromIterator;
let record = StringRecord::from_iter(&["100", "Main Street #302", "#302"]);
let keys = ColumnKeyOrKeys::Keys(vec![0, 1, 2]);
assert_eq!(
keys.extract_from_record(&record).unwrap(),
"100 Main Street #302",
);
}

/// The column names from a CSV file that we want to use as addresses.
///
/// `K` is typically either a `String` (for a column name) or a `usize` (for a
Expand Down
2 changes: 1 addition & 1 deletion src/async_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ where
let thr = thread::Builder::new().name(thread_name);
let handle = thr
.spawn(move || {
if let Err(_) = block_on(sender.send(f())) {
if block_on(sender.send(f())).is_err() {
panic!("should always be able to send results from background thread");
}
})
Expand Down
8 changes: 2 additions & 6 deletions src/geocoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ fn read_csv_from_stdin(
let chunk_size = max(1, GEOCODE_SIZE / spec.prefix_count());

// Build our output headers.
let mut out_headers = in_headers.clone();
let mut out_headers = in_headers;
for prefix in spec.prefixes() {
structure.add_header_columns(prefix, &mut out_headers)?;
}
Expand Down Expand Up @@ -275,11 +275,7 @@ fn read_csv_from_stdin(
// rows that haven't been sent yet.
if !sent_chunk || !rows.is_empty() {
trace!("sending final {} input rows", rows.len());
block_on(tx.send(Message::Chunk(Chunk {
shared: shared.clone(),
rows,
})))
.map_err(|_| {
block_on(tx.send(Message::Chunk(Chunk { shared, rows }))).map_err(|_| {
format_err!("could not send rows to geocoder (perhaps it failed)")
})?;
}
Expand Down

0 comments on commit 8f9a656

Please sign in to comment.