From 79c26253bbd7d78156b85fc3014c16786edc15af Mon Sep 17 00:00:00 2001 From: Till Höppner Date: Thu, 25 Feb 2016 18:08:08 +0100 Subject: Merge alias to ease usage of sort+dedup --- Cargo.toml | 9 ++++++--- cli/src/lib.rs | 20 ++++++++++++++++++++ ops/src/freq.rs | 5 +++++ ops/src/lib.rs | 26 +++++++++++++++++++++++++- 4 files changed, 56 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0178b8b..44479cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,6 @@ [package] authors = ["Till Hoeppner "] description = "IRC log converter/collector/cruncher" -homepage = "https://github.com/tilpner/ilc" license = "Apache-2.0" name = "ilc" repository = "https://github.com/tilpner/ilc" @@ -13,9 +12,13 @@ doc = false [dependencies] ilc-cli = "0.1.0" +ilc-base = "0.1.0" +ilc-ops = "0.1.0" +ilc-format-weechat = "0.1.0" +ilc-format-energymech = "0.1.0" [profile.release] -debug = false +debug = true debug-assertions = false -lto = true +# lto = true opt-level = 3 diff --git a/cli/src/lib.rs b/cli/src/lib.rs index 0f91b89..40bc2e7 100644 --- a/cli/src/lib.rs +++ b/cli/src/lib.rs @@ -127,6 +127,9 @@ pub fn main() { .subcommand(SubCommand::with_name("sort").about("Sorts a log by time")) .subcommand(SubCommand::with_name("dedup") .about("Removes duplicate log entries in close proximity")) + .subcommand(SubCommand::with_name("merge") + .about("Merges the input logs. This has to keep everything \ + in memory")) .get_matches(); let res = match args.subcommand() { @@ -177,6 +180,23 @@ pub fn main() { &mut *e.output(), &*e.encoder()) } + ("merge", Some(args)) => { + // TODO: avoid (de-)serialization to weechat + let e = Environment(&args); + let (ctx, i, d, o, e) = (&e.context(), + &mut e.input(), + &mut *e.decoder(), + &mut *e.output(), + &*e.encoder()); + let mut buffer = Vec::new(); + match sort::sort(ctx, i, d, &mut buffer, &Weechat) { + Err(e) => error(Box::new(e)), + _ => (), + } + let mut read = io::Cursor::new(&buffer); + dedup::dedup(ctx, &mut read, &mut Weechat, o, e) + + } (sc, _) if !sc.is_empty() => panic!("Unimplemented subcommand `{}`, this is a bug", sc), _ => die("No command specified"), }; diff --git a/ops/src/freq.rs b/ops/src/freq.rs index c5b363a..4a02d4c 100644 --- a/ops/src/freq.rs +++ b/ops/src/freq.rs @@ -1,3 +1,5 @@ +//! Per-nick word/line statistics + use ilc_base::{self, Context, Decode, Event}; use ilc_base::event::Type; @@ -35,6 +37,9 @@ fn strip_nick_prefix(s: &str) -> &str { } // TODO: Don't print results, return Stats struct +/// Return the `count` most active nicks, with lines, words and words per lines calculated. +/// +/// `usize::MAX` is a good default if you don't want to cap the statistics. pub fn freq(count: usize, ctx: &Context, input: &mut BufRead, diff --git a/ops/src/lib.rs b/ops/src/lib.rs index 8cd5607..e5d92cb 100644 --- a/ops/src/lib.rs +++ b/ops/src/lib.rs @@ -4,9 +4,13 @@ extern crate ilc_base; mod ageset; pub mod freq; +/// No-op log parsing pub mod parse { use ilc_base::{self, Context, Decode}; use std::io::BufRead; + + /// Simply parse the input, without further validation or conversion. No information is stored. + /// This will return `Err` if the decoder yields `Err`. pub fn parse(ctx: &Context, input: &mut BufRead, decoder: &mut Decode) -> ilc_base::Result<()> { for e in decoder.decode(&ctx, input) { try!(e); @@ -15,10 +19,14 @@ pub mod parse { } } +/// Log format conversion pub mod convert { use ilc_base::{self, Context, Decode, Encode}; use std::io::{BufRead, Write}; + /// Convert from one format to another, not necessarily different, format. In combination with a + /// timezone offset, this can be used to correct the timestamps. + /// Will return `Err` and abort conversion if the decoder yields `Err` or re-encoding fails. pub fn convert(ctx: &Context, input: &mut BufRead, decoder: &mut Decode, @@ -32,10 +40,14 @@ pub mod convert { } } +/// Last-seen of nicks pub mod seen { use ilc_base::{self, Context, Decode, Encode, Event}; use std::io::{BufRead, Write}; + /// Return the last message of a given nickname, searching from the beginning of the logs. + /// Will return `Err` if the decoder yields `Err`. This relies on absolute timestamps, and + /// behaviour without full dates is undefined. pub fn seen(nick: &str, ctx: &Context, input: &mut BufRead, @@ -59,10 +71,17 @@ pub mod seen { } } +/// Internal (as opposed to external, not to be confused with private) log sorting pub mod sort { use ilc_base::{self, Context, Decode, Encode, Event}; use std::io::{BufRead, Write}; + /// **Memory-intensive** + /// Sort the input, discarding faulty events. This will + /// read *all events* into memory, then sort them by time and write them back. + /// Behaviour is undefined if events lack full date information. + /// + /// *This should be an external merge-sort, but is a placeholder until implementation* pub fn sort(ctx: &Context, input: &mut BufRead, decoder: &mut Decode, @@ -81,6 +100,7 @@ pub mod sort { } } +/// Event deduplication pub mod dedup { use std::io::{BufRead, Write}; use std::hash::{Hash, Hasher}; @@ -88,7 +108,7 @@ pub mod dedup { use ilc_base::{self, Context, Decode, Encode, Event}; #[derive(Clone, Debug, PartialEq, Eq)] - pub struct NoTimeHash<'a>(pub Event<'a>); + struct NoTimeHash<'a>(pub Event<'a>); impl<'a> Hash for NoTimeHash<'a> { fn hash(&self, state: &mut H) @@ -99,6 +119,10 @@ pub mod dedup { } } + /// Deduplicate subsequent identical elements, e.g. after a sorting + /// operation. This will **not** read all events into memory, and only + /// operate on a short window of events. Therefore, it'll only work correctly + /// on sorted or very short logs. pub fn dedup(ctx: &Context, input: &mut BufRead, decoder: &mut Decode, -- cgit v1.2.3