MarkLogic log files denote multiline messages by printing each line of the message with the same timestamp, down to the millisecond. You can see that in this snippet:
2014-05-09 18:
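For illustration, a multiline entry looks something like this (these lines are invented, but they follow the ErrorLog shape of timestamp, level, and text):

2014-05-09 18:03:21.841 Notice: XDMP-EXTIME: Time limit exceeded
2014-05-09 18:03:21.841 Notice:   in /app/lib/search.xqy, at 42:10
2014-05-09 18:03:21.841 Notice:   $query = "example"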
This seemed like an interesting problem, so I wrote a plugin for it, loosely based on the multiline filter. There are some complications, though: the plugin also needs a fix for a bug in the core Logstash code, shown at the end of this post.
Extract the following into lib/logstash/filters/related.rb:
# encoding: utf-8
require "logstash/filters/base"
require "logstash/namespace"
require "set"
#
# This filter collapses multiline messages from a single source into one
# Logstash event if they are related, based on a configured pattern. Two
# consecutive events are related if the pattern's named capture returns the
# same value for both.
#
# The config looks like this:
#
#     filter {
#       related {
#         type => "type"
#         pattern => "^%{TIMESTAMP:time}"
#         capture => "TIMESTAMP:time"
#       }
#     }
#
# `pattern` is the grok pattern used to match each line.
# `capture` is the named capture within `pattern` whose value has to match
# between consecutive lines.
#
class LogStash::Filters::Related < LogStash::Filters::Base
  config_name "related"
  milestone 1

  # The grok pattern to match.
  config :pattern, :validate => :string, :required => true

  # The named capture (from `pattern`) whose value relates consecutive events.
  config :capture, :validate => :string, :required => true
  # The stream identity is how this filter determines which stream an
  # event belongs to. This is generally used for differentiating, say, events
  # coming from multiple files in the same file input, or multiple connections
  # coming from a tcp input.
  #
  # The default value here is usually what you want, but there are some cases
  # where you want to change it. One such example is if you are using a tcp
  # input with only one client connecting at any time. If that client
  # reconnects (due to error or client restart), then logstash will identify
  # the new connection as a new stream and break any multiline goodness that
  # may have occurred between the old and new connection. To solve this use
  # case, you can use "%{@source_host}.%{@type}" instead.
  config :stream_identity, :validate => :string, :default => "%{host}.%{path}.%{type}"
  # Logstash ships by default with a bunch of patterns, so you don't
  # necessarily need to define this yourself unless you are adding additional
  # patterns.
  #
  # Pattern files are plain text with format:
  #
  #     NAME PATTERN
  #
  # For example:
  #
  #     NUMBER \d+
  config :patterns_dir, :validate => :array, :default => []
  # Detect if we are running from a jarfile; pick the right path.
  @@patterns_path = Set.new
  if __FILE__ =~ /file:\/.*\.jar!.*/
    @@patterns_path += ["#{File.dirname(__FILE__)}/../../patterns/*"]
  else
    @@patterns_path += ["#{File.dirname(__FILE__)}/../../../patterns/*"]
  end
  public
  def initialize(config = {})
    super

    @threadsafe = false

    # This filter needs to keep state per stream: @pending holds the event
    # currently being accumulated, @previous holds the capture value that
    # started it, and @pending_unmatched holds a collapsed event waiting to
    # be emitted ahead of the next event.
    @types = Hash.new { |h,k| h[k] = [] }
    @pending_unmatched = Hash.new
    @pending = Hash.new
    @previous = Hash.new
  end # def initialize
  public
  def register
    require "grok-pure" # rubygem 'jls-grok'
    @grok = Grok.new

    @patterns_dir = @@patterns_path.to_a + @patterns_dir
    @patterns_dir.each do |path|
      # Can't read relative paths from jars, try to normalize away '../'
      while path =~ /file:\/.*\.jar!.*\/\.\.\//
        # replace /foo/bar/../baz => /foo/baz
        path = path.gsub(/[^\/]+\/\.\.\//, "")
      end

      if File.directory?(path)
        path = File.join(path, "*")
      end

      Dir.glob(path).each do |file|
        @logger.info("Grok loading patterns from file", :path => file)
        @grok.add_patterns_from_file(file)
      end
    end

    @grok.compile(@pattern)

    @logger.debug("Registered related plugin", :type => @type, :config => @config)
  end # def register
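
  # Per stream, a matching line either starts a pending event, is appended to
  # it (same capture value), or flushes it and starts a new one (different
  # capture value). A non-matching line flushes the pending event and is held
  # in @pending_unmatched so it can be emitted ahead of the next event.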
  public
  def filter(event)
    key = event.sprintf(@stream_identity)

    # If there's anything pending unmatched, we need to push it out
    # and then push the current event back onto the filter queue (yield).
    if @pending_unmatched[key]
      @logger.info("Related", :unmatched => key)
      clone = event.clone
      event.overwrite(@pending_unmatched[key])
      @pending_unmatched.delete(key)
      yield clone
      return
    end

    return unless filter?(event)

    if event["message"].is_a?(Array)
      match = @grok.match(event["message"].first)
    else
      match = @grok.match(event["message"])
    end

    pending = @pending[key]

    @logger.debug("Related", :pattern => @pattern, :message => event["message"],
                  :match => match, :capture => @capture)

    if !match
      # A non-matching line closes out any pending event: emit the collapsed
      # event now and hold this line until the next event comes through.
      if pending
        @pending_unmatched[key] = event.clone
        event.overwrite(pending)
        @pending.delete(key)
        collapse_event!(event)
        filter_matched(event)
      end
      return
    end

    # From here on out, we've matched.
    if pending
      if match.captures[@capture] == @previous[key]
        # Same capture value as the pending event: append and swallow this line.
        pending.append(event)
        pending.tag "related"
        event.cancel
      else
        # New capture value: emit the collapsed pending event and start over.
        @pending[key] = event.clone
        @previous[key] = match.captures[@capture]
        event.overwrite(pending)
        collapse_event!(event)
        filter_matched(event)
      end
    else
      # First matching line for this stream: hold it and wait for relatives.
      @pending[key] = event
      @previous[key] = match.captures[@capture]
      event.cancel
    end
  end # def filter
  # Flush any pending messages. This is generally used for unit testing only.
  #
  # Note: flush is disabled for now; it is preferable to use the multiline codec.
  public
  def __flush
    events = []
    @pending.each do |key, value|
      value.uncancel
      events << value
    end
    @pending.clear
    return events
  end # def __flush
  def collapse_event!(event)
    event["message"] = event["message"].join("\n") if event["message"].is_a?(Array)
    event["@timestamp"] = event["@timestamp"].first if event["@timestamp"].is_a?(Array)
    event
  end # def collapse_event!
end # class LogStash::Filters::Related
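To see it work end to end, here's a rough driver. This is only a sketch: it assumes a Logstash 1.4 checkout on the load path, and the sample lines, pattern, and `marklogic` type are all invented. Note that it only behaves correctly once the event.rb fix below is in place.

# smoke_test.rb -- run with Logstash 1.4's bundled JRuby.
require "logstash/filters/related"
require "logstash/event"

filter = LogStash::Filters::Related.new(
  "pattern" => "^%{TIMESTAMP_ISO8601:time}",
  "capture" => "TIMESTAMP_ISO8601:time"
)
filter.register

lines = [
  "2014-05-09 18:00:00.550 Info: first line of an entry",
  "2014-05-09 18:00:00.550 Info: second line of the same entry",
  "2014-05-09 18:00:01.000 Info: a new, unrelated entry",
]
events = lines.map { |l| LogStash::Event.new("message" => l, "type" => "marklogic") }
events.each { |e| filter.filter(e) }

# The first two lines share a timestamp, so when the third line arrives the
# collapsed event is emitted in its place (the third line becomes pending):
puts events[2]["message"]
# => 2014-05-09 18:00:00.550 Info: first line of an entry
#    2014-05-09 18:00:00.550 Info: second line of the same entry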
Then modify lib/logstash/event.rb, adding the line called out in the comment below:
public
def overwrite(event)
  @data = event.to_hash
  # convert timestamp if it is a String
  if @data[TIMESTAMP].is_a?(String)
    @data[TIMESTAMP] = LogStash::Time.parse_iso8601(@data[TIMESTAMP])
  end

  # Add the line below. It is needed because when we overwrite data["message"]
  # with an array, we have to regenerate the accessors; otherwise lookups keep
  # using the message from before it was overwritten.
  @accessors = LogStash::Util::Accessors.new(@data)
end
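Without that line, the Accessors cache built for the old @data hash keeps answering lookups against stale data. A contrived sketch of the failure mode (assuming the Logstash 1.4 Event API):

e = LogStash::Event.new("message" => "old line")
e["message"]   # primes the accessor cache against the old @data hash

e.overwrite(LogStash::Event.new("message" => ["line one", "line two"]))
e["message"]   # without the fix: still "old line"; with it: ["line one", "line two"]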