Regex to remove single-line SQL comments (--)

前端 未结 7 986
悲哀的现实
悲哀的现实 2021-01-15 08:52

Question:

Can anybody give me a working regex expression (C#/VB.NET) that can remove single line comments from a SQL statement ?

I mean these comments:

7条回答
  •  时光说笑
    2021-01-15 09:27

    As a late solution, the simplest way is to do it using ScriptDom-TSqlParser:

    // https://michaeljswart.com/2014/04/removing-comments-from-sql/
    // http://web.archive.org/web/*/https://michaeljswart.com/2014/04/removing-comments-from-sql/
    public static string StripCommentsFromSQL(string SQL)
    {
        Microsoft.SqlServer.TransactSql.ScriptDom.TSql150Parser parser = 
            new Microsoft.SqlServer.TransactSql.ScriptDom.TSql150Parser(true);
    
        System.Collections.Generic.IList errors;
    
    
        Microsoft.SqlServer.TransactSql.ScriptDom.TSqlFragment fragments = 
            parser.Parse(new System.IO.StringReader(SQL), out errors);
    
        // clear comments
        string result = string.Join(
          string.Empty,
          fragments.ScriptTokenStream
              .Where(x => x.TokenType != Microsoft.SqlServer.TransactSql.ScriptDom.TSqlTokenType.MultilineComment)
              .Where(x => x.TokenType != Microsoft.SqlServer.TransactSql.ScriptDom.TSqlTokenType.SingleLineComment)
              .Select(x => x.Text));
    
        return result;
    
    }
    

    or instead of using the Microsoft-Parser, you can use ANTL4 TSqlLexer

    or without any parser at all:

    private static System.Text.RegularExpressions.Regex everythingExceptNewLines = 
        new System.Text.RegularExpressions.Regex("[^\r\n]");
    
    
    // http://drizin.io/Removing-comments-from-SQL-scripts/
    // http://web.archive.org/web/*/http://drizin.io/Removing-comments-from-SQL-scripts/
    public static string RemoveComments(string input, bool preservePositions, bool removeLiterals = false)
    {
        //based on http://stackoverflow.com/questions/3524317/regex-to-strip-line-comments-from-c-sharp/3524689#3524689
        var lineComments = @"--(.*?)\r?\n";
        var lineCommentsOnLastLine = @"--(.*?)$"; // because it's possible that there's no \r\n after the last line comment
                                                  // literals ('literals'), bracketedIdentifiers ([object]) and quotedIdentifiers ("object"), they follow the same structure:
                                                  // there's the start character, any consecutive pairs of closing characters are considered part of the literal/identifier, and then comes the closing character
        var literals = @"('(('')|[^'])*')"; // 'John', 'O''malley''s', etc
        var bracketedIdentifiers = @"\[((\]\])|[^\]])* \]"; // [object], [ % object]] ], etc
        var quotedIdentifiers = @"(\""((\""\"")|[^""])*\"")"; // "object", "object[]", etc - when QUOTED_IDENTIFIER is set to ON, they are identifiers, else they are literals
                                                              //var blockComments = @"/\*(.*?)\*/";  //the original code was for C#, but Microsoft SQL allows a nested block comments // //https://msdn.microsoft.com/en-us/library/ms178623.aspx
    
        //so we should use balancing groups // http://weblogs.asp.net/whaggard/377025
        var nestedBlockComments = @"/\*
                             (?>
                             /\*  (?)      # On opening push level
                             | 
                             \*/ (?<-LEVEL>)     # On closing pop level
                             |
                             (?! /\* | \*/ ) . # Match any char unless the opening and closing strings   
                             )+                         # /* or */ in the lookahead string
                             (?(LEVEL)(?!))             # If level exists then fail
                             \*/";
    
        string noComments = System.Text.RegularExpressions.Regex.Replace(input,
            nestedBlockComments + "|" + lineComments + "|" + lineCommentsOnLastLine + "|" + literals + "|" + bracketedIdentifiers + "|" + quotedIdentifiers,
            me => {
                if (me.Value.StartsWith("/*") && preservePositions)
                    return everythingExceptNewLines.Replace(me.Value, " "); // preserve positions and keep line-breaks // return new string(' ', me.Value.Length);
         else if (me.Value.StartsWith("/*") && !preservePositions)
                    return "";
                else if (me.Value.StartsWith("--") && preservePositions)
                    return everythingExceptNewLines.Replace(me.Value, " "); // preserve positions and keep line-breaks
         else if (me.Value.StartsWith("--") && !preservePositions)
                    return everythingExceptNewLines.Replace(me.Value, ""); // preserve only line-breaks // Environment.NewLine;
         else if (me.Value.StartsWith("[") || me.Value.StartsWith("\""))
                    return me.Value; // do not remove object identifiers ever
         else if (!removeLiterals) // Keep the literal strings
             return me.Value;
                else if (removeLiterals && preservePositions) // remove literals, but preserving positions and line-breaks
         {
                    var literalWithLineBreaks = everythingExceptNewLines.Replace(me.Value, " ");
                    return "'" + literalWithLineBreaks.Substring(1, literalWithLineBreaks.Length - 2) + "'";
                }
                else if (removeLiterals && !preservePositions) // wrap completely all literals
             return "''";
                else
                    throw new System.NotImplementedException();
            },
            System.Text.RegularExpressions.RegexOptions.Singleline | System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace);
        return noComments;
    }
    

提交回复
热议问题