Delphi - strip out all non-standard text characters from a string

前端 未结 6 1387
-上瘾入骨i
-上瘾入骨i 2021-02-13 14:52

I need to strip out all non-standard text characters from a string. I need to remove all non-ASCII and control characters (except line feeds/carriage returns).

相关标签:
6条回答
  • 2021-02-13 15:15

    if you don't need to do it in-place, but generating a copy of the string, try this code

     { Set of characters handed to StripCharsInSet as the "remove these" list. }
     type CharSet=Set of Char;

     { Returns a copy of s from which every character contained in c has been
       dropped. The remaining characters keep their original order.
       The result is built by appending one character at a time. }
     function StripCharsInSet(s:string; c:CharSet):string;
      var
        idx: Integer;
        ch: Char;
      begin
         Result := '';
         for idx := 1 to Length(s) do
          begin
           ch := s[idx];
           // Skip anything the caller asked to have removed.
           if ch in c then
             Continue;
           Result := Result + ch;
          end;
      end;
    

    and use it like this

     // Remove the control characters #0..#31 and #127 (DEL), but leave
     // #10 (line feed) and #13 (carriage return) in place: they are the only
     // codes below #32 missing from the set.
     s := StripCharsInSet(s,[#0..#9,#11,#12,#14..#31,#127]);
    

    EDIT: added #127 for DEL ctrl char.

    EDIT2: this is a faster version, thanks ldsandon

     { Returns a copy of s without the characters contained in c.
       The result buffer is sized once for the worst case (nothing removed),
       filled in a single pass and then trimmed to the number of characters
       actually kept. }
     function StripCharsInSet(s:string; c:CharSet):string;
      var
        src, kept: Integer;
      begin
         // Reserve room for the worst case up front.
         SetLength(Result, Length(s));
         kept := 0;
         src := 1;
         while src <= Length(s) do
          begin
           if not (s[src] in c) then
            begin
             Inc(kept);
             Result[kept] := s[src];
            end;
           Inc(src);
          end;
         // Drop the unused tail of the buffer.
         SetLength(Result, kept);
      end;  
    
    0 讨论(0)
  • 2021-02-13 15:19

    My performance-oriented solution:

    { Returns a copy of AStr that contains only the characters in the range
      #32..#127 plus any characters listed in AIgnoreChars.

      AStr         - text to filter.
      AIgnoreChars - additional characters to keep, e.g. [#13, #10] to
                     preserve line breaks.

      Characters above #255 are never kept, because CharInSet only matches
      single-byte characters. }
    function StripNonAnsiChars(const AStr: String; const AIgnoreChars: TSysCharSet): string;
    var
      lBuilder: TStringBuilder;
      lKeep: TSysCharSet;
      lCh: Char;
    begin
      // Build the "keep" set once instead of on every loop iteration.
      lKeep := [#32..#127] + AIgnoreChars;
      // The result can never be longer than the input, so reserve that much.
      lBuilder := TStringBuilder.Create(Length(AStr));
      try
        // for-in avoids any assumption about 1-based vs. 0-based string indexing.
        for lCh in AStr do
          if CharInSet(lCh, lKeep) then
            lBuilder.Append(lCh);
        Result := lBuilder.ToString;
      finally
        lBuilder.Free;
      end;
    end;
    

    I wrote this with Delphi XE7.

    0 讨论(0)
  • 2021-02-13 15:24

    And here's a variant of Cosmin's that only walks the string once, but uses an efficient allocation pattern:

    { Returns s reduced to the characters #32..#127 plus line feed (#10) and
      carriage return (#13). The output buffer is allocated once at full
      length, filled in a single pass and shrunk to the kept length. }
    function StrippedOfNonAscii(const s: string): string;
    var
      Src, Kept: Integer;
    begin
      SetLength(Result, Length(s));
      Kept := 0;
      Src := 1;
      while Src <= Length(s) do
      begin
        case s[Src] of
          #10, #13, #32..#127:
            begin
              Inc(Kept);
              Result[Kept] := s[Src];
            end;
        end;
        Inc(Src);
      end;
      // Cut off the part of the buffer that was never written.
      SetLength(Result, Kept);
    end;
    
    0 讨论(0)
  • 2021-02-13 15:25

    Something like this should do:

    // For those who need a disclaimer:
    // This code is meant as a sample to show how the basic check for non-ASCII characters goes.
    // It performs poorly on long strings or when called often, because it appends char by char.
    // Use a TStringBuilder, or SetLength & an Integer loop index, to optimize.
    // If you need really optimized code, pass this on to the FastCode people.
    //
    // Returns Value reduced to the characters #32..#127 plus carriage return (#13)
    // and line feed (#10); every other character is dropped.
    function StripNonAsciiExceptCRLF(const Value: AnsiString): AnsiString;
    var
      AnsiCh: AnsiChar;
    begin
      // A string Result is not guaranteed to be empty on entry, so clear it
      // before appending to it.
      Result := '';
      for AnsiCh in Value do
        // CR and LF lie below #32, so they must be OR-ed in as exceptions;
        // AND-ing "<> #13" / "<> #10" onto the range test would strip them.
        if ((AnsiCh >= #32) and (AnsiCh <= #127)) or (AnsiCh = #13) or (AnsiCh = #10) then
          Result := Result + AnsiCh;
    end;
    

    For UnicodeString you can do something similar.

    0 讨论(0)
  • 2021-02-13 15:35

    Here's a version that doesn't build the string by appending char-by-char, but allocates the whole string in one go. It requires going over the string twice, once to count the "good" char, once to effectively copy those chars, but it's worth it because it doesn't do multiple reallocations:

    { Returns s reduced to the characters #32..#127 plus line feed (#10) and
      carriage return (#13).

      Works in two passes: the first counts the characters to keep, the second
      copies them into a result allocated at exactly that size. When nothing
      has to be removed, s itself is returned and no new string is allocated. }
    function StripNonAscii(s:string):string;
    var
      SrcIdx, Kept, OutPos: Integer;

      // True for the characters that survive the filter.
      function IsAllowed(ch: Char): Boolean;
      begin
        case ch of
          #10, #13, #32..#127: Result := True;
        else
          Result := False;
        end;
      end;

    begin
      // Pass 1: count the survivors.
      Kept := 0;
      for SrcIdx := 1 to Length(s) do
        if IsAllowed(s[SrcIdx]) then
          Inc(Kept);

      if Kept = Length(s) then
        Result := s // Nothing to strip: hand back the original string.
      else
        begin
          // Pass 2: copy the survivors into an exactly-sized buffer.
          SetLength(Result, Kept);
          OutPos := 0;
          for SrcIdx := 1 to Length(s) do
            if IsAllowed(s[SrcIdx]) then
            begin
              Inc(OutPos);
              Result[OutPos] := s[SrcIdx];
            end;
        end;
    end;
    
    0 讨论(0)
  • 2021-02-13 15:39

    My version, which takes and returns an array of bytes:

    interface

    type
      // Dynamic array of bytes; the return type of the byte-based StripNonAscii.
      TSBox = array of byte;
    

    and the function :

    { Returns the bytes of buf whose value lies in 32..127, in their original
      order. An empty buf yields an empty array. }
    function StripNonAscii(buf: array of byte): TSBox;
    var
      src, kept: Integer;
    begin
      // Size the result for the worst case (every byte kept). For an empty
      // buf this yields an empty array, so no early exit is needed and Result
      // is always assigned.
      SetLength(Result, Length(buf));
      kept := 0;
      // Open arrays are zero-based: the last valid index is High(buf), not
      // Length(buf). Looping up to Length(buf) reads one element past the end.
      for src := 0 to High(buf) do
        if (buf[src] >= 32) and (buf[src] <= 127) then
        begin
          Result[kept] := buf[src];
          Inc(kept); // count of valid bytes copied so far
        end;
      // Trim the result to the number of bytes actually kept.
      SetLength(Result, kept);
    end;
    
    0 讨论(0)
提交回复
热议问题