How does Microsoft handle the fact that UTF-16 is a variable-length encoding in their C++ standard library implementation?

前端未结

关注

 5  2456

小蘑菇 2021-02-20 08:48

Having a variable length encoding is indirectly forbidden in the standard.

So I have several questions:

How is the following part of the standard handled?

5条回答

渐次进展 (楼主)

2021-02-20 09:25

MSVC stores wchar_t in wstrings. These can be interpreted as 16-bit Unicode (UTF-16) code units, or as anything else, really.

If you want to get access to unicode characters or glyphs, you'll have to process said raw string by the unicode standard. You probably also want to handle common corner cases without breaking.

Here is a sketch of such a library. It is about half as memory-efficient as it could be, but it does give you in-place access to Unicode glyphs in a std::wstring. It relies on having a decent array_view class, but you want to write one of those anyhow:

struct unicode_char : array_view {
  using array_view::array_view;

  uint32_t value() const {
    if (size()==1)
      return front();
    Assert(size()==2);
    if (size()==2)
    {
      wchar_t high = front()-0xD800;
      wchar_T low = back()-0xDC00;
      return (uint32_t(high)<<10) + uint32_t(low);
    }
    return 0; // error
  }
  static bool is_high_surrogate( wchar_t c ) {
    return (c >= 0xD800 && c <= 0xDBFF);
  }
  static bool is_low_surrogate( wchar_t c ) {
    return (c >= 0xDC00 && c <= 0xDFFF);
  }
  static unicode_char extract( array_view raw )
  {
    if (raw.empty())
      return {};
    if (raw.size()==1)
      return raw;
    if (is_high_surrogate(raw.front()) && is_low_surrogate(*std::next(raw.begin())))
      return {raw.begin(), raw.begin()+2);
    return {raw.begin(), std::next(raw.begin())};
  }
};
static std::vector as_unicode_chars( array_view raw )
{
  std::vector retval;
  retval.reserve( raw.size() ); // usually 1:1
  while(!raw.empty())
  {
    retval.push_back( unicode_char::extract(raw) );
    Assert( retval.back().size() <= raw.size() );
    raw = {raw.begin() + retval.back().size(), raw.end()};
  }
  return retval;
}
struct unicode_glyph {
  std::array< unicode_char, 3 > buff;
  std::size_t count=0;
  unicode_char const* begin() const {
    return buff.begin();
  }
  unicode_char const* end() const {
    return buff.begin()+count;
  }
  std::size_t size() const { return count; }
  bool empty() { return size()==0; }
  unicode_char const& front() const { return *begin(); }
  unicode_char const& back() const { return *std::prev(end()); }
  array_view< unicode_char const > chars() const { return {begin(), end()}; }
  array_view< wchar_t const > wchars() const {
    if (empty()) return {};
    return { front().begin(), back().end() };
  }

  void append( unicode_char next ) {
    Assert(count<3);
    buff[count++] = next;
  }
  unicode_glyph() {}

  static bool is_diacrit(unicode_char c) const {
    auto v = c.value();
    return is_diacrit(v);
  }
  static bool is_diacrit(uint32_t v) const {
    return
      ((v >= 0x0300) && (v <= 0x0360))
    || ((v >= 0x1AB0) && (v <= 0x1AFF))
    || ((v >= 0x1DC0) && (v <= 0x1DFF))
    || ((v >= 0x20D0) && (v <= 0x20FF))
    || ((v >= 0xFE20) && (v <= 0xFE2F));
  }
  static size_t diacrit_count(unicode_char c) const {
    auto v = c.value();
    if (is_diacrit(v))
      return 1 + ((v >= 0x035C)&&(v<=0x0362));
    else
      return 0;
  }
  static unicode_glyph extract( array_view raw ) {
    unicode_glyph retval;
    if (raw.empty())
      return retval;
    if (raw.size()==1)
    {
      retval.append(raw.front());
      return retval;
    }
    retval.count = diacrit_count( *std::next(raw.begin()) )+1;
    std::copy( raw.begin(), raw.begin()+retval.count, retval.buff.begin() );
    return retval;
  }
};
static std::vector as_unicode_glyphs( array_view raw )
{
  std::vector retval;
  retval.reserve( raw.size() ); // usually 1:1
  while(!raw.empty())
  {
    retval.push_back( unicode_glyph::extract(raw) );
    Assert( retval.back().size() <= raw.size() );
    raw = {raw.begin() + retval.back().size(), raw.end()};
  }
  return retval;
}
static std::vector as_unicode_glyphs( array_view raw )
{
  return as_unicode_glyphs( as_unicode_chars( raw ) );
}

A smarter bit of code would generate the unicode_chars and unicode_glyphs on the fly with a factory iterator of some kind. A more compact implementation would keep track of the fact that the end pointer of the previous element and the begin pointer of the next are always identical, and alias them together. Another optimization would be to use a small-object optimization on the glyph, based on the assumption that most glyphs are one character, and use dynamic allocation if they are two.

Note that I treat CGJ as a standard diacrit, and the double-diacrits as a set of 3 characters that form one (unicode), but half-diacrits don't merge things into one glyph. These are all questionable choices.

This was written in a bout of insomnia. Hope it at least somewhat works.

0 讨论(0)

查看其它5个回答