问题
I am copying data from MySQL to SQL Server using a linked server.
-- Read the comment column from the MySQL table through the linked server.
SELECT comment
FROM OPENQUERY(my_linked_server, 'SELECT comment FROM search_data');
The text in the MySQL table column is xxx 🤘 xxx
. By the time I receive it in SQL Server it is xxx ðŸ¤˜ xxx
. The MySQL table is utf8mb4
, and I have set up the ODBC config for the linked server to use this. I am using MySQL ODBC 5.3.13
Any advice would be appreciated. The SQL Server version is 2016. I have seen examples that do
select N'🤘'
etc, but don't know how to apply this to the above query.
ðŸ¤˜ seems to be 4 characters
ð = u00f0 , dec = 240
Ÿ = u0178 , dec = 376
¤ = u00a4 , dec = 164
˜ = u02dc , dec = 732
🤘= ud83e, dec = 55358
Funnily enough, this doesn't even work:
select nchar(unicode(N'🤘')),unicode(N'🤘')
returning � symbol
回答1:
The Unicode codepoint of the character 🤘 is U+1F918, which means it is outside the Basic Multilingual Plane (BMP) of Unicode, which covers codepoints up to U+FFFF.
To process Unicode characters outside the BMP, you need to apply collations supporting Supplementary Characters, named as *_SC
:
SQL Server 2012 (11.x) introduced a new family of supplementary character (_SC) collations that can be used with the nchar, nvarchar, and sql_variant data types to represent the full Unicode character range (000000–10FFFF)
Compare the results of this SQL statement
SELECT
    -- Round trip through UNICODE()/NCHAR() with a supplementary-character collation on the literal.
    NCHAR(UNICODE(N'🤘' COLLATE Latin1_General_100_CI_AS_SC)) AS EmojiSC,
    -- Code point as seen with the _SC collation.
    UNICODE(N'🤘' COLLATE Latin1_General_100_CI_AS_SC) AS EmojiSCUnicode,
    -- Raw bytes of the literal as stored.
    CAST(N'🤘' AS varbinary) AS EmojiBinary,
    -- Raw bytes after a UNICODE()/NCHAR() round trip under the database default collation.
    CAST(NCHAR(UNICODE(N'🤘')) AS varbinary) AS EmojiConvBinary,
    -- Code point as seen under the database default collation.
    UNICODE(N'🤘') AS EmojiUnicode
as run against a database using Latin1_General_CI_AS
EmojiSC EmojiSCUnicode EmojiBinary EmojiConvBinary EmojiUnicode
NULL 129304 0x3ED818DD 0x3ED8 55358
versus a database set to Latin1_General_100_CI_AI_SC
EmojiSC EmojiSCUnicode EmojiBinary EmojiConvBinary EmojiUnicode
🤘 129304 0x3ED818DD 0x3ED818DD 129304
Why do you see "ðŸ¤˜"?
The UTF-8 encoding of U+1F918 is 0xF0 0x9F 0xA4 0x98, and the characters ð, Ÿ, ¤ and ˜ are the result of interpreting each of these four bytes as a separate single-byte ANSI (Windows-1252) character.
Why do you see "�"?
The character � is the Unicode REPLACEMENT CHARACTER and is
used to replace an unknown, unrecognized or unrepresentable character
and that's because U+D83E is not a valid Unicode code point on its own: it is the high surrogate, i.e. the first 16-bit word of U+1F918 encoded as UTF-16 (0xD83E 0xDD18).
Check what is stored, not what is displayed
Displaying Unicode data can be tricky, and the most efficient way to find out what's going on under the hood is to look at the bytes. In TSQL, use cast(... as varbinary)
to analyze where Unicode data manipulation goes wrong.
回答2:
I made a solution and am posting it so others don't spend the day doing the same.
select ab_test.dbo.GetEmojisInString('👌💖🤷â€â™‚ï¸ðŸ˜ŽðŸ±â€ðŸ’»ðŸ˜‰â¤ðŸ±â€ðŸ‘¤ðŸ¤žðŸ¤£ðŸ‘💕✌ðŸ±â€ðŸðŸ’‹ðŸŽ‚🎉🤦â€â™‚ï¸ðŸ˜ŠðŸŒ¹ðŸ‘ðŸ±â€ðŸ‰ðŸŽ¶ðŸ˜ðŸ¤¦â€â™€ï¸ðŸ˜ðŸ™ŒðŸ±â€ðŸš€ðŸ˜œðŸ˜˜ðŸ±â€ðŸ‘“😢😒🤳😂')
will return
👌💖🤷♂️😎🐱💻😉❤🐱👤🤞🤣👏💕✌🐱🏍💋🎂🎉🤦♂️😊🌹👍🐱🐉🎶😍🤦♀️😁🙌🐱🚀😜😘🐱👓😢😒🤳😂
There are 5 functions below. They are probably not perfect, and there may be shorter or better ways, but this works. If you find any bugs, let me know.
NOTE: I had to split this over two databases because, for this to work, the database collation needs to be a supplementary-character (_SC) collation. I could not change the collation of the bi_library database used in my solution because the database was locked, so for now I just created an ab_test database.
USE [bi_library]
GO
-- Converts a string of digits written in base @p_from_base to its integer value.
--
-- Parameters:
--   @p_in_value  : digit string, most significant digit first. Digits are 0-9
--                  followed by A-Z (upper or lower case); surrounding spaces
--                  are ignored.
--   @p_from_base : base of @p_in_value, e.g. 16 for hex, 8 for octal, 2 for binary.
--                  Bases 2 through 36 are supported.
-- Returns:
--   The value as int; 0 for an empty string; NULL when either argument is NULL,
--   the base is outside 2..36, or a character is not a valid digit in that base.
--   A value that does not fit in int raises an arithmetic overflow error.
CREATE FUNCTION [dbo].[GetDecimalFromOtherBase]
(
    @p_in_value  varchar(100),
    @p_from_base int -- ie 16 for hex, 8 for octal, 2 for bin
)
RETURNS int
AS
BEGIN
    IF @p_in_value IS NULL OR @p_from_base IS NULL OR @p_from_base NOT BETWEEN 2 AND 36
        RETURN NULL;

    DECLARE @l_digits varchar(100) = LTRIM(RTRIM(@p_in_value));
    DECLARE @l_length int = LEN(@l_digits);
    DECLARE @l_pos    int = 1;
    DECLARE @l_code   int;
    DECLARE @l_val    int;
    DECLARE @l_total  int = 0;

    -- Horner's rule, left to right: integer arithmetic only, no POWER() and no string reversal.
    WHILE @l_pos <= @l_length
    BEGIN
        SET @l_code = ASCII(SUBSTRING(@l_digits, @l_pos, 1));

        -- Map the character to its digit value; anything else is invalid.
        SET @l_val = CASE
                         WHEN @l_code BETWEEN 48 AND 57  THEN @l_code - 48 -- '0'..'9'
                         WHEN @l_code BETWEEN 65 AND 90  THEN @l_code - 55 -- 'A'..'Z' -> 10..35
                         WHEN @l_code BETWEEN 97 AND 122 THEN @l_code - 87 -- 'a'..'z' -> 10..35
                         ELSE NULL
                     END;

        -- Reject characters that are not digits of the requested base (e.g. '2' in binary).
        IF @l_val IS NULL OR @l_val >= @p_from_base
            RETURN NULL;

        SET @l_total = @l_total * @p_from_base + @l_val;
        SET @l_pos = @l_pos + 1;
    END

    RETURN @l_total;
END
GO
-- Converts a non-negative integer to its textual representation in base @p_to_base.
--
-- Parameters:
--   @p_in_value : the value to convert; must be >= 0.
--   @p_to_base  : target base, e.g. 16 for hex, 8 for octal, 2 for binary.
--                 Bases 2 through 36 are supported (digits 0-9 then A-Z).
-- Returns:
--   The digits, most significant first, with no leading zeros ('0' for zero).
--   NULL when either argument is NULL, the value is negative, or the base is
--   outside 2..36 (base 1 would never terminate and base 0 would divide by zero).
CREATE FUNCTION [dbo].[GetOtherBaseFromDecimal]
(
    @p_in_value int,
    @p_to_base  int -- ie 16 for hex, 8 for octal, 2 for bin
)
RETURNS varchar(100)
AS
BEGIN
    IF @p_in_value IS NULL OR @p_in_value < 0
       OR @p_to_base IS NULL OR @p_to_base NOT BETWEEN 2 AND 36
        RETURN NULL;

    -- Zero has no non-zero digits, so the loop below would produce an empty string.
    IF @p_in_value = 0
        RETURN '0';

    DECLARE @l_remaining int = @p_in_value;
    DECLARE @l_result    varchar(100) = '';

    -- Peel off the least significant digit each round and prepend it,
    -- so the result is built in reading order without a final REVERSE().
    WHILE @l_remaining > 0
    BEGIN
        SET @l_result = SUBSTRING('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                                  @l_remaining % @p_to_base + 1,
                                  1)
                        + @l_result;
        SET @l_remaining = @l_remaining / @p_to_base;
    END

    RETURN @l_result;
END
GO
-- Re-expresses a digit string from one base in another by going through its
-- integer value: @p_in_value is read as base @p_in_base and written out in
-- base @p_to_base.
CREATE FUNCTION [dbo].[GetBaseFromOtherBase]
(
    @p_in_value varchar(100),
    @p_in_base  bigint, -- ie 16 for hex, 8 for octal, 2 for bin
    @p_to_base  bigint  -- ie 16 for hex, 8 for octal, 2 for bin
)
RETURNS varchar(100)
AS
BEGIN
    -- Step 1: source digits -> integer value.
    DECLARE @l_decimal int;
    SET @l_decimal = bi_library.dbo.GetDecimalFromOtherBase(@p_in_value, @p_in_base);

    -- Step 2: integer value -> digits in the target base.
    RETURN bi_library.dbo.GetOtherBaseFromDecimal(@l_decimal, @p_to_base);
END
GO
USE [ab_test]
GO
-- Repairs text whose UTF-8 bytes were each decoded as a separate single-byte
-- character (for example the four characters 'ð', 'Ÿ', '¤', '˜' that stand for
-- the bytes F0 9F A4 98, i.e. U+1F918). Despite the name, every well-formed
-- 2-, 3- or 4-byte UTF-8 sequence is decoded, not only emojis.
--
-- Parameters:
--   @p_in_string : the mis-decoded text.
-- Returns:
--   The text with each well-formed multi-byte sequence replaced by the character
--   it encodes; NULL for NULL input. Characters that do not start a well-formed
--   sequence (plain ASCII, a lone accented letter, a truncated sequence) are
--   copied through unchanged rather than dropped.
--
-- NOTE(review): the byte values are recovered by converting the input to varchar
-- and calling ASCII(), so this relies on the database default collation using the
-- same single-byte code page the text was mis-decoded with (presumably
-- Windows-1252) - confirm for the target database. Characters that do not exist
-- in that code page are lost by the varchar conversion.
ALTER FUNCTION [dbo].[GetEmojisInString] (@p_in_string nvarchar(max))
RETURNS nvarchar(max)
AS
BEGIN
    IF @p_in_string IS NULL
        RETURN NULL;

    -- One varchar character per original UTF-8 byte; varchar(max) so long input is not truncated.
    DECLARE @l_string     varchar(max) = @p_in_string;
    -- LEN() ignores trailing spaces; the appended sentinel keeps them in the count.
    DECLARE @l_length     int = LEN(@l_string + '.') - 1;
    DECLARE @l_pos        int = 1;
    DECLARE @l_offset     int;
    DECLARE @l_lead       int;  -- value of the byte at @l_pos
    DECLARE @l_byte       int;  -- value of a continuation byte
    DECLARE @l_seq_len    int;  -- sequence length announced by the lead byte
    DECLARE @l_code_point int;  -- decoded code point, NULL when no valid sequence starts here
    DECLARE @l_str        nvarchar(max) = N'';

    WHILE @l_pos <= @l_length
    BEGIN
        SET @l_lead = ASCII(SUBSTRING(@l_string, @l_pos, 1));

        -- The lead byte alone determines how many bytes the sequence has.
        SET @l_seq_len = CASE
                             WHEN @l_lead BETWEEN 194 AND 223 THEN 2 -- 0xC2..0xDF
                             WHEN @l_lead BETWEEN 224 AND 239 THEN 3 -- 0xE0..0xEF
                             WHEN @l_lead BETWEEN 240 AND 244 THEN 4 -- 0xF0..0xF4
                             ELSE 1
                         END;
        SET @l_code_point = NULL;

        IF @l_seq_len > 1 AND @l_pos + @l_seq_len - 1 <= @l_length
        BEGIN
            -- Payload bits of the lead byte: 5, 4 or 3 bits for 2-, 3-, 4-byte sequences.
            SET @l_code_point = @l_lead & CASE @l_seq_len WHEN 2 THEN 31 WHEN 3 THEN 15 ELSE 7 END;

            SET @l_offset = 1;
            WHILE @l_offset < @l_seq_len AND @l_code_point IS NOT NULL
            BEGIN
                SET @l_byte = ASCII(SUBSTRING(@l_string, @l_pos + @l_offset, 1));

                -- Continuation bytes look like 10xxxxxx and carry 6 payload bits each.
                IF (@l_byte & 192) = 128
                    SET @l_code_point = @l_code_point * 64 + (@l_byte & 63);
                ELSE
                    SET @l_code_point = NULL;

                SET @l_offset = @l_offset + 1;
            END

            -- Reject overlong encodings, encoded surrogates and values past U+10FFFF.
            IF (@l_seq_len = 3 AND (@l_code_point < 2048 OR @l_code_point BETWEEN 55296 AND 57343))
               OR (@l_seq_len = 4 AND (@l_code_point < 65536 OR @l_code_point > 1114111))
                SET @l_code_point = NULL;
        END

        IF @l_code_point IS NULL
        BEGIN
            -- Not the start of a well-formed sequence: keep the character as it is.
            SET @l_str = @l_str + SUBSTRING(@l_string, @l_pos, 1);
            SET @l_pos = @l_pos + 1;
        END
        ELSE
        BEGIN
            IF @l_code_point >= 65536
                -- Emit the UTF-16 surrogate pair explicitly: NCHAR() returns NULL for values
                -- above 65535 unless the database collation is a supplementary-character one.
                SET @l_str = @l_str
                             + NCHAR(55296 + (@l_code_point - 65536) / 1024)
                             + NCHAR(56320 + (@l_code_point - 65536) % 1024);
            ELSE
                SET @l_str = @l_str + NCHAR(@l_code_point);

            SET @l_pos = @l_pos + @l_seq_len;
        END
    END

    RETURN @l_str;
END
GO
-- Reports whether dbo.GetEmojisInString would change @p_in_string.
--
-- Parameters:
--   @p_in_string : the text to inspect.
-- Returns:
--   1 when the converted text differs from the input, otherwise 0
--   (including for NULL input, where the comparison is unknown).
CREATE FUNCTION [dbo].[HasEmojisInString] (@p_in_string nvarchar(max))
RETURNS int
AS
BEGIN
    -- nvarchar(max) so the converted text is neither truncated nor pushed back through
    -- a varchar code page; either would make unchanged input look different.
    DECLARE @l_string_emojified nvarchar(max) = dbo.GetEmojisInString(@p_in_string);

    -- Binary collation: compare code units exactly instead of using linguistic
    -- rules (case, accents, ignorable characters) of the database collation.
    IF @l_string_emojified <> @p_in_string COLLATE Latin1_General_100_BIN2
        RETURN 1;

    RETURN 0;
END
GO
来源:https://stackoverflow.com/questions/58551286/copying-emojis-in-text-from-mysql-to-sql-server