T-SQL-Algorithmus zum Codieren unsicherer HTML-Zeichen als HTML-Zeichenentitätsreferenzen

Kodierung von fünf Sonderzeichen mit rekursivem CTE:

DECLARE 
  @unsafe NVARCHAR(MAX),
  @safe   NVARCHAR(MAX) 

--
-- Create the unsafe html string
-- 
SET @unsafe = N'html''s encoding "method" is <= or >= & 1234 ' + NCHAR(129) 
--
-- Use a recursive CTE to iterate through each character in the string
-- 
;WITH cte AS 
(
  -- 
  -- The first row will contain the original
  -- string, an empty string to be used to 
  -- build the "safe" string, and a position
  -- column to mark the character position
  -- of the loop
  -- 
  SELECT 
    @unsafe AS unsafe_html,
    CONVERT(NVARCHAR(MAX), '') AS safe_html,
    1 AS pos
  WHERE @unsafe IS NOT NULL AND LEN(@unsafe) > 0 
  UNION ALL
  -- 
  -- Create a loop: 
  -- The anchor row starts at position one.
  -- Increment the position by one for each pass.
  -- Stop when the position value is equal to the string lenth.
  -- Evaluate the character in each string
  -- If the ASCII value > 128, use the &# format.
  -- Otherwise, check for 5 special characters: " & ' < >
  -- Use the encoding reference or just the original character
  --
  SELECT 
    @unsafe AS unsafe_html,
    CONVERT(NVARCHAR(MAX), safe_html + 
    CASE WHEN UNICODE(SUBSTRING(unsafe_html, pos, 1)) > 128 
         THEN '&#' + CONVERT(NVARCHAR(10), UNICODE(SUBSTRING(unsafe_html, pos, 1)))  
         ELSE CASE SUBSTRING(unsafe_html, pos, 1)
              WHEN '"'  THEN '&quot' 
              WHEN '&'  THEN '&amp'
              WHEN '''' THEN '&apos'
              WHEN '<'  THEN '&lt'
              WHEN '>'  THEN '&gt'
              ELSE SUBSTRING(unsafe_html, pos, 1)
              END 
         END ) AS safe_html,
    pos + 1 AS pos
  FROM cte
  WHERE pos <= LEN(@unsafe)
) 
--
-- Each pass through the string creates a row in the CTE
-- The last row will have the position value of the string length + 1
-- Use that row as the safe html string
-- SQL Server allows a max recursion of 32767
-- 
SELECT @safe = (
  SELECT safe_html 
  FROM cte
  WHERE pos = LEN(@unsafe) + 1
) 
OPTION (MAXRECURSION 32767) 

SELECT @safe

-- html&aposs encoding &quotmethod&quot is &lt= or &gt= &amp 1234 &#129

Erste Version:

DECLARE @s NVARCHAR(100)

SET @s = '<html>unsafe & safe<html>'
SELECT @s 
SELECT (SELECT @s FOR XML PATH(''))

---------------------------------------
<html>unsafe & safe<html>

-----------------------------------------
&lt;html&gt;unsafe &amp; safe&lt;html&gt;

Vollständige Codierung mit allen offiziellen Referenzen:

DECLARE 
    @unsafe  NVARCHAR(MAX),
    @safe NVARCHAR(MAX) 

-- Build string with first 10,000 unicode chars
SELECT @unsafe = COALESCE(@unsafe, '') + NCHAR(number) + ' ' 
FROM (
    SELECT TOP 10000 ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS number
    FROM sys.all_objects s1 CROSS JOIN sys.all_objects s2 
) t

-- Build table variable with character entity references defined in HTML 4.0
-- Reference: https://www.htmlcodetutorial.com/characterentities_famsupp_69.html
DECLARE @t TABLE (
    name NVARCHAR(25) NOT NULL, 
    unicode_val INT NOT NULL PRIMARY KEY 
) 

INSERT @t 
VALUES
('&quot', 34),
('&amp', 38),
('&apos', 39),
('&lt', 60),
('&gt', 62),
('&nbsp', 160),
('&iexcl', 161),
('&cent', 162),
('&pound', 163),
('&curren', 164),
('&yen', 165),
('&brvbar', 166),
('&sect', 167),
('&uml', 168),
('&copy', 169),
('&ordf', 170),
('&laquo', 171),
('&not', 172),
('&shy', 173),
('&reg', 174),
('&macr', 175),
('&deg', 176),
('&plusmn', 177),
('&sup2', 178),
('&sup3', 179),
('&acute', 180),
('&micro', 181),
('&para', 182),
('&middot', 183),
('&cedil', 184),
('&sup1', 185),
('&ordm', 186),
('&raquo', 187),
('&frac14', 188),
('&frac12', 189),
('&frac34', 190),
('&iquest', 191),
('&Agrave', 192),
('&Aacute', 193),
('&Acirc', 194),
('&Atilde', 195),
('&Auml', 196),
('&Aring', 197),
('&AElig', 198),
('&Ccedil', 199),
('&Egrave', 200),
('&Eacute', 201),
('&Ecirc', 202),
('&Euml', 203),
('&Igrave', 204),
('&Iacute', 205),
('&Icirc', 206),
('&Iuml', 207),
('&ETH', 208),
('&Ntilde', 209),
('&Ograve', 210),
('&Oacute', 211),
('&Ocirc', 212),
('&Otilde', 213),
('&Ouml', 214),
('&times', 215),
('&Oslash', 216),
('&Ugrave', 217),
('&Uacute', 218),
('&Ucirc', 219),
('&Uuml', 220),
('&Yacute', 221),
('&THORN', 222),
('&szlig', 223),
('&agrave', 224),
('&aacute', 225),
('&acirc', 226),
('&atilde', 227),
('&auml', 228),
('&aring', 229),
('&aelig', 230),
('&ccedil', 231),
('&egrave', 232),
('&eacute', 233),
('&ecirc', 234),
('&euml', 235),
('&igrave', 236),
('&iacute', 237),
('&icirc', 238),
('&iuml', 239),
('&eth', 240),
('&ntilde', 241),
('&ograve', 242),
('&oacute', 243),
('&ocirc', 244),
('&otilde', 245),
('&ouml', 246),
('&divide', 247),
('&oslash', 248),
('&ugrave', 249),
('&uacute', 250),
('&ucirc', 251),
('&uuml', 252),
('&yacute', 253),
('&thorn', 254),
('&yuml', 255),
('&OElig', 338),
('&oelig', 339),
('&Scaron', 352),
('&scaron', 353),
('&Yuml', 376),
('&fnof', 402),
('&circ', 710),
('&tilde', 732),
('&Alpha', 913),
('&Beta', 914),
('&Gamma', 915),
('&Delta', 916),
('&Epsilon', 917),
('&Zeta', 918),
('&Eta', 919),
('&Theta', 920),
('&Iota', 921),
('&Kappa', 922),
('&Lambda', 923),
('&Mu', 924),
('&Nu', 925),
('&Xi', 926),
('&Omicron', 927),
('&Pi', 928),
('&Rho', 929),
('&Sigma', 931),
('&Tau', 932),
('&Upsilon', 933),
('&Phi', 934),
('&Chi', 935),
('&Psi', 936),
('&Omega', 937),
('&alpha', 945),
('&beta', 946),
('&gamma', 947),
('&delta', 948),
('&epsilon', 949),
('&zeta', 950),
('&eta', 951),
('&theta', 952),
('&iota', 953),
('&kappa', 954),
('&lambda', 955),
('&mu', 956),
('&nu', 957),
('&xi', 958),
('&omicron', 959),
('&pi', 960),
('&rho', 961),
('&sigmaf', 962),
('&sigma', 963),
('&tau', 964),
('&upsilon', 965),
('&phi', 966),
('&chi', 967),
('&psi', 968),
('&omega', 969),
('&thetasym', 977),
('&upsih', 978),
('&piv', 982),
('&ensp', 8194),
('&emsp', 8195),
('&thinsp', 8201),
('&zwnj', 8204),
('&zwj', 8205),
('&lrm', 8206),
('&rlm', 8207),
('&ndash', 8211),
('&mdash', 8212),
('&lsquo', 8216),
('&rsquo', 8217),
('&sbquo', 8218),
('&ldquo', 8220),
('&rdquo', 8221),
('&bdquo', 8222),
('&dagger', 8224),
('&Dagger', 8225),
('&bull', 8226),
('&hellip', 8230),
('&permil', 8240),
('&prime', 8242),
('&Prime', 8243),
('&lsaquo', 8249),
('&rsaquo', 8250),
('&oline', 8254),
('&frasl', 8260),
('&euro', 8364),
('&image', 8465),
('&weierp', 8472),
('&real', 8476),
('&trade', 8482),
('&alefsym', 8501),
('&larr', 8592),
('&uarr', 8593),
('&rarr', 8594),
('&darr', 8595),
('&harr', 8596),
('&crarr', 8629),
('&lArr', 8656),
('&uArr', 8657),
('&rArr', 8658),
('&dArr', 8659),
('&hArr', 8660),
('&forall', 8704),
('&part', 8706),
('&exist', 8707),
('&empty', 8709),
('&nabla', 8711),
('&isin', 8712),
('&notin', 8713),
('&ni', 8715),
('&prod', 8719),
('&sum', 8721),
('&minus', 8722),
('&lowast', 8727),
('&radic', 8730),
('&prop', 8733),
('&infin', 8734),
('&ang', 8736),
('&and', 8743),
('&or', 8744),
('&cap', 8745),
('&cup', 8746),
('&int', 8747),
('&there4', 8756),
('&sim', 8764),
('&cong', 8773),
('&asymp', 8776),
('&ne', 8800),
('&equiv', 8801),
('&le', 8804),
('&ge', 8805),
('&sub', 8834),
('&sup', 8835),
('&nsub', 8836),
('&sube', 8838),
('&supe', 8839),
('&oplus', 8853),
('&otimes', 8855),
('&perp', 8869),
('&sdot', 8901),
('&lceil', 8968),
('&rceil', 8969),
('&lfloor', 8970),
('&rfloor', 8971),
('&lang', 9001),
('&rang', 9002),
('&loz', 9674),
('&spades', 9824),
('&clubs', 9827),
('&hearts', 9829),
('&diams', 9830)

-- Build numbers table to parse the string
DECLARE @numbers TABLE (number INT NOT NULL PRIMARY KEY) 
INSERT @numbers
SELECT TOP (LEN(@unsafe)) ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS number
FROM sys.all_objects s1 CROSS JOIN sys.all_objects s2

-- Use numbers table to parse each character.
-- If a match is found in character entity reference table,
-- then use the safe substitute. Otherwise, if the unicode
-- value is greater than 128, use &#<unicode char value>.
-- Finally, use the original character if nothing else
-- is a match
SELECT @safe = COALESCE(@safe,'') 
       + COALESCE(name, 
         CASE WHEN UNICODE(SUBSTRING(@unsafe, number, 1)) > 128 THEN '&#' 
          + CONVERT(NVARCHAR(10), UNICODE(SUBSTRING(@unsafe, number, 1))) 
          ELSE SUBSTRING(@unsafe, number, 1) END)
FROM @numbers 
LEFT OUTER JOIN @t  
  ON UNICODE(SUBSTRING(@unsafe, number, 1)) = unicode_val

SELECT @safe AS [safe]

Results:  
! &quot # $ % &amp &apos ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; 
&lt = &gt ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 
[ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { 
| } ~   &#129 &#130 &#131 &#132 &#133 &#134 &#135 &#136 &#137 &#138 
&#139 &#140 &#141 &#142 &#143 &#144 &#145 &#146 &#147 &#148 &#149 
&#150 &#151 &#152 &#153 &#154 &#155 &#156 &#157 &#158 &#159 &nbsp 
&iexcl &cent &pound &curren &yen &brvbar &sect &uml &copy &ordf 
&laquo &not &shy &reg &macr &deg &plusmn &sup2 &sup3 &acute &micro 
&para &middot &cedil &sup1 &ordm &raquo &frac14 &frac12 &frac34 
&iquest &Agrave &Aacute &Acirc &Atilde &Auml &Aring &AElig &Ccedil 
&Egrave &Eacute &Ecirc &Euml &Igrave &Iacute &Icirc &Iuml &ETH &Ntilde 
&Ograve &Oacute &Ocirc &Otilde &Ouml &times &Oslash &Ugrave &Uacute 
&Ucirc &Uuml &Yacute &THORN &szlig &agrave &aacute &acirc &atilde &auml 
&aring &aelig &ccedil &egrave &eacute &ecirc &euml &igrave &iacute &icirc 
&iuml &eth &ntilde &ograve &oacute &ocirc &otilde &ouml &divide &oslash 
&ugrave &uacute &ucirc &uuml &yacute &thorn &yuml &#256 &#257 &#258 &#259 
&#260 &#261 &#262 &#263 &#264 &#265 &#266...