How to display a page in UTF-8, despite windows-1251 in the HTTP header

    I have an old site on Narod.ru , and recently I posted several articles there - as I do now in UTF-8. The encoding was indicated in the tag meta, but looking at the pages, I saw a crack: “ Что-то случилось.” It turns out that People.Ru sends an HTTP header Content-Type: text/html; charset=windows-1251and it doesn’t turn off on it. The user can get readable text - only if he guesses to manually switch the encoding in the browser.

    What to do? Switch to another hosting? Of course, but until the hands reached, I wanted to achieve a result here. Recode texts? It seemed more worthy and interesting to put Javascript-patch.

    I did not find a way to switch the encoding from Javascript. There was an option to recode the text with a script that is run on a onreadydocument event .

    So, the browser receives the text in UTF-8, splits the UTF sequences into groups of 8 bits and treats them as character codes encoded in Windows-1251. To restore the readability of the text, you need to get these codes, combine them in UTF-sequences, and from them - to restore Unicode-codes of characters and return the latter through numeric HTML links to characters. In this case, several quotes were discovered.



    First, reading the text from the property innerHTML, we find the HTML entity "  " in place of an inextricable space (0xA0) . You need to replace it back with 0xA0.

    Secondly, the function charCodeAtreturns the character code in Unicode, and not in Windows-1251, so you need to convert the first to the second. Thirdly, the character with the code0x98in Windows-1251 no, so this function returns for it undefined, this must be provided.

    Fourth, Internet Explorer and Safari do not allow you to change the title of a document through the DOM, only through the corresponding property of the document - but you cannot write HTML numeric links there. For this case, you can translate Unicode codes into a hexadecimal number system, write them in the form of "% code" and pass through the function unescape.

    The resulting code is as follows:

    bindReady(
    	function(){
    		var Win1251 =
    			{
    				0x0:	0x0,
    				0x1:	0x1,
    				0x2:	0x2,
    				0x3:	0x3,
    				0x4:	0x4,
    				0x5:	0x5,
    				0x6:	0x6,
    				0x7:	0x7,
    				0x8:	0x8,
    				0x9:	0x9,
    				0xA:	0xA,
    				0xB:	0xB,
    				0xC:	0xC,
    				0xD:	0xD,
    				0xE:	0xE,
    				0xF:	0xF,
    				0x10:	0x10,
    				0x11:	0x11,
    				0x12:	0x12,
    				0x13:	0x13,
    				0x14:	0x14,
    				0x15:	0x15,
    				0x16:	0x16,
    				0x17:	0x17,
    				0x18:	0x18,
    				0x19:	0x19,
    				0x1A:	0x1A,
    				0x1B:	0x1B,
    				0x1C:	0x1C,
    				0x1D:	0x1D,
    				0x1E:	0x1E,
    				0x1F:	0x1F,
    				0x20:	0x20,
    				0x21:	0x21,
    				0x22:	0x22,
    				0x23:	0x23,
    				0x24:	0x24,
    				0x25:	0x25,
    				0x26:	0x26,
    				0x27:	0x27,
    				0x28:	0x28,
    				0x29:	0x29,
    				0x2A:	0x2A,
    				0x2B:	0x2B,
    				0x2C:	0x2C,
    				0x2D:	0x2D,
    				0x2E:	0x2E,
    				0x2F:	0x2F,
    				0x30:	0x30,
    				0x31:	0x31,
    				0x32:	0x32,
    				0x33:	0x33,
    				0x34:	0x34,
    				0x35:	0x35,
    				0x36:	0x36,
    				0x37:	0x37,
    				0x38:	0x38,
    				0x39:	0x39,
    				0x3A:	0x3A,
    				0x3B:	0x3B,
    				0x3C:	0x3C,
    				0x3D:	0x3D,
    				0x3E:	0x3E,
    				0x3F:	0x3F,
    				0x40:	0x40,
    				0x41:	0x41,
    				0x42:	0x42,
    				0x43:	0x43,
    				0x44:	0x44,
    				0x45:	0x45,
    				0x46:	0x46,
    				0x47:	0x47,
    				0x48:	0x48,
    				0x49:	0x49,
    				0x4A:	0x4A,
    				0x4B:	0x4B,
    				0x4C:	0x4C,
    				0x4D:	0x4D,
    				0x4E:	0x4E,
    				0x4F:	0x4F,
    				0x50:	0x50,
    				0x51:	0x51,
    				0x52:	0x52,
    				0x53:	0x53,
    				0x54:	0x54,
    				0x55:	0x55,
    				0x56:	0x56,
    				0x57:	0x57,
    				0x58:	0x58,
    				0x59:	0x59,
    				0x5A:	0x5A,
    				0x5B:	0x5B,
    				0x5C:	0x5C,
    				0x5D:	0x5D,
    				0x5E:	0x5E,
    				0x5F:	0x5F,
    				0x60:	0x60,
    				0x61:	0x61,
    				0x62:	0x62,
    				0x63:	0x63,
    				0x64:	0x64,
    				0x65:	0x65,
    				0x66:	0x66,
    				0x67:	0x67,
    				0x68:	0x68,
    				0x69:	0x69,
    				0x6A:	0x6A,
    				0x6B:	0x6B,
    				0x6C:	0x6C,
    				0x6D:	0x6D,
    				0x6E:	0x6E,
    				0x6F:	0x6F,
    				0x70:	0x70,
    				0x71:	0x71,
    				0x72:	0x72,
    				0x73:	0x73,
    				0x74:	0x74,
    				0x75:	0x75,
    				0x76:	0x76,
    				0x77:	0x77,
    				0x78:	0x78,
    				0x79:	0x79,
    				0x7A:	0x7A,
    				0x7B:	0x7B,
    				0x7C:	0x7C,
    				0x7D:	0x7D,
    				0x7E:	0x7E,
    				0x7F:	0x7F,
    				0x402:	0x80,
    				0x403:	0x81,
    				0x201A:	0x82,
    				0x453:	0x83,
    				0x201E:	0x84,
    				0x2026:	0x85,
    				0x2020:	0x86,
    				0x2021:	0x87,
    				0x20AC:	0x88,
    				0x2030:	0x89,
    				0x409:	0x8A,
    				0x2039:	0x8B,
    				0x40A:	0x8C,
    				0x40C:	0x8D,
    				0x40B:	0x8E,
    				0x40F:	0x8F,
    				0x452:	0x90,
    				0x2018:	0x91,
    				0x2019:	0x92,
    				0x201C:	0x93,
    				0x201D:	0x94,
    				0x2022:	0x95,
    				0x2013:	0x96,
    				0x2014:	0x97,
    				0x2122:	0x99,
    				0x459:	0x9A,
    				0x203A:	0x9B,
    				0x45A:	0x9C,
    				0x45C:	0x9D,
    				0x45B:	0x9E,
    				0x45F:	0x9F,
    				0xA0:	0xA0,
    				0x40E:	0xA1,
    				0x45E:	0xA2,
    				0x408:	0xA3,
    				0xA4:	0xA4,
    				0x490:	0xA5,
    				0xA6:	0xA6,
    				0xA7:	0xA7,
    				0x401:	0xA8,
    				0xA9:	0xA9,
    				0x404:	0xAA,
    				0xAB:	0xAB,
    				0xAC:	0xAC,
    				0xAD:	0xAD,
    				0xAE:	0xAE,
    				0x407:	0xAF,
    				0xB0:	0xB0,
    				0xB1:	0xB1,
    				0x406:	0xB2,
    				0x456:	0xB3,
    				0x491:	0xB4,
    				0xB5:	0xB5,
    				0xB6:	0xB6,
    				0xB7:	0xB7,
    				0x451:	0xB8,
    				0x2116:	0xB9,
    				0x454:	0xBA,
    				0xBB:	0xBB,
    				0x458:	0xBC,
    				0x405:	0xBD,
    				0x455:	0xBE,
    				0x457:	0xBF,
    				0x410:	0xC0,
    				0x411:	0xC1,
    				0x412:	0xC2,
    				0x413:	0xC3,
    				0x414:	0xC4,
    				0x415:	0xC5,
    				0x416:	0xC6,
    				0x417:	0xC7,
    				0x418:	0xC8,
    				0x419:	0xC9,
    				0x41A:	0xCA,
    				0x41B:	0xCB,
    				0x41C:	0xCC,
    				0x41D:	0xCD,
    				0x41E:	0xCE,
    				0x41F:	0xCF,
    				0x420:	0xD0,
    				0x421:	0xD1,
    				0x422:	0xD2,
    				0x423:	0xD3,
    				0x424:	0xD4,
    				0x425:	0xD5,
    				0x426:	0xD6,
    				0x427:	0xD7,
    				0x428:	0xD8,
    				0x429:	0xD9,
    				0x42A:	0xDA,
    				0x42B:	0xDB,
    				0x42C:	0xDC,
    				0x42D:	0xDD,
    				0x42E:	0xDE,
    				0x42F:	0xDF,
    				0x430:	0xE0,
    				0x431:	0xE1,
    				0x432:	0xE2,
    				0x433:	0xE3,
    				0x434:	0xE4,
    				0x435:	0xE5,
    				0x436:	0xE6,
    				0x437:	0xE7,
    				0x438:	0xE8,
    				0x439:	0xE9,
    				0x43A:	0xEA,
    				0x43B:	0xEB,
    				0x43C:	0xEC,
    				0x43D:	0xED,
    				0x43E:	0xEE,
    				0x43F:	0xEF,
    				0x440:	0xF0,
    				0x441:	0xF1,
    				0x442:	0xF2,
    				0x443:	0xF3,
    				0x444:	0xF4,
    				0x445:	0xF5,
    				0x446:	0xF6,
    				0x447:	0xF7,
    				0x448:	0xF8,
    				0x449:	0xF9,
    				0x44A:	0xFA,
    				0x44B:	0xFB,
    				0x44C:	0xFC,
    				0x44D:	0xFD,
    				0x44E:	0xFE,
    				0x44F:	0xFF
    			}
    		String.prototype.Win1251_charCodeAt=function(char_num){
    			var char_code=this.charCodeAt(char_num);
    			return (char_code===undefined)?0x98:Win1251[char_code];
    		}
    		function utf8_decode(text){
    			text=text.replace(/ /g,"\u00A0");
    			var char_code, char_code2, char_code3, char_code4;
    			var result_str='';
    			for(var char_num=0; char_num=0xC0)
    					if(char_code<0xE0){
    						if(
    							(char_code2=text.Win1251_charCodeAt(++char_num))>=0x80 &&
    							char_code2<0xC0
    						)//110yyyyy 10zzzzzz - 00000000 00000000 00000yyy yyzzzzzz
    							result_str+="&#"+((char_code-0xC0)*0x40+(char_code2-0x80))+";";
    					}
    					else if(char_code<0xF0){
    						if(
    							(char_code2=text.Win1251_charCodeAt(++char_num))>=0x80 &&
    							char_code2<0xC0 &&
    							(char_code3=text.Win1251_charCodeAt(++char_num))>=0x80 &&
    							char_code3<0xC0
    						)//1110xxxx 10yyyyyy 10zzzzzz - 00000000 00000000 xxxxyyyy yyzzzzzz
    							result_str+="&#"+((char_code-0xE0)*0x1000+(char_code2-0x80)*0x40+(char_code3-0x80))+";";
    					}
    					else if(
    						char_code<0xF8 &&
    						(char_code2=text.Win1251_charCodeAt(++char_num))>=0x80 &&
    						char_code2<0xC0 &&
    						(char_code3=text.Win1251_charCodeAt(++char_num))>=0x80 &&
    						char_code3<0xC0 &&
    						(char_code4=text.Win1251_charCodeAt(++char_num))>=0x80 && char_code4<0xC0
    					)//11110www 10xxxxxx 10yyyyyy 10zzzzzz - 00000000 000wwwxx xxxxyyyy yyzzzzzz
    						result_str+="&#"+((char_code-0xF0)*0x40000+(char_code2-0x80)*0x1000+(char_code3-0x80)*0x40+(char_code4-0x80))+";";
    		    return result_str;
    		}
    		function unescapeTitle(title){
    			return unescape(
    				utf8_decode(document.title).replace(
    					/&#([0-9]+);/g,
    					function(expression, value){
    						if(isNaN(value=parseInt(value, 10)))
    							return NaN;
    						var i=0, retval="", radix=16;
    						while(i++<4 || value>0){
    							retval=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F"][value%radix]+retval;	
    							value=Math.floor(value/radix);
    						}
    						return "%u"+retval;
    					}
    				)
    			);
    		}
    		document.body.innerHTML=utf8_decode(document.body.innerHTML);
    		if("vendor" in navigator && navigator.vendor.indexOf("Apple")>-1){
    			document.title=unescapeTitle(document.title);
    			return;
    		}
    		/*@cc_on
    		document.title=unescapeTitle(document.title);
    		return;
    		@*/
    		document.getElementsByTagName("title")[0].innerHTML=utf8_decode(document.title);
    	}
    );
    


    Tested in Firefox 3 and 4; Opera 9, 10 and 11; Internet Explorer 5.5, 6, 7, 8; Google Chrome and Safari latest release versions.

    Of course, this is a kunstyuk; I figured out what UTF-8 is on it. In a good way, the server should not harm its HTTP headers. But here the philosophical question arises: who better to know the encoding of the document - the server (the .htaccess author) or the HTML document itself (its author)? Perhaps browsers have a good reason to believe the server, not the metatag in the document.


    Also popular now: