2020-04-03 23:54:48 +00:00
|
|
|
import sniffHTMLEncoding = require('html-encoding-sniffer');
|
|
|
|
|
import whatwgEncoding = require('whatwg-encoding');
|
|
|
|
|
import MIMEType = require('whatwg-mimetype');
|
|
|
|
|
|
|
|
|
|
// https://github.com/jsdom/jsdom/blob/59fa79518da02dc2f098e989cfae3bdb24449f66/lib/api.js#L290-L310
|
|
|
|
|
/**
 * Turns the supplied markup into a decoded string and reports which
 * encoding label was used.
 *
 * Binary input (any ArrayBuffer view, a plain ArrayBuffer, or a Buffer) is
 * wrapped in a Buffer, its encoding is sniffed, and the bytes are decoded
 * with that encoding. Anything else is stringified and reported as 'UTF-8'.
 *
 * @param html - Markup as text or raw bytes; defaults to an empty string.
 * @param mimeType - Content type of the document. It is only consulted for
 *   binary input: `isXML()` picks the fallback encoding and the `charset`
 *   parameter is passed to the sniffer as the transport-layer label.
 * @returns The decoded markup together with the encoding label.
 */
function normalizeHTML(
  html: string | ArrayBufferView | ArrayBuffer | Buffer = '',
  mimeType: MIMEType,
): {
  html: string;
  encoding: string;
} {
  // Funnel every binary flavour into a Buffer without touching the parameter.
  let source;
  if (ArrayBuffer.isView(html)) {
    // Wrap exactly the window the view exposes, not its whole backing store.
    source = Buffer.from(html.buffer, html.byteOffset, html.byteLength);
  } else if (html instanceof ArrayBuffer) {
    source = Buffer.from(html);
  } else {
    source = html;
  }

  // Non-binary input needs no sniffing.
  if (!Buffer.isBuffer(source)) {
    return { html: String(source), encoding: 'UTF-8' };
  }

  const fallbackEncoding = mimeType.isXML() ? 'UTF-8' : 'windows-1252';
  const charsetLabel = mimeType.parameters.get('charset');
  const encoding = sniffHTMLEncoding(source, {
    defaultEncoding: fallbackEncoding,
    transportLayerEncodingLabel: charsetLabel,
  });

  return { html: whatwgEncoding.decode(source, encoding), encoding };
}
|