/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ use std::borrow::ToOwned; use std::cmp::max; pub struct MIMEClassifier { image_classifier: GroupedClassifier, audio_video_classifer: GroupedClassifier, scriptable_classifier: GroupedClassifier, plaintext_classifier: GroupedClassifier, archive_classifer: GroupedClassifier, binary_or_plaintext: BinaryOrPlaintextClassifier, feeds_classifier: FeedsClassifier } impl MIMEClassifier { //Performs MIME Type Sniffing Algorithm (section 7) pub fn classify(&self, no_sniff: bool, check_for_apache_bug: bool, supplied_type: &Option<(String, String)>, data: &Vec) -> Option<(String, String)> { match *supplied_type{ None => { return self.sniff_unknown_type(!no_sniff, data); } Some((ref media_type, ref media_subtype)) => { match (&**media_type, &**media_subtype) { ("unknown", "unknown") | ("application", "unknown") | ("*", "*") => { return self.sniff_unknown_type(!no_sniff,data); } _ => { if no_sniff { return supplied_type.clone(); } if check_for_apache_bug { return self.sniff_text_or_data(data); } if MIMEClassifier::is_xml(media_type, media_subtype) { return supplied_type.clone(); } //Inplied in section 7.3, but flow is not clear if MIMEClassifier::is_html(media_type, media_subtype) { return self.feeds_classifier .classify(data) .or(supplied_type.clone()); } if &**media_type == "image" { if let Some(tp) = self.image_classifier.classify(data) { return Some(tp); } } match (&**media_type, &**media_subtype) { ("audio", _) | ("video", _) | ("application", "ogg") => { if let Some(tp) = self.audio_video_classifer.classify(data) { return Some(tp); } } _ => {} } } } } } return supplied_type.clone(); } pub fn new()->MIMEClassifier { MIMEClassifier{ image_classifier: GroupedClassifier::image_classifer(), audio_video_classifer: GroupedClassifier::audio_video_classifer(), scriptable_classifier: GroupedClassifier::scriptable_classifier(), plaintext_classifier: GroupedClassifier::plaintext_classifier(), archive_classifer: GroupedClassifier::archive_classifier(), binary_or_plaintext: BinaryOrPlaintextClassifier, feeds_classifier: FeedsClassifier } } //some sort of iterator over the classifiers might be better? fn sniff_unknown_type(&self, sniff_scriptable: bool, data: &Vec) -> Option<(String,String)> { if sniff_scriptable { self.scriptable_classifier.classify(data) } else { None }.or_else(|| self.plaintext_classifier.classify(data)) .or_else(|| self.image_classifier.classify(data)) .or_else(|| self.audio_video_classifer.classify(data)) .or_else(|| self.archive_classifer.classify(data)) .or_else(|| self.binary_or_plaintext.classify(data)) } fn sniff_text_or_data(&self, data: &Vec) -> Option<(String, String)> { self.binary_or_plaintext.classify(data) } fn is_xml(tp: &str, sub_tp: &str) -> bool { let suffix = &sub_tp[(max(sub_tp.len() as isize - "+xml".len() as isize, 0) as usize)..]; match (tp, sub_tp, suffix) { (_, _, "+xml") | ("application", "xml",_) | ("text", "xml",_) => {true} _ => {false} } } fn is_html(tp: &str, sub_tp: &str) -> bool { tp=="text" && sub_tp=="html" } } pub fn as_string_option(tup: Option<(&'static str, &'static str)>) -> Option<(String,String)> { tup.map(|(a, b)| (a.to_owned(), b.to_owned())) } //Interface used for composite types trait MIMEChecker { fn classify(&self, data: &Vec)->Option<(String, String)>; } trait Matches { fn matches(&mut self, matches: &[u8])->bool; } impl <'a, T: Iterator + Clone> Matches for T { // Matching function that works on an iterator. // see if the next matches.len() bytes in data_iterator equal matches // move iterator and return true or just return false // // Params // self: an iterator // matches: a vector of bytes to match // // Return // true if the next n elements of self match n elements of matches // false otherwise // // Side effects // moves the iterator when match is found fn matches(&mut self, matches: &[u8]) -> bool { for (byte_a, byte_b) in self.clone().take(matches.len()).zip(matches.iter()) { if byte_a != byte_b { return false; } } self.nth(matches.len()); true } } struct ByteMatcher { pattern: &'static [u8], mask: &'static [u8], leading_ignore: &'static [u8], content_type: (&'static str,&'static str) } impl ByteMatcher { fn matches(&self, data: &Vec) -> Option { if data.len() < self.pattern.len() { return None; } //TODO replace with iterators if I ever figure them out... let mut i: usize = 0; let max_i = data.len()-self.pattern.len(); loop { if !self.leading_ignore.iter().any(|x| *x == data[i]) { break; } i = i + 1; if i > max_i { return None; } } for j in 0..self.pattern.len() { if (data[i] & self.mask[j]) != (self.pattern[j] & self.mask[j]) { return None; } i = i + 1; } Some(i) } } impl MIMEChecker for ByteMatcher { fn classify(&self, data: &Vec) -> Option<(String, String)> { self.matches(data).map(|_| { (self.content_type.0.to_owned(), self.content_type.1.to_owned()) }) } } struct TagTerminatedByteMatcher { matcher: ByteMatcher } impl MIMEChecker for TagTerminatedByteMatcher { fn classify(&self, data: &Vec) -> Option<(String, String)> { let pattern = self.matcher.matches(data); let pattern_matches = pattern.map(|j| j < data.len() && (data[j] == b' ' || data[j] == b'>')); if pattern_matches.unwrap_or(false) { Some((self.matcher.content_type.0.to_owned(), self.matcher.content_type.1.to_owned())) } else { None } } } pub struct Mp4Matcher; impl Mp4Matcher { pub fn matches(&self,data: &Vec) -> bool { if data.len() < 12 { return false; } let box_size = ((data[0] as u32) << 3 | (data[1] as u32) << 2 | (data[2] as u32) << 1 | (data[3] as u32)) as usize; if (data.len() < box_size) || (box_size % 4 != 0) { return false; } //TODO replace with iterators let ftyp = [0x66, 0x74, 0x79, 0x70]; let mp4 = [0x6D, 0x70, 0x34]; for i in 4..8 { if data[i] != ftyp[i - 4] { return false; } } let mut all_match = true; for i in 8..11 { if data[i]!=mp4[i - 8] { all_match = false; break; } } if all_match { return true; } let mut bytes_read: usize = 16; while bytes_read < box_size { all_match = true; for i in 0..3 { if mp4[i] != data[i + bytes_read] { all_match = false; break; } } if all_match { return true; } bytes_read = bytes_read + 4; } false } } impl MIMEChecker for Mp4Matcher { fn classify(&self, data: &Vec) -> Option<(String, String)> { if self.matches(data) { Some(("video".to_owned(), "mp4".to_owned())) } else { None } } } struct BinaryOrPlaintextClassifier; impl BinaryOrPlaintextClassifier { fn classify_impl(&self, data: &Vec) -> Option<(&'static str, &'static str)> { if (data.len() >=2 && ((data[0] == 0xFFu8 && data[1] == 0xFEu8) || (data[0] == 0xFEu8 && data[1] == 0xFFu8))) || (data.len() >= 3 && data[0] == 0xEFu8 && data[1] == 0xBBu8 && data[2] == 0xBFu8) { Some(("text", "plain")) } else if data.len() >= 1 && data.iter().any(|&x| x <= 0x08u8 || x == 0x0Bu8 || (x >= 0x0Eu8 && x <= 0x1Au8) || (x >= 0x1Cu8 && x <= 0x1Fu8)) { Some(("application", "octet-stream")) } else { Some(("text", "plain")) } } } impl MIMEChecker for BinaryOrPlaintextClassifier { fn classify(&self, data: &Vec) -> Option<(String, String)> { return as_string_option(self.classify_impl(data)); } } struct GroupedClassifier { byte_matchers: Vec>, } impl GroupedClassifier { fn image_classifer() -> GroupedClassifier { GroupedClassifier { byte_matchers: vec![ box ByteMatcher::image_x_icon(), box ByteMatcher::image_x_icon_cursor(), box ByteMatcher::image_bmp(), box ByteMatcher::image_gif89a(), box ByteMatcher::image_gif87a(), box ByteMatcher::image_webp(), box ByteMatcher::image_png(), box ByteMatcher::image_jpeg(), ] } } fn audio_video_classifer() -> GroupedClassifier { GroupedClassifier{ byte_matchers: vec![ box ByteMatcher::video_webm(), box ByteMatcher::audio_basic(), box ByteMatcher::audio_aiff(), box ByteMatcher::audio_mpeg(), box ByteMatcher::application_ogg(), box ByteMatcher::audio_midi(), box ByteMatcher::video_avi(), box ByteMatcher::audio_wave(), box Mp4Matcher ] } } fn scriptable_classifier() -> GroupedClassifier { GroupedClassifier{ byte_matchers: vec![ box ByteMatcher::text_html_doctype(), box ByteMatcher::text_html_page(), box ByteMatcher::text_html_head(), box ByteMatcher::text_html_script(), box ByteMatcher::text_html_iframe(), box ByteMatcher::text_html_h1(), box ByteMatcher::text_html_div(), box ByteMatcher::text_html_font(), box ByteMatcher::text_html_table(), box ByteMatcher::text_html_a(), box ByteMatcher::text_html_style(), box ByteMatcher::text_html_title(), box ByteMatcher::text_html_b(), box ByteMatcher::text_html_body(), box ByteMatcher::text_html_br(), box ByteMatcher::text_html_p(), box ByteMatcher::text_html_comment(), box ByteMatcher::text_xml(), box ByteMatcher::application_pdf() ] } } fn plaintext_classifier() -> GroupedClassifier { GroupedClassifier{ byte_matchers: vec![ box ByteMatcher::text_plain_utf_8_bom(), box ByteMatcher::text_plain_utf_16le_bom(), box ByteMatcher::text_plain_utf_16be_bom(), box ByteMatcher::application_postscript() ] } } fn archive_classifier() -> GroupedClassifier { GroupedClassifier { byte_matchers: vec![ box ByteMatcher::application_x_gzip(), box ByteMatcher::application_zip(), box ByteMatcher::application_x_rar_compressed() ] } } // TODO: Use this in font context classifier #[allow(dead_code)] fn font_classifier() -> GroupedClassifier { GroupedClassifier { byte_matchers: vec![ box ByteMatcher::application_font_woff(), box ByteMatcher::true_type_collection(), box ByteMatcher::open_type(), box ByteMatcher::true_type(), box ByteMatcher::application_vnd_ms_font_object(), ] } } } impl MIMEChecker for GroupedClassifier { fn classify(&self,data: &Vec) -> Option<(String, String)> { self.byte_matchers .iter() .filter_map(|matcher| matcher.classify(data)) .next() } } struct FeedsClassifier; impl FeedsClassifier { fn classify_impl(&self,data: &Vec) -> Option<(&'static str,&'static str)> { let length = data.len(); let mut data_iterator = data.iter(); // acceptable byte sequences let utf8_bom = &[0xEFu8, 0xBBu8, 0xBFu8]; // can not be feed unless length is > 3 if length < 3 { return None; } // eat the first three bytes if they are equal to UTF-8 BOM data_iterator.matches(utf8_bom); // continuously search for next "<" until end of data_iterator // TODO: need max_bytes to prevent inadvertently examining html document // eg. an html page with a feed example while !data_iterator.find(|&data_iterator| *data_iterator == b'<').is_none() { if data_iterator.matches(b"?") { // eat until ?> while !data_iterator.matches(b"?>") { if data_iterator.next().is_none() { return None; } } } else if data_iterator.matches(b"!--") { // eat until --> while !data_iterator.matches(b"-->") { if data_iterator.next().is_none() { return None; } } } else if data_iterator.matches(b"!") { data_iterator.find(|&data_iterator| *data_iterator == b'>'); } else if data_iterator.matches(b"rss") { return Some(("application", "rss+xml")); } else if data_iterator.matches(b"feed") { return Some(("application", "atom+xml")); } else if data_iterator.matches(b"rdf: RDF") { while !data_iterator.next().is_none() { if data_iterator.matches(b"http: //purl.org/rss/1.0/") { while !data_iterator.next().is_none() { if data_iterator.matches(b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#") { return Some(("application", "rss+xml")); } } } else if data_iterator.matches(b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#") { while !data_iterator.next().is_none() { if data_iterator.matches(b"http: //purl.org/rss/1.0/") { return Some(("application", "rss+xml")); } } } } } } None } } impl MIMEChecker for FeedsClassifier { fn classify(&self,data: &Vec) -> Option<(String, String)> { as_string_option(self.classify_impl(data)) } } //Contains hard coded byte matchers //TODO: These should be configured and not hard coded impl ByteMatcher { //A Windows Icon signature fn image_x_icon()->ByteMatcher { ByteMatcher{ pattern: b"\x00\x00\x01\x00", mask: b"\xFF\xFF\xFF\xFF", content_type: ("image", "x-icon"), leading_ignore: &[] } } //A Windows Cursor signature. fn image_x_icon_cursor()->ByteMatcher { ByteMatcher{ pattern: b"\x00\x00\x02\x00", mask: b"\xFF\xFF\xFF\xFF", content_type: ("image", "x-icon"), leading_ignore: &[] } } //The string "BM", a BMP signature. fn image_bmp()->ByteMatcher { ByteMatcher{ pattern: b"BM", mask: b"\xFF\xFF", content_type: ("image", "bmp"), leading_ignore: &[] } } //The string "GIF89a", a GIF signature. fn image_gif89a()->ByteMatcher { ByteMatcher{ pattern: b"GIF89a", mask: b"\xFF\xFF\xFF\xFF\xFF\xFF", content_type: ("image", "gif"), leading_ignore: &[] } } //The string "GIF87a", a GIF signature. fn image_gif87a()->ByteMatcher { ByteMatcher{ pattern: b"GIF87a", mask: b"\xFF\xFF\xFF\xFF\xFF\xFF", content_type: ("image", "gif"), leading_ignore: &[] } } //The string "RIFF" followed by four bytes followed by the string "WEBPVP". fn image_webp()->ByteMatcher { ByteMatcher{ pattern: b"RIFF\x00\x00\x00\x00WEBPVP", mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00,\xFF\xFF\xFF\xFF\xFF\xFF", content_type: ("image", "webp"), leading_ignore: &[] } } //An error-checking byte followed by the string "PNG" followed by CR LF SUB LF, the PNG //signature. fn image_png()->ByteMatcher { ByteMatcher{ pattern: b"\x89PNG\r\n\x1A\n", mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", content_type: ("image", "png"), leading_ignore: &[] } } // The JPEG Start of Image marker followed by the indicator byte of another marker. fn image_jpeg()->ByteMatcher { ByteMatcher{ pattern: b"\xFF\xD8\xFF", mask: b"\xFF\xFF\xFF", content_type: ("image", "jpeg"), leading_ignore: &[] } } //The WebM signature. [TODO: Use more bytes?] fn video_webm()->ByteMatcher { ByteMatcher{ pattern: b"\x1A\x45\xDF\xA3", mask: b"\xFF\xFF\xFF\xFF", content_type: ("video", "webm"), leading_ignore: &[] } } //The string ".snd", the basic audio signature. fn audio_basic()->ByteMatcher { ByteMatcher{ pattern: b".snd", mask: b"\xFF\xFF\xFF\xFF", content_type: ("audio", "basic"), leading_ignore: &[] } } //The string "FORM" followed by four bytes followed by the string "AIFF", the AIFF signature. fn audio_aiff()->ByteMatcher { ByteMatcher{ pattern: b"FORM\x00\x00\x00\x00AIFF", mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", content_type: ("audio", "aiff"), leading_ignore: &[] } } //The string "ID3", the ID3v2-tagged MP3 signature. fn audio_mpeg()->ByteMatcher { ByteMatcher{ pattern: b"ID3", mask: b"\xFF\xFF\xFF", content_type: ("audio", "mpeg"), leading_ignore: &[] } } //The string "OggS" followed by NUL, the Ogg container signature. fn application_ogg()->ByteMatcher { ByteMatcher{ pattern: b"OggS", mask: b"\xFF\xFF\xFF\xFF\xFF", content_type: ("application", "ogg"), leading_ignore: &[] } } //The string "MThd" followed by four bytes representing the number 6 in 32 bits (big-endian), //the MIDI signature. fn audio_midi()->ByteMatcher { ByteMatcher{ pattern: b"MThd\x00\x00\x00\x06", mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", content_type: ("audio", "midi"), leading_ignore: &[] } } //The string "RIFF" followed by four bytes followed by the string "AVI ", the AVI signature. fn video_avi()->ByteMatcher { ByteMatcher{ pattern: b"RIFF\x00\x00\x00\x00AVI ", mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", content_type: ("video", "avi"), leading_ignore: &[] } } // The string "RIFF" followed by four bytes followed by the string "WAVE", the WAVE signature. fn audio_wave()->ByteMatcher { ByteMatcher{ pattern: b"RIFF\x00\x00\x00\x00WAVE", mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", content_type: ("audio", "wave"), leading_ignore: &[] } } // doctype terminated with Tag terminating (TT) Byte fn text_html_doctype()->TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher { pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"TagTerminatedByteMatcher { TagTerminatedByteMatcher { matcher: ByteMatcher{ pattern: b"