1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
//! Interfaces to the detector itself.

use std::sync::Mutex;
use std::ffi::{CString, CStr};
use std::str::from_utf8;
use std::default::Default;
use std::ptr::{null, null_mut};

use libc::{c_int, c_double, c_char};
use ffi::{CLDHints, Encoding, CLD2_ExtDetectLanguageSummary4,
          CLD2_DetectLanguageVersion};
use ffi::Language as LanguageId;

use language::LanguageIdExt;
use types::*;

// A process-wide lock guarding calls to `CLD2_DetectLanguageVersion`, which
// uses an internal buffer to format the version string.  In the code
// visible here the `u8` payload is never read or written; only the lock
// itself is used.
//
// TODO: Should we move this to the cld2-sys package, in case other
// packages want to synchronize with us?
//
// HACK: Yes, this is a Mutex inside lazy_static, which is just plain wrong.
// I just want to get this working on stable Rust with as little fuss as
// possible.
lazy_static! {
    static ref CLD2_VERSION_LOCK: Mutex<u8> = Mutex::new(0);
}

/// Return the version string reported by cld2 (library plus embedded data
/// files) as an owned `String`.
///
/// # Panics
///
/// Panics if cld2 returns a null pointer or if the returned bytes are not
/// valid UTF-8.
///
/// ```
/// use cld2::detector_version;
/// format!("cld2 version: {}", detector_version());
/// ```
pub fn detector_version() -> String {
    // Keep the lock result alive until the end of the function so the copy
    // below completes before another thread may call into cld2.  A
    // poisoned lock still carries the guard, so it is held either way.
    let _held = CLD2_VERSION_LOCK.lock();

    let raw = unsafe { CLD2_DetectLanguageVersion() };
    assert!(!raw.is_null());

    // Copy the C string into Rust-owned memory while the lock is held.
    let c_text = unsafe { CStr::from_ptr(raw) };
    from_utf8(c_text.to_bytes()).unwrap().to_string()
}

/// Guess the language of `text`, returning only the detected language (if
/// any) and whether the guess is considered reliable.
///
/// This is a convenience wrapper around `detect_language_ext` that passes
/// default hints and discards the rest of the detection result.
///
/// ```
/// use cld2::{detect_language, Format, Reliable, Unreliable, Lang};
///
/// let text = "It is an ancient Mariner,
/// And he stoppeth one of three.
/// 'By thy long grey beard and glittering eye,
/// Now wherefore stopp'st thou me?";
///
/// assert_eq!((Some(Lang("en")), Reliable),
///            detect_language(text, Format::Text));
///
/// assert_eq!((None, Unreliable),
///            detect_language("blah", Format::Html));
/// ```
pub fn detect_language(text: &str, format: Format) ->
    (Option<Lang>, Reliability)
{
    let no_hints = Default::default();
    let detected = detect_language_ext(text, format, &no_hints);
    (detected.language, detected.reliability)
}

/// Detect the language of the input text, using optional hints, and return
/// detailed statistics.
///
/// ```
/// use std::default::Default;
/// use cld2::{detect_language_ext, Format, Lang};
///
/// let text = "Sur le pont d'Avignon,
/// L'on y danse, l'on y danse,
/// Sur le pont d'Avignon
/// L'on y danse tous en rond.
///
/// Les belles dames font comme ça
/// Et puis encore comme ça.
/// Les messieurs font comme ça
/// Et puis encore comme ça.";
///
/// let detected =
///   detect_language_ext(text, Format::Text, &Default::default());
/// 
/// assert_eq!(Some(Lang("fr")), detected.language);
/// ```
pub fn detect_language_ext(text: &str, format: Format, hints: &Hints)
    -> DetectionResult
{
    let mut language3 = [LanguageId::UNKNOWN_LANGUAGE,
                         LanguageId::UNKNOWN_LANGUAGE,
                         LanguageId::UNKNOWN_LANGUAGE];
    let mut percent3: [c_int; 3] = [0, 0, 0];
    let mut normalized_score3: [c_double; 3] = [0.0, 0.0, 0.0];
    let mut text_bytes: c_int = 0;
    let mut is_reliable: bool = false;

    unsafe {
        hints.with_c_rep(|hints_ptr| {
            let lang = CLD2_ExtDetectLanguageSummary4(
                text.as_ptr() as *const i8, text.len() as c_int,
                format == Format::Text, hints_ptr, 0,
                language3.as_mut_ptr(),
                percent3.as_mut_ptr(),
                normalized_score3.as_mut_ptr(),
                null_mut(), &mut text_bytes, &mut is_reliable);
            from_ffi(lang, &language3, &percent3, &normalized_score3,
                     text_bytes, is_reliable)
        })
    }
}

/// Convert an optional Rust string into an owned, NUL-terminated C string.
///
/// # Panics
///
/// Panics if the string contains an interior NUL byte.
fn to_c_string(s: Option<&str>) -> Option<CString> {
    s.map(|v| {
        CString::new(v.as_bytes())
            .expect("hint strings must not contain NUL bytes")
    })
}

/// Borrow a raw pointer to an optional C string, or return a null pointer
/// for `None`.
///
/// The pointer is only valid while the `CString` it was borrowed from is
/// alive, so the caller must keep that `CString` around for as long as the
/// pointer is in use.
fn to_c_str_or_null(s: Option<&CString>) -> *const c_char {
    s.map_or(null(), |v| v.as_ptr())
}

/// A value which can be converted to type `R` for use with the FFI.
trait WithCRep<R> {
    /// Call the function `body` with a C-compatible representation of type
    /// `R`.  The representation is only guaranteed to be valid for the
    /// duration of the call to `body`.
    fn with_c_rep<T, F: FnOnce(R) -> T>(&self, body: F) -> T;
}

impl<'a> WithCRep<*const CLDHints> for Hints<'a> {
    /// Build a temporary `CLDHints` from these hints and pass a pointer to
    /// it to `body`.
    ///
    /// # Panics
    ///
    /// Panics if `content_language` or `tld` contains an interior NUL
    /// byte.
    fn with_c_rep<T, F: FnOnce(*const CLDHints) -> T>(&self, body: F) -> T {
        // These owned C strings must outlive the call to `body`: the
        // `CLDHints` struct below only stores raw pointers into them.
        let clang = to_c_string(self.content_language);
        let tld = to_c_string(self.tld);

        let lang = self.language
            .map(|Lang(c)| LanguageIdExt::from_name(c))
            .unwrap_or(LanguageId::UNKNOWN_LANGUAGE);
        let encoding = self.encoding
            .unwrap_or(Encoding::UNKNOWN_ENCODING) as c_int;
        let hints =
            CLDHints{content_language_hint: to_c_str_or_null(clang.as_ref()),
                     tld_hint: to_c_str_or_null(tld.as_ref()),
                     encoding_hint: encoding, language_hint: lang};
        body(&hints)
    }
}

/// Assemble a `DetectionResult` from the raw values produced by the FFI
/// detection call.
///
/// `lang` is the overall detected language.  `language3`, `percent3` and
/// `normalized_score3` are parallel three-element arrays that are zipped
/// into three `LanguageScore` entries; each percent is narrowed to `u8`
/// with an `as` cast.  `text_bytes` is passed through unchanged and
/// `reliable` is converted with `Reliability::from_bool`.
fn from_ffi(lang: LanguageId, language3: &[LanguageId; 3],
            percent3: &[c_int; 3], normalized_score3: &[c_double; 3],
            text_bytes: c_int, reliable: bool) -> DetectionResult
{
    let detected = lang.to_lang();

    // Build the score entry for slot `idx` of the three parallel arrays.
    let entry = |idx: usize| {
        let language = language3[idx].to_lang();
        let percent = percent3[idx] as u8;
        let normalized_score = normalized_score3[idx];
        LanguageScore{language: language,
                      percent: percent,
                      normalized_score: normalized_score}
    };
    let scores = [entry(0), entry(1), entry(2)];

    let reliability = Reliability::from_bool(reliable);
    DetectionResult::new(detected, scores, text_bytes, reliability)
}