Skip to content

Commit 56eeddd

Browse files
committed
Add langdetect to the list of benchmark libraries
1 parent 42ffc1a commit 56eeddd

File tree

9 files changed

+51
-12
lines changed

9 files changed

+51
-12
lines changed

‎docs/benchmark.md

+5-4
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616
| -------------- | --------------------------- | ------------------- | --------------------- | -------------- | ------------------ | --------- |
1717
| **TinyLD** | `yarn bench:tinyld` | 97.7311% | 1.9247% | 0.3441% | 0.0966ms. | 930KB |
1818
| **TinyLD Web** | `yarn bench:tinyld-light` | 97.4512% | 2.1131% | 0.4358% | 0.0802ms. | **110KB** |
19-
| **node-cld** | `yarn bench:cld` | 88.9148% | 1.7489% | 9.3363% | 0.0477ms. | > 10MB |
19+
| **langdetect** | `yarn bench:langdetect` | 94.879% | 5.1093% | 0.0117% | 0.4631ms. | 1.8MB |
20+
| node-cld | `yarn bench:cld` | 88.9148% | 1.7489% | 9.3363% | 0.0477ms. | > 10MB |
2021
| node-lingua | `yarn bench:lingua` | 86.3093% | 0.13% | 13.5607% | 1.5695ms. | ~100MB |
2122
| franc | `yarn bench:franc` | 68.7783% | 26.3432% | 4.8785% | 0.1626ms. | 267KB |
22-
| franc-all | `yarn bench:franc-all` | 61.7893% | 33.3322% | 4.8785% | 0.3688ms. | 509KB |
2323
| franc-min | `yarn bench:franc-min` | 65.5163% | 23.5794% | 10.9044% | 0.0734ms. | **119KB** |
24+
| franc-all | `yarn bench:franc-all` | 61.7893% | 33.3322% | 4.8785% | 0.3688ms. | 509KB |
2425
| languagedetect | `yarn bench:languagedetect` | 61.6068% | 12.295% | 26.0982% | 0.212ms. | **240KB** |
2526

2627
which gives us the following graph
@@ -35,11 +36,11 @@ Let's now compare those libraries per language
3536

3637
#### Recommended
3738

38-
- For **NodeJS**: `TinyLD` or `node-cld` (fast and accurate)
39+
- For **NodeJS**: `TinyLD`, `langdetect` or `node-cld` (fast and accurate)
3940
- For **Browser**: `TinyLD Light` or `franc-min` (small, decent accuracy; franc is less accurate but supports more languages)
4041

4142
#### Not recommended
4243

4344
- `node-lingua` has quite good accuracy but is just too big and slow
44-
- `franc-all` is the worse in term of accuracy, not a surprise because it tries to detect 400+ languages. A technical demo to put big numbers but useless for real usage, even a language like english is barely at ~45% detection rate.
45+
- `franc-all` is the worst in terms of accuracy, which is not a surprise because it tries to detect 400+ languages with only 3-grams. It is a technical demo to show off big numbers but useless for real usage: even a language like English barely reaches a ~45% detection rate.
4546
- `languagedetect` is light but just not accurate enough, and is really focused on Indo-European languages (it supports Kazakh but not Chinese, Korean or Japanese). Interesting fact: it's more accurate than franc on Western European languages.

‎docs/language.svg

+1-1
Loading

‎docs/overall.svg

+1-1
Loading

‎package.json

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
"bench:franc": "ts-node src/benchmark/franc.ts",
4747
"bench:franc-all": "ts-node src/benchmark/franc-all.ts",
4848
"bench:franc-min": "ts-node src/benchmark/franc-min.ts",
49+
"bench:langdetect": "ts-node src/benchmark/langdetect.ts",
4950
"bench:languagedetect": "ts-node src/benchmark/languagedetect.ts",
5051
"bench:lingua": "cross-env TINYLD_CONFIG=normal ts-node src/benchmark/lingua.ts",
5152
"bench:tinyld": "cross-env TINYLD_CONFIG=normal ts-node src/benchmark/tinyld.ts",
@@ -78,6 +79,7 @@
7879
"franc": "^5.0.0",
7980
"franc-all": "^5.0.0",
8081
"franc-min": "^5.0.0",
82+
"langdetect": "^0.2.1",
8183
"languagedetect": "^2.0.0",
8284
"lingua-node": "^0.3.0",
8385
"npm-run-all": "^4.1.5",

‎src/benchmark/langdetect.ts

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import { benchmark } from './bench'
2+
import fs from 'fs'
3+
4+
// eslint-disable-next-line @typescript-eslint/no-var-requires
5+
const { detect } = require('langdetect')
6+
7+
/**
 * Adapter that exposes the `langdetect` package's `detect` function with a
 * plain `string -> string` signature so it can be passed to `benchmark`.
 *
 * @param val - Text whose language should be detected.
 * @returns The `lang` field of the first entry returned by `detect`, with
 *   `'zh-cn'` and `'zh-tw'` both collapsed to `'zh'`. Returns `''` when
 *   `detect` yields a falsy or empty result, or when the first entry has a
 *   falsy `lang`.
 */
function langdetect(val: string): string {
8+
const res = detect(val)
9+
if (res && res.length > 0) { // guard against a falsy or empty result
10+
const lang = res[0].lang || '' // NOTE(review): presumably the first entry is the most probable candidate; confirm against the langdetect docs
11+
if (['zh-cn', 'zh-tw'].includes(lang)) return 'zh' // merge both Chinese variants into the single code 'zh'
12+
return lang
13+
}
14+
return ''
15+
}
16+
17+
// Script entry point: run the benchmark with the langdetect adapter and
// write the result to ./data/bench/langdetect.json.
;(async () => {
18+
const res = await benchmark(langdetect)
19+
if (!fs.existsSync('./data/bench')) fs.mkdirSync('./data/bench') // create the output directory on first run
20+
fs.writeFileSync('./data/bench/langdetect.json', JSON.stringify(res, null, 2)) // pretty-printed with 2-space indentation
21+
})()

‎utils/index.js

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ function getJSON(filepath) {
1010
const data = {
1111
tinyld: getJSON('./data/bench/tinyld.json'),
1212
'tinyld-light': getJSON('./data/bench/tinyld-light.json'),
13+
langdetect: getJSON('./data/bench/langdetect.json'),
1314
cld: getJSON('./data/bench/cld.json'),
1415
lingua: getJSON('./data/bench/lingua.json'),
1516
franc: getJSON('./data/bench/franc.json'),

‎utils/language.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ const chartistSvg = require('chartist-svg')
33
module.exports = (data, langs) => {
44
var graph = {
55
title: 'NodeJS Language Detection - Per Language',
6-
subtitle: 'Tinyld vs Cld vs Lingua vs Franc vs Languagedetect',
6+
subtitle: 'Tinyld vs Langdetect vs Cld vs Lingua vs Franc',
77
labels: langs.map((x) => x.toUpperCase()),
8-
series: ['tinyld', 'cld', 'lingua', 'franc', 'languagedetect'].map((lib) => {
8+
series: ['tinyld', 'langdetect', 'cld', 'lingua', 'franc'].map((lib) => {
99
return langs.map((lang) => {
1010
return data[lib].languages[lang]
1111
})

‎utils/mkdown.js

+6-4
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ async function generateDocBenchmark() {
3535
const data = {
3636
tinyld: getJSON('./data/bench/tinyld.json'),
3737
'tinyld-light': getJSON('./data/bench/tinyld-light.json'),
38+
langdetect: getJSON('./data/bench/langdetect.json'),
3839
cld: getJSON('./data/bench/cld.json'),
3940
lingua: getJSON('./data/bench/lingua.json'),
4041
franc: getJSON('./data/bench/franc.json'),
@@ -63,11 +64,12 @@ async function generateDocBenchmark() {
6364
| -------------- | --------------------------- | ------------------- | --------------------- | -------------- | ------------------ | --------- |
6465
| **TinyLD** | \`yarn bench:tinyld\` | ${stats('tinyld')} | 930KB |
6566
| **TinyLD Web** | \`yarn bench:tinyld-light\` | ${stats('tinyld-light')} | **110KB** |
66-
| **node-cld** | \`yarn bench:cld\` | ${stats('cld')} | > 10MB |
67+
| **langdetect** | \`yarn bench:langdetect\` | ${stats('langdetect')} | 1.8MB |
68+
| node-cld | \`yarn bench:cld\` | ${stats('cld')} | > 10MB |
6769
| node-lingua | \`yarn bench:lingua\` | ${stats('lingua')} | ~100MB |
6870
| franc | \`yarn bench:franc\` | ${stats('franc')} | 267KB |
69-
| franc-all | \`yarn bench:franc-all\` | ${stats('franc-all')} | 509KB |
7071
| franc-min | \`yarn bench:franc-min\` | ${stats('franc-min')} | **119KB** |
72+
| franc-all | \`yarn bench:franc-all\` | ${stats('franc-all')} | 509KB |
7173
| languagedetect | \`yarn bench:languagedetect\` | ${stats('languagedetect')} | **240KB** |
7274
7375
which gives us the following graph
@@ -81,13 +83,13 @@ Let's now compare those libraries per language
8183
8284
#### Recommended
8385
84-
- For **NodeJS**: \`TinyLD\` or \`node-cld\` (fast and accurate)
86+
- For **NodeJS**: \`TinyLD\`, \`langdetect\` or \`node-cld\` (fast and accurate)
8587
- For **Browser**: \`TinyLD Light\` or \`franc-min\` (small, decent accuracy, franc is less accurate but support more languages)
8688
8789
#### Not recommended
8890
8991
- \`node-lingua\` has a quite good accuracy but is just too big and slow
90-
- \`franc-all\` is the worse in term of accuracy, not a surprise because it tries to detect 400+ languages. A technical demo to put big numbers but useless for real usage, even a language like english barely reach ~45% detection rate.
92+
- \`franc-all\` is the worse in term of accuracy, not a surprise because it tries to detect 400+ languages with only 3-grams. A technical demo to put big numbers but useless for real usage, even a language like english barely reach ~45% detection rate.
9193
- \`languagedetect\` is light but just not accurate enough, really focused on indo-european languages (support kazakh but not chinese, korean or japanese). Interesting fact, it's more accurate than franc on west european languages.
9294
`
9395
)

‎yarn.lock

+12
Original file line numberDiff line numberDiff line change
@@ -1679,6 +1679,13 @@ kleur@^4.0.3:
16791679
resolved "https://registry.yarnpkg.com/kleur/-/kleur-4.1.4.tgz#8c202987d7e577766d039a8cd461934c01cda04d"
16801680
integrity sha512-8QADVssbrFjivHWQU7KkMgptGTl6WAcSdlbBPY4uNF+mWr6DGcKrvY2w4FQJoXch7+fKMjj0dRrL75vk3k23OA==
16811681

1682+
langdetect@^0.2.1:
1683+
version "0.2.1"
1684+
resolved "https://registry.yarnpkg.com/langdetect/-/langdetect-0.2.1.tgz#d7ca8339497a4dd65c4df935d46e344bfea86d01"
1685+
integrity sha1-18qDOUl6TdZcTfk11G40S/6obQE=
1686+
dependencies:
1687+
unicode-9.0.0 "0.7.0"
1688+
16821689
languagedetect@^2.0.0:
16831690
version "2.0.0"
16841691
resolved "https://registry.yarnpkg.com/languagedetect/-/languagedetect-2.0.0.tgz#4b8fa2b7593b2a3a02fb1100891041c53238936c"
@@ -2947,6 +2954,11 @@ underscore@^1.12.1:
29472954
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.1.tgz#0c1c6bd2df54b6b69f2314066d65b6cde6fcf9d1"
29482955
integrity sha512-hzSoAVtJF+3ZtiFX0VgfFPHEDRm7Y/QPjGyNo4TVdnDTdft3tr8hEkD25a1jC+TjTuE7tkHGKkhwCgs9dgBB2g==
29492956

2957+
unicode-9.0.0@0.7.0:
2958+
version "0.7.0"
2959+
resolved "https://registry.yarnpkg.com/unicode-9.0.0/-/unicode-9.0.0-0.7.0.tgz#4d303caf9c6dbf2b979943c18a384e226643b937"
2960+
integrity sha1-TTA8r5xtvyuXmUPBijhOImZDuTc=
2961+
29502962
unicode-properties@^1.2.2:
29512963
version "1.3.1"
29522964
resolved "https://registry.yarnpkg.com/unicode-properties/-/unicode-properties-1.3.1.tgz#cc642b6314bde2c691d65dd94cece09ed84f1282"

0 commit comments

Comments
 (0)