"结巴"中文分词的Golang版本

GoJieba English

Build Status Author Donate Tag Performance License GoDoc Coverage Status codebeat badge Go Report Card Awesome

logo

GoJieba是"结巴"中文分词的Golang语言版本。

简介

  • 支持多种分词方式,包括: 最大概率模式, HMM新词发现模式, 搜索引擎模式, 全模式
  • 核心算法底层由C++实现,性能高效。
  • 字典路径可配置,NewJieba(...string), NewExtractor(...string) 可变形参,当参数为空时使用默认词典(推荐方式)

用法

go get github.com/yanyiwu/gojieba

分词示例

package main

import (
	"fmt"
	"strings"

	"github.com/yanyiwu/gojieba"
)

// main demonstrates each gojieba segmentation entry point in turn. For every
// step it prints the input sentence followed by the labelled result.
func main() {
	useHMM := true
	seg := gojieba.NewJieba()
	defer seg.Free()

	// show prints the input sentence, then the label and the tokens joined by sep.
	show := func(input, label string, tokens []string, sep string) {
		fmt.Println(input)
		fmt.Println(label, strings.Join(tokens, sep))
	}

	// Full mode, then precise mode, on the same sentence.
	text := "我来到北京清华大学"
	show(text, "全模式:", seg.CutAll(text), "/")
	show(text, "精确模式:", seg.Cut(text, useHMM), "/")

	// The same word is cut before and after it is added to the dictionary.
	text = "比特币"
	show(text, "精确模式:", seg.Cut(text, useHMM), "/")

	seg.AddWord("比特币")
	text = "比特币"
	show(text, "添加词典后,精确模式:", seg.Cut(text, useHMM), "/")

	// Precise mode with HMM enabled on a sentence used to show new-word recognition.
	text = "他来到了网易杭研大厦"
	show(text, "新词识别:", seg.Cut(text, useHMM), "/")

	// Search-engine mode.
	text = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"
	show(text, "搜索引擎模式:", seg.CutForSearch(text, useHMM), "/")

	// Part-of-speech tagging; results are joined with commas.
	text = "长春市长春药店"
	show(text, "词性标注:", seg.Tag(text), ",")

	text = "区块链"
	show(text, "词性标注:", seg.Tag(text), ",")

	// Search-engine mode again, this time with HMM switched off.
	text = "长江大桥"
	show(text, "搜索引擎模式:", seg.CutForSearch(text, !useHMM), "/")

	// Tokenize in search mode and in default mode, both without HMM.
	searchTokens := seg.Tokenize(text, gojieba.SearchMode, !useHMM)
	fmt.Println(text)
	fmt.Println("Tokenize:(搜索引擎模式)", searchTokens)

	defaultTokens := seg.Tokenize(text, gojieba.DefaultMode, !useHMM)
	fmt.Println(text)
	fmt.Println("Tokenize:(默认模式)", defaultTokens)

	// Keyword extraction, asking for at most 5 weighted keywords.
	weighted := seg.ExtractWithWeight(text, 5)
	fmt.Println("Extract:", weighted)
}
我来到北京清华大学
全模式: 我/来到/北京/清华/清华大学/华大/大学
我来到北京清华大学
精确模式: 我/来到/北京/清华大学
比特币
精确模式: 比特/币
比特币
添加词典后,精确模式: 比特币
他来到了网易杭研大厦
新词识别: 他/来到/了/网易/杭研/大厦
小明硕士毕业于中国科学院计算所,后在日本京都大学深造
搜索引擎模式: 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
长春市长春药店
词性标注: 长春市/ns,长春/ns,药店/n
区块链
词性标注: 区块链/nz
长江大桥
搜索引擎模式: 长江/大桥/长江大桥
长江大桥
Tokenize:(搜索引擎模式) [{长江 0 6} {大桥 6 12} {长江大桥 0 12}]

See example in jieba_test, extractor_test

Benchmark

Jieba中文分词系列性能评测

Unittest

go test ./...

Benchmark

go test -bench "Jieba" -test.benchtime 10s
go test -bench "Extractor" -test.benchtime 10s

Contributors

Code Contributors

This project exists thanks to all the people who contribute.

Contact

Owner
Yanyi Wu
Recommendation System
Yanyi Wu
Comments
  • Issue: Mac OS 下build Linux 版本提示 undefined: gojieba.NewJieba

    Issue: Mac OS 下build Linux 版本提示 undefined: gojieba.NewJieba

    package main
    
    import (
        "fmt"
    
        "github.com/yanyiwu/gojieba"
    )
    
    func main() {
        res := SplitWords("北京欢迎你", "all", "") // "all" selects the CutAll branch; the empty dict skips AddWord
        fmt.Println(res)
    }
    
    func SplitWords(text, model, dict string) []string { // segments text in the mode named by model; a non-empty dict is added as a word first
        var words []string
        jb := gojieba.NewJieba() // line 16 of ts.go: the call the build reports as undefined when GOOS is linux or windows
    
        if dict != "" {
            jb.AddWord(dict)
        }
        defer jb.Free()
    
        switch model {
        case "all":
            words = jb.CutAll(text)
        case "accurate":
            words = jb.Cut(text, true)
        } // no default case: any other model leaves words nil
        return words
    }
    
    > export GOOS=linux
    > go build ts.go
    > # command-line-arguments
    > ./ts.go:16:8: undefined: gojieba.NewJieba
    
    • 环境:mac OS High Sierra 10.13.6 (17G65)
    • 如果改为 export GOOS=darwin则正常build
    • GOOS为: linux 或 windows都会出现此问题
  • 提问:70M左右的网页数据,索引出的文件在1.6G是否正常?

    提问:70M左右的网页数据,索引出的文件在1.6G是否正常?

    第一次索引完毕,吓一跳。

        1.6 GiB [##########] /tmp                                                                                                                         
       72.1 MiB [          ] /bleve
    

    bleve目录为网页目录,tmp目录为索引目录

  • Example panic

    Example panic

    package main
    
    import (
        "fmt"
        "strings"
    
        "github.com/yanyiwu/gojieba"
    )
    
    func main() {
        var s string
        var words []string
        use_hmm := true
        x := gojieba.NewJieba() // main.go:14 in the trace below: aborts inside cgo because the default dictionary file could not be opened
        defer x.Free()
    
        s = "我来到北京清华大学"
        words = x.CutAll(s)
        fmt.Println(s)
        fmt.Println("全模式:", strings.Join(words, "/"))
    
        words = x.Cut(s, use_hmm)
        fmt.Println(s)
        fmt.Println("精确模式:", strings.Join(words, "/"))
        s = "比特币"
        words = x.Cut(s, use_hmm)
        fmt.Println(s)
        fmt.Println("精确模式:", strings.Join(words, "/"))
    
        x.AddWord("比特币")
        s = "比特币"
        words = x.Cut(s, use_hmm)
        fmt.Println(s)
        fmt.Println("添加词典后,精确模式:", strings.Join(words, "/"))
    
    
        s = "他来到了网易杭研大厦"
        words = x.Cut(s, use_hmm)
        fmt.Println(s)
        fmt.Println("新词识别:", strings.Join(words, "/"))
    
        s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"
        words = x.CutForSearch(s, use_hmm)
        fmt.Println(s)
        fmt.Println("搜索引擎模式:", strings.Join(words, "/"))
    
        s = "长春市长春药店"
        words = x.Tag(s)
        fmt.Println(s)
        fmt.Println("词性标注:", strings.Join(words, ","))
    
        s = "区块链"
        words = x.Tag(s)
        fmt.Println(s)
        fmt.Println("词性标注:", strings.Join(words, ","))
    
        s = "长江大桥"
        words = x.CutForSearch(s, !use_hmm)
        fmt.Println(s)
        fmt.Println("搜索引擎模式:", strings.Join(words, "/"))
    
        wordinfos := x.Tokenize(s, gojieba.SearchMode, !use_hmm)
        fmt.Println(s)
        fmt.Println("Tokenize:(搜索引擎模式)", wordinfos)
    
        wordinfos = x.Tokenize(s, gojieba.DefaultMode, !use_hmm)
        fmt.Println(s)
        fmt.Println("Tokenize:(默认模式)", wordinfos)
    
        ex := gojieba.NewExtractor() // keyword extraction uses a separate Extractor, which has its own Free
        defer ex.Free()
        keywords := ex.ExtractWithWeight(s, 5)
        fmt.Println("Extract:", keywords)
    }
    
    2016-09-06 16:10:10 ./deps/cppjieba/DictTrie.hpp:153 FATAL exp: [ifs.is_open()] false. open /Users/lingchax/.go/src/github.com/yanyiwu/gojieba/dict/jieba.dict.utf8 failed.
    SIGABRT: abort
    PC=0x7fff9080cf06 m=0
    signal arrived during cgo execution
    
    goroutine 1 [syscall, locked to thread]:
    runtime.cgocall(0x40925b0, 0xc42004f9a0, 0xc400000000)
            /usr/local/Cellar/go/1.7/libexec/src/runtime/cgocall.go:131 +0x110 fp=0xc42004f970 sp=0xc42004f930
    hello/vendor/github.com/yanyiwu/gojieba._Cfunc_NewJieba(0x4503290, 0x45032e0, 0x4503370, 0x0)
            ??:0 +0x4e fp=0xc42004f9a0 sp=0xc42004f970
    hello/vendor/github.com/yanyiwu/gojieba.NewJieba(0x0, 0x0, 0x0, 0x0)
            /Users/lingchax/.go/src/hello/vendor/github.com/yanyiwu/gojieba/jieba.go:37 +0x1b3 fp=0xc42004fa90 sp=0xc42004f9a0
    main.main()
            /Users/lingchax/.go/src/hello/main.go:14 +0x51 fp=0xc42004ff48 sp=0xc42004fa90
    runtime.main()
            /usr/local/Cellar/go/1.7/libexec/src/runtime/proc.go:183 +0x1f4 fp=0xc42004ffa0 sp=0xc42004ff48
    runtime.goexit()
            /usr/local/Cellar/go/1.7/libexec/src/runtime/asm_amd64.s:2086 +0x1 fp=0xc42004ffa8 sp=0xc42004ffa0
    
    goroutine 17 [syscall, locked to thread]:
    runtime.goexit()
            /usr/local/Cellar/go/1.7/libexec/src/runtime/asm_amd64.s:2086 +0x1
    
    rax    0x0
    rbx    0x6
    rcx    0x7fff5fbff0f8
    rdx    0x0
    rdi    0x307
    rsi    0x6
    rbp    0x7fff5fbff120
    rsp    0x7fff5fbff0f8
    r8     0x8
    r9     0x0
    r10    0x8000000
    r11    0x206
    r12    0x7fff5fbff40a
    r13    0x4802000
    r14    0x7fff78114000
    r15    0x7fff759af398
    rip    0x7fff9080cf06
    rflags 0x206
    cs     0x7
    fs     0x0
    gs     0x0
    exit status 2
    

    go env:

    GOARCH="amd64"
    GOBIN=""
    GOEXE=""
    GOHOSTARCH="amd64"
    GOHOSTOS="darwin"
    GOOS="darwin"
    GOPATH="/Users/lingchax/.go"
    GORACE=""
    GOROOT="/usr/local/Cellar/go/1.7/libexec"
    GOTOOLDIR="/usr/local/Cellar/go/1.7/libexec/pkg/tool/darwin_amd64"
    CC="clang"
    GOGCCFLAGS="-fPIC -m64 -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -fdebug-prefix-map=/var/folders/1x/d4dgqvms23bgmjpyshv4j6580000gn/T/go-build873055974=/tmp/go-build -gno-record-gcc-switches -fno-common"
    CXX="clang++"
    CGO_ENABLED="1"
    
  • NewJieba() memory leak [resolved] (it is go test's problem)

    NewJieba() memory leak [resolved] (it is go test's problem)

    env

    ❯ go version
    go version go1.12.5 linux/amd64
    
    ❯ gcc -v
    Using built-in specs.
    COLLECT_GCC=gcc
    COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-pc-linux-gnu/8.3.0/lto-wrapper
    Target: x86_64-pc-linux-gnu
    Configured with: /build/gcc/src/gcc/configure --prefix=/usr --libdir=/usr/lib --libexecdir=/usr/lib --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=https://bugs.archlinux.org/ --enable-languages=c,c++,ada,fortran,go,lto,objc,obj-c++ --enable-shared --enable-threads=posix --enable-libmpx --with-system-zlib --with-isl --enable-__cxa_atexit --disable-libunwind-exceptions --enable-clocale=gnu --disable-libstdcxx-pch --disable-libssp --enable-gnu-unique-object --enable-linker-build-id --enable-lto --enable-plugin --enable-install-libiberty --with-linker-hash-style=gnu --enable-gnu-indirect-function --enable-multilib --disable-werror --enable-checking=release --enable-default-pie --enable-default-ssp --enable-cet=auto
    Thread model: posix
    gcc version 8.3.0 (GCC) 
    

    add below code to jieba_test.go:

    // BenchmarkNewJiebaMemLeak creates one Jieba with the default dictionary
    // paths and frees it when the function returns. The body has no b.N loop,
    // so exactly one instance is built per invocation of this function.
    // NOTE(review): the reported memory growth presumably comes from how often
    // go test re-invokes the function, not from the body itself; confirm.
    func BenchmarkNewJiebaMemLeak(b *testing.B) {
    	b.ResetTimer()
    	// Equivalent to x := NewJieba(DICT_PATH, HMM_PATH, USER_DICT_PATH).
    	x := NewJieba()
    	defer x.Free()
    	// Deferred calls run last-in-first-out, so the timer stops before x.Free() runs.
    	defer b.StopTimer()
    }
    

    run the bench and watch memory usage:

    go test -bench=BenchmarkNewJiebaMemLeak .
    

    We can see memory usage grow from 76.0 MiB to 718.3 MiB. Does x.Free() really free the memory?

    [root@8700k hacklog]# ps_mem -w 1 | grep jieba
    118.6 MiB +  48.5 KiB = 118.6 MiB	gojieba.test
    118.6 MiB +  48.5 KiB = 118.7 MiB	gojieba.test
    331.1 MiB +  48.5 KiB = 331.1 MiB	gojieba.test
    359.6 MiB +  48.5 KiB = 359.7 MiB	gojieba.test
    475.1 MiB +  48.5 KiB = 475.1 MiB	gojieba.test
    589.6 MiB +  48.5 KiB = 589.6 MiB	gojieba.test
    568.3 MiB +  48.5 KiB = 568.4 MiB	gojieba.test
    675.7 MiB +  48.5 KiB = 675.8 MiB	gojieba.test
    633.1 MiB +  48.5 KiB = 633.2 MiB	gojieba.test
    

    ps_mem is a utility that accurately reports the in-core memory usage of a program; you can install it with pip install ps_mem.

    the result:

    ❯ go test -test.bench=BenchmarkNewJiebaMemLeak -test.benchmem
    /home/hacklog/go/src/github.com/yanyiwu/gojieba/config_test.go
    goos: linux
    goarch: amd64
    pkg: github.com/yanyiwu/gojieba
    BenchmarkNewJiebaMemLeak-12    	2000000000	         0.20 ns/op	       0 B/op	       0 allocs/op
    PASS
    ok  	github.com/yanyiwu/gojieba	10.649s
    
    

    If the Free() call (defer x.Free()) is commented out, the result is:

     79.9 MiB +  41.5 KiB =  79.9 MiB	gojieba.test
    118.6 MiB +  41.5 KiB = 118.7 MiB	gojieba.test
    424.5 MiB +  41.5 KiB = 424.6 MiB	gojieba.test
    716.5 MiB +  41.5 KiB = 716.6 MiB	gojieba.test
    987.3 MiB +  41.5 KiB = 987.3 MiB	gojieba.test
      1.3 GiB +  41.5 KiB =   1.3 GiB	gojieba.test
      1.6 GiB +  41.5 KiB =   1.6 GiB	gojieba.test
      1.9 GiB +  41.5 KiB =   1.9 GiB	gojieba.test
      2.2 GiB +  41.5 KiB =   2.2 GiB	gojieba.test
    
  • 在不同的包里进行初始化时没办法使用

    在不同的包里进行初始化时没办法使用

    在同一个包里操作时,可以使用,但我项目里有一个初始化包 将 NewJieba放在初始化包里初始化,其它包调用这个*Jieba对象时出错 错误信息 fatal error: unexpected signal during runtime execution [signal SIGSEGV: segmentation violation code=0x1 addr=0x34b6808 pc=0x9b9b8a]

    runtime stack: runtime.throw(0xafb348, 0x2a) /data/go/src/runtime/panic.go:566 +0x95 runtime.sigpanic() /data/go/src/runtime/sigpanic_unix.go:12 +0x2cc

    goroutine 285 [syscall, locked to thread]: runtime.cgocall(0x9a8630, 0xc4206d7518, 0xc400000000) /data/go/src/runtime/cgocall.go:131 +0x110 fp=0xc4206d74d0 sp=0xc4206d7490 github.com/yanyiwu/gojieba._Cfunc_ExtractWithWeight(0x20b75f0, 0x7f5be80008c0, 0x4, 0x0) ??:0 +0x4e fp=0xc4206d7518 sp=0xc4206d74d0 github.com/yanyiwu/gojieba.(*Jieba).ExtractWithWeight(0xc42002e4d0, 0xc420c48300, 0x27, 0x4, 0x0, 0x0, 0x0) /data/gowork/src/github.com/yanyiwu/gojieba/jieba.go:130 +0x10c fp=0xc4206d7588 sp=0xc4206d7518

  • 安装后运行不成功啊。

    安装后运行不成功啊。

    报这个错:2016-08-19 10:17:27 ./deps/cppjieba/DictTrie.hpp:153 FATAL exp: [ifs.is_open()] false. open /Users/hqw/Desktop:/Users/hqw/Desktop/gofiles/src/github.com/yanyiwu/gojieba/dict/jieba.dict.utf8 failed. SIGABRT: abort PC=0x7fff8aead866 m=0 signal arrived during cgo execution

    goroutine 1 [syscall, locked to thread]: runtime.cgocall(0x40b6290, 0xc82004dca8, 0xc800000000) /usr/local/go/src/runtime/cgocall.go:123 +0x11b fp=0xc82004dc70 sp=0xc82004dc40 github.com/yanyiwu/gojieba._Cfunc_NewJieba(0x4500000, 0x4500070, 0x45000e0, 0x0) ??:0 +0x42 fp=0xc82004dca8 sp=0xc82004dc70 github.com/yanyiwu/gojieba.NewJieba(0x0, 0x0, 0x0, 0x0) /Users/hqw/Desktop/gofiles/src/github.com/yanyiwu/gojieba/jieba.go:37 +0x22d fp=0xc82004dda0 sp=0xc82004dca8 main.main() /Users/hqw/gojieba.go:14 +0x62 fp=0xc82004df50 sp=0xc82004dda0 runtime.main() /usr/local/go/src/runtime/proc.go:188 +0x2b0 fp=0xc82004dfa0 sp=0xc82004df50 runtime.goexit() /usr/local/go/src/runtime/asm_amd64.s:1998 +0x1 fp=0xc82004dfa8 sp=0xc82004dfa0

    goroutine 17 [syscall, locked to thread]: runtime.goexit() /usr/local/go/src/runtime/asm_amd64.s:1998 +0x1

    rax 0x0 rbx 0x7fff72e77310 rcx 0x7fff5fbff0e8 rdx 0x0 rdi 0x303 rsi 0x6 rbp 0x7fff5fbff110 rsp 0x7fff5fbff0e8 r8 0x4600100 r9 0x0 r10 0x8000000 r11 0x206 r12 0x500000a r13 0x5000000 r14 0x6 r15 0x7fff74e55398 rip 0x7fff8aead866 rflags 0x206 cs 0x7 fs 0x0 gs 0x72e70000 exit status 2 错误: 进程退出代码 1.

  • go1.17 get报错 cc1: error: unrecognized command line option

    go1.17 get报错 cc1: error: unrecognized command line option "-fno-lto"

    [mylinux]$ go get -t "github.com/yanyiwu/gojieba"

    # github.com/yanyiwu/gojieba
    cgo: gcc did not produce error at completed:1
    on input:
    
    #line 1 "cgo-builtin-prolog"
    #include <stddef.h> /* for ptrdiff_t and size_t below */
    
    /* Define intgo when compiling with GCC.  */
    typedef ptrdiff_t intgo;
    
    #define GO_CGO_GOSTRING_TYPEDEF
    typedef struct { const char *p; intgo n; } _GoString_;
    typedef struct { char *p; intgo n; intgo c; } _GoBytes_;
    _GoString_ GoString(char *p);
    _GoString_ GoStringN(char *p, int l);
    _GoBytes_ GoBytes(void *p, int n);
    char *CString(_GoString_);
    void *CBytes(_GoBytes_);
    void *_CMalloc(size_t);
    
    __attribute__ ((unused))
    static size_t _GoStringLen(_GoString_ s) { return (size_t)s.n; }
    
    __attribute__ ((unused))
    static const char *_GoStringPtr(_GoString_ s) { return s.p; }
    #line 3 "/home/bae/go-env/goSpace/pkg/mod/github.com/yanyiwu/gojieba@<version>/jieba.go"
    
    
    #include <stdlib.h>
    #include "jieba.h"
    
    #line 1 "cgo-generated-wrapper"
    #line 1 "not-declared"
    
    balabala..........................
    
    void __cgo_f_20_5(void) { static const char __cgo_undefined__5[] = (int); }
    #line 1 "completed"
    int __cgo__1 = __cgo__2;
    
    full error output:
    cc1: error: unrecognized command line option "-fno-lto"
    
  • deps/limonp/LocalVector.hpp: fixup gcc8 warnings

    deps/limonp/LocalVector.hpp: fixup gcc8 warnings

    deps/limonp/LocalVector.hpp: fixup gcc8 warnings

    ref to https://github.com/facebook/rocksdb/pull/3736/files

    this should fixup https://github.com/yanyiwu/gojieba/issues/68

  • Create LICENSE file

    Create LICENSE file

    LICENSE file means this repo will play nicely with tools like https://github.com/pmezard/licenses.

    NOTE You are free to change the copyright line to your own name/company.

  • deps/cppjieba/DictTrie.hpp:153 FATAL exp: [ifs.is_open()] false. open

    deps/cppjieba/DictTrie.hpp:153 FATAL exp: [ifs.is_open()] false. open

    您好,项目导入了您的gojieba包,但是运行出错,win7 64位,请问这是什么原因? gcc用的是mingw-w64 版本:x86_64-6.3.0-release-posix-seh-rt_v5-rev1

    ./deps/cppjieba/DictTrie.hpp:153 FATAL exp: [ifs.is_open()] false. open /workspace/goWorkSpace/src/github.com/yanyiwu/gojieba/dict/jieba.dict.utf8 failed.
    
    This application has requested the Runtime to terminate it in an unusual way.
    Please contact the application's support team for more information.
    
  • fix[cgo]: Fix - memory allocated by cgo will not be automatically freed by gc.

    fix[cgo]: Fix - memory allocated by cgo will not be automatically freed by gc.

    Currently, when an object containing a jieba instance is dropped, the memory allocated to jieba will not be freed automatically.

    This Pull Request uses runtime.SetFinalizer to invoke Free and avoid memory leaking in the above cases.

  • x.Addword 不生效

    x.Addword 不生效

    divider = gojieba.NewJieba()
    defer divider.Free()
    divider.AddWord("望城燃气")
    divider.AddWord("国航")
    divider.AddWord("桑德")
    divider.AddWord("新能源")
    s := "河北桑德新能源动力科技有限公司"
    words := divider.Tag(s)
    fmt.Println(words)
    

    /////[河北/ns 桑德/x 新/a 能源动力/n 科技/n 有限公司/n]

  • I used it in a goframe framework project. On Ubuntu, compilation reports an error

    I used it in a goframe framework project. On Ubuntu, compilation reports an error

    The error info:

    2022-02-23 01:08:29.566 go build -o ./bin/linux_amd64/main .\main.go 
    2022-02-23 01:08:31.543 failed to build, os:linux, arch:amd64, error:
    # gaosomebloggfv1/app/service
    app\service\search.go:16:15: undefined: gojieba.NewJieba
    
    

    I've already installed it and run "go mod tidy".

  • Free函数不能有效释放内存会造成内存泄漏

    Free函数不能有效释放内存会造成内存泄漏

    近日发现服务器内存RES不断上涨,最终定位到jieba这里存在内存泄漏问题,测试代码如下:

    GO VERSION:1.16.7

    // main is the reporter's leak reproducer: each pass creates count Jieba
    // instances, holds them for five seconds, frees every one, and starts over.
    // Neither count nor paths is defined in this snippet.
    func main() {
      loop:
      var list []*gojieba.Jieba
      for i := 0; i < count; i++ {
          // TODO: paths is not defined in this snippet
          t := gojieba.NewJieba(paths...)
          list = append(list, t)
      }
    
      time.Sleep(5 * time.Second)
    
      // Release every instance created during this pass.
      for i := range list {
          list[i].Free()
      }
    
      goto loop // jump back and repeat with a freshly declared, empty list
    }
    

    通过上述代码最终会因内存太大造成OOM进程被杀掉