148 lines
3.0 KiB
Go
148 lines
3.0 KiB
Go
|
package rime
|
|||
|
|
|||
|
import (
|
|||
|
"bufio"
|
|||
|
"fmt"
|
|||
|
mapset "github.com/deckarep/golang-set/v2"
|
|||
|
"log"
|
|||
|
"os"
|
|||
|
"strconv"
|
|||
|
"strings"
|
|||
|
"unicode/utf8"
|
|||
|
)
|
|||
|
|
|||
|
// 一些临时用的函数
|
|||
|
|
|||
|
// Temp is a scratch entry point for one-off maintenance calls.
// Every call below is currently commented out, so invoking it does nothing;
// uncomment the helper you need before running.
func Temp() {
	// Pinyin test-case generation:
	// GeneratePinyinTest("你的行动力")
	// GeneratePinyinTest("都挺长的")
	// GeneratePinyinTest("血条长")

	// Dictionary maintenance:
	// findP(BasePath, "血")
	// Pinyin(ExtPath)
	// AddWeight(ExtPath, 100)
}
|
|||
|
|
|||
|
// 列出字表中多音字的状况:是否参与自动注音
|
|||
|
func polyphone() {
|
|||
|
// open file
|
|||
|
file, err := os.Open(HanziPath)
|
|||
|
if err != nil {
|
|||
|
log.Fatalln(err)
|
|||
|
}
|
|||
|
defer file.Close()
|
|||
|
|
|||
|
// 将所有读音读入 m
|
|||
|
type py struct {
|
|||
|
pinyin string
|
|||
|
weight int
|
|||
|
isAuto bool // 是否参与自动注音
|
|||
|
}
|
|||
|
m := make(map[string][]py)
|
|||
|
|
|||
|
sc := bufio.NewScanner(file)
|
|||
|
isMark := false
|
|||
|
for sc.Scan() {
|
|||
|
line := sc.Text()
|
|||
|
if !isMark {
|
|||
|
if line == "..." {
|
|||
|
isMark = true
|
|||
|
}
|
|||
|
continue
|
|||
|
}
|
|||
|
if line == "" || strings.HasPrefix(line, "#") {
|
|||
|
continue
|
|||
|
}
|
|||
|
parts := strings.Split(line, "\t")
|
|||
|
if len(parts) != 3 {
|
|||
|
log.Fatalln("len(parts) != 3", line)
|
|||
|
}
|
|||
|
hanzi, pinyin := parts[0], parts[1]
|
|||
|
weight, _ := strconv.Atoi(parts[2])
|
|||
|
m[hanzi] = append(m[hanzi], py{pinyin: pinyin, weight: weight})
|
|||
|
}
|
|||
|
|
|||
|
// 判断是否参与注音
|
|||
|
for hanzi, pys := range m {
|
|||
|
if len(pys) == 1 {
|
|||
|
continue
|
|||
|
}
|
|||
|
// 找到最大的权重
|
|||
|
max := 0
|
|||
|
for _, py := range pys {
|
|||
|
if py.weight > max {
|
|||
|
max = py.weight
|
|||
|
}
|
|||
|
}
|
|||
|
// 计算其他权重相较于 max 的比值,是否大于 0.05
|
|||
|
for i, py := range pys {
|
|||
|
if py.weight == max {
|
|||
|
m[hanzi][i].isAuto = true
|
|||
|
} else if float64(py.weight)/float64(max) > 0.05 {
|
|||
|
m[hanzi][i].isAuto = true
|
|||
|
}
|
|||
|
}
|
|||
|
// 输出
|
|||
|
fmt.Println(hanzi)
|
|||
|
for _, py := range pys {
|
|||
|
fmt.Println(py.pinyin, py.weight, py.isAuto)
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
// 在词库中找到此行是否包含同义多音字,如果包含且长度大于等于3,从文件中删除这行,并将所有删除的行写入到 1.txt 中
|
|||
|
func findP(dictPath string, ch string) {
|
|||
|
// open file
|
|||
|
file, err := os.OpenFile(dictPath, os.O_RDWR, 0666)
|
|||
|
if err != nil {
|
|||
|
log.Fatalln(err)
|
|||
|
}
|
|||
|
defer file.Close()
|
|||
|
|
|||
|
outFile, err := os.Create("1.txt")
|
|||
|
if err != nil {
|
|||
|
log.Fatalln(err)
|
|||
|
}
|
|||
|
defer outFile.Close()
|
|||
|
|
|||
|
lines := make([]string, 0)
|
|||
|
|
|||
|
isMark := false
|
|||
|
sc := bufio.NewScanner(file)
|
|||
|
set := mapset.NewSet[string]() // 去重用的
|
|||
|
for sc.Scan() {
|
|||
|
line := sc.Text()
|
|||
|
if !isMark {
|
|||
|
lines = append(lines, line)
|
|||
|
if line == mark {
|
|||
|
isMark = true
|
|||
|
}
|
|||
|
continue
|
|||
|
}
|
|||
|
if line == "" || strings.HasPrefix(line, "#") {
|
|||
|
lines = append(lines, line)
|
|||
|
continue
|
|||
|
}
|
|||
|
parts := strings.Split(line, "\t")
|
|||
|
if len(parts) != 3 {
|
|||
|
log.Fatalln("len(parts) != 3", line)
|
|||
|
}
|
|||
|
text := parts[0]
|
|||
|
if strings.Contains(text, ch) && utf8.RuneCountInString(text) >= 3 && !set.Contains(text) {
|
|||
|
outFile.WriteString(line + "\n")
|
|||
|
} else {
|
|||
|
set.Add(text)
|
|||
|
lines = append(lines, line)
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
// 从 lines 重新写入 file
|
|||
|
file.Truncate(0)
|
|||
|
file.Seek(0, 0)
|
|||
|
for _, line := range lines {
|
|||
|
file.WriteString(line + "\n")
|
|||
|
}
|
|||
|
|
|||
|
fmt.Println("done")
|
|||
|
}
|