用Go统计文件中单词的个数

=Start=

缘由：

每学习一门新语言，我就会拿刚学会的语法去实现一些小功能/程序，以此加深对语法的认识和熟悉程度。这其中比较好玩又实用的就包括统计文件中单词的个数，涉及到文件操作、字符串操作、去重操作等，是一个比较好的学习案例。

参考解答：

废话不多说，代码如下：

package main

import (
    "bufio"
    "fmt"
    "io"
    "log"
    "os"
    "path/filepath"
    "runtime"
    "sort"
    "strings"
    "unicode"
    "unicode/utf8"
)

// main counts the words in every file named on the command line and prints
// them grouped by how often they occur. With no arguments, or with -h/--help
// as the first argument, it prints a usage line and exits with status 1.
func main() {
    args := os.Args[1:]
    if len(args) == 0 || args[0] == "-h" || args[0] == "--help" {
        fmt.Printf("usage: %s <file1> [<file2> [... <fileN>]]\n", filepath.Base(os.Args[0]))
        os.Exit(1)
    }

    // A single map is shared by all files, so the counts are totals across them.
    frequencyForWord := make(map[string]int)
    for _, filename := range commandLineFiles(args) {
        updateFrequencies(filename, frequencyForWord)
    }

    // Call reportByWords(frequencyForWord) here instead for an alphabetical listing.
    reportByFrequency(invertStringIntMap(frequencyForWord))
}

// commandLineFiles returns the list of file names to process. On every
// platform except Windows the arguments are returned untouched. On Windows
// each argument is expanded as a glob pattern by the program itself: an
// argument that is not a valid pattern is kept verbatim, a valid pattern with
// no matches contributes nothing, and otherwise all matches are appended.
func commandLineFiles(files []string) []string {
    if runtime.GOOS != "windows" {
        return files
    }

    expanded := make([]string, 0, len(files))
    for _, pattern := range files {
        matches, err := filepath.Glob(pattern)
        switch {
        case err != nil:
            // Invalid pattern: pass it through so opening it reports the problem.
            expanded = append(expanded, pattern)
        case matches != nil:
            // At least one match.
            expanded = append(expanded, matches...)
        }
    }
    return expanded
}

// updateFrequencies reads filename line by line and adds one to
// frequencyForWord for every word found, keyed by the lower-cased word.
// Words consisting of a single rune are ignored. A failure to open or read
// the file is logged; whatever was counted before the failure is kept.
func updateFrequencies(filename string, frequencyForWord map[string]int) {
    file, err := os.Open(filename)
    if err != nil {
        log.Println("failed to open the file: ", err)
        return
    }
    defer file.Close()

    reader := bufio.NewReader(file)
    for {
        // ReadString may return data together with an error (a final line
        // without a trailing newline), so the line is processed before the
        // error is examined.
        line, readErr := reader.ReadString('\n')
        for _, word := range SplitOnNonLetters(strings.TrimSpace(line)) {
            // More than utf8.UTFMax bytes cannot be a single rune, so the
            // rune count is only needed for short words.
            if len(word) <= utf8.UTFMax && utf8.RuneCountInString(word) <= 1 {
                continue
            }
            frequencyForWord[strings.ToLower(word)] += 1
        }
        if readErr == nil {
            continue
        }
        if readErr != io.EOF {
            log.Println("failed to finish reading the file: ", readErr)
        }
        return
    }
}

// SplitOnNonLetters splits s at every run of non-letter characters and
// returns the words (maximal runs of Unicode letters) that s contains.
func SplitOnNonLetters(s string) []string {
    return strings.FieldsFunc(s, func(r rune) bool {
        return !unicode.IsLetter(r)
    })
}

// reportByWords prints a two-column table to standard output: every word in
// frequencyForWord in alphabetical order, followed by its count. The word
// column is left-aligned and the count column is right-aligned. A header line
// with the titles "Word" and "Frequency" is printed first, with "Frequency"
// ending in the same column as the counts.
func reportByWords(frequencyForWord map[string]int) {
    const wordTitle, frequencyTitle = "Word", "Frequency"

    words := make([]string, 0, len(frequencyForWord))
    wordWidth, frequencyWidth := 0, 0
    for word, frequency := range frequencyForWord {
        words = append(words, word)
        // fmt measures width in runes, so the word width is counted in runes too.
        if width := utf8.RuneCountInString(word); width > wordWidth {
            wordWidth = width
        }
        if width := len(fmt.Sprint(frequency)); width > frequencyWidth {
            frequencyWidth = width
        }
    }
    sort.Strings(words)

    // A row is wordWidth + 1 + frequencyWidth columns wide. The header needs
    // both titles plus one separating space, so widen the word column when the
    // data alone is too narrow. Without this the padding width below could go
    // to zero or negative, and fmt treats a negative '*' width as "left-justify
    // with the absolute width", which made the header wider than the rows.
    if missing := len(wordTitle) + len(frequencyTitle) - wordWidth - frequencyWidth; missing > 0 {
        wordWidth += missing
    }

    // %*s takes two operands: the field width and the string to print.
    // Right-align the second title in what is left of the row after the first.
    fmt.Printf("%s%*s\n", wordTitle, wordWidth+1+frequencyWidth-len(wordTitle), frequencyTitle)
    for _, word := range words {
        fmt.Printf("%-*s %*d\n", wordWidth, word, frequencyWidth, frequencyForWord[word])
    }
}

// invertStringIntMap turns a word→count map into a count→words map. Several
// words can share the same count, so each word is appended to the slice for
// its count rather than overwriting an earlier one; the result type is
// therefore map[int][]string. The order of words within a slice is
// unspecified because it follows map iteration order.
func invertStringIntMap(stringIntMap map[string]int) map[int][]string {
    inverted := make(map[int][]string, len(stringIntMap))
    for word, count := range stringIntMap {
        bucket := inverted[count]
        inverted[count] = append(bucket, word)
    }
    return inverted
}

// reportByFrequency prints a header line followed by one line per distinct
// frequency in ascending order; each line shows the frequency and the
// comma-separated, alphabetically sorted words that occur that many times.
// The word slices stored in wordsForFrequency are sorted in place. An empty
// or nil map prints only the header.
func reportByFrequency(wordsForFrequency map[int][]string) {
    frequencies := make([]int, 0, len(wordsForFrequency)) // length 0, capacity len(wordsForFrequency)
    for frequency := range wordsForFrequency {
        frequencies = append(frequencies, frequency)
    }
    sort.Ints(frequencies)

    // After sorting, the last element is the largest and so has the widest
    // decimal form. Guard the lookup: with no words at all (for example when
    // no input file could be opened) the slice is empty and indexing
    // len-1 would panic.
    width := 0
    if n := len(frequencies); n > 0 {
        width = len(fmt.Sprint(frequencies[n-1]))
    }

    fmt.Println("Frequency → Words")
    for _, frequency := range frequencies {
        words := wordsForFrequency[frequency]
        sort.Strings(words)
        fmt.Printf("%*d %s\n", width, frequency, strings.Join(words, ", "))
    }
}

参考链接：

《Go语言程序设计》

=EOF=

21 11 月, 2015

admin

KnowledgeBase, Programing, Tools

Go, golang, map, tips

《 “用Go统计文件中单词的个数” 》有 11 条评论

a-z说道：

2017-03-16 20:11

Go 的三种不同 md5 计算方式性能比较
http://holys.im/2016/11/24/3-kind-of-md5-sum/
`
ioutil.ReadFile
io.Copy
io.Copy + bufio.Reader

以上这三种不同的 md5 计算方式在执行时间上都差不多，区别最大的是内存的分配上；
bufio 在处理 I/O 还是很有优势的，优先选择；
尽量避免 ReadAll 这种用法。
`

回复
a-z说道：

2017-06-03 10:32

探测局域网里面的设备
http://blog.cyeam.com/network/2015/03/16/fing
https://github.com/mnhkahn/go_code/blob/master/fing.go

Golang实现多线程并发下载
http://blog.cyeam.com/network/2015/07/02/goget
https://github.com/mnhkahn/go_code/blob/master/goget/goget.go

使用Go编写代码明信片生成器
http://cjting.me/golang/write-a-code-post-generator-with-go/
https://github.com/fate-lovely/codeposter

回复
- a-z说道：
  
  2017-10-30 13:46
  
  用Go开发可以内网活跃主机嗅探器 #1
  https://github.com/timest/goscan/issues/1
  `
  程序思路：
  　　通过内网IP和子网掩码计算出内网IP范围
  　　向内网广播ARP Request
  　　监听并抓取ARP Response包，记录IP和Mac地址
  　　向活跃IP发送MDNS和NBNS包，并监听和解析Hostname
  　　根据Mac地址计算出厂家信息
  `
  
  回复
a-z说道：

2017-06-30 23:54

[译]使用os/exec执行命令
http://colobu.com/2017/06/19/advanced-command-execution-in-Go-with-os-exec/
https://blog.kowalczyk.info/article/wOYk/advanced-command-execution-in-go-with-osexec.html
https://github.com/kjk/go-cookbook/blob/master/advanced-exec

udp编程的那些事与golang udp的实践
https://my.oschina.net/u/2374678/blog/983427

回复
hi说道：

2018-04-19 21:37

用go实现常用算法与数据结构——跳跃表(Skip list)
http://www.cnblogs.com/DilonWu/p/8857061.html
https://github.com/AceDarkknight/ConcurrentSkipList

回复
hi说道：

2018-07-26 19:41

golang bufio、ioutil读文件的速度比较（性能测试）和影响因素分析
https://segmentfault.com/a/1190000011680507
`
当每次读取块的大小小于4KB，建议使用bufio.NewReader(f), 大于4KB用bufio.NewReaderSize(f,缓存大小)
要读Reader, 图方便用ioutil.ReadAll()
一次性读取文件，使用ioutil.ReadFile()
不同业务场景，选用不同的读取方式
`

回复
hi说道：

2018-07-26 19:42

Golang学习 – bufio 包
http://www.cnblogs.com/golove/p/3282667.html
Golang学习 – io 包
http://www.cnblogs.com/golove/p/3276678.html

回复
hi说道：

2018-07-26 19:44

数据说话：Go语言的Switch和Map性能实测
https://segmentfault.com/a/1190000011361164
https://hashrocket.com/blog/posts/switch-vs-map-which-is-the-better-way-to-branch-in-go

回复
hi说道：

2018-08-09 15:58

Filebeat优化实践
https://wilhelmguo.tk/blog/post/william/Filebeat%E4%BC%98%E5%8C%96%E5%AE%9E%E8%B7%B5

记一次获得3倍性能的Go程序优化实践
https://mp.weixin.qq.com/s/0iYRCAq5-vXJfuk3ueIppA

回复
hi说道：

2019-05-27 21:27

深度解密Go语言之map
https://mp.weixin.qq.com/s/2CDpE5wfoiNXm1agMAq4wA
`
什么是 map
为什么要用 map
map 的底层如何实现
　　map 内存模型
　　创建 map
　　哈希函数
　　key 定位过程
　　map 的两种 get 操作
　　如何进行扩容
　　map 的遍历
　　map 的赋值
　　map 的删除
map 进阶
　　可以边遍历边删除吗？
　　key可以是float类型吗？
总结
参考资料
`

回复
hi说道：

2019-06-26 15:46

以Go的map是否并发安全为例，介绍最权威的Go语言资料的使用方法
https://www.lijiaocn.com/%E7%BC%96%E7%A8%8B/2019/06/11/golang-map-concurrent.html
`
找到正确的资料、能够正确的使用、正确的理解，是最关键的一步。除非是初学者，否则不要使用二手、三手和倒了无数手的资料，长期来看使用非一手资料，就是在浪费时间和引入错误。第一手的资料常常晦涩难懂，需要经过长时间的积累和沉淀，才能比较熟练的运用，刚开始的时候可以用二手、三手的资料帮助理解，但一定要逼迫自己在一手资料中找到解答，这个过程会极大地提升认知。

说明
正确使用正确的资料
最权威的 Go 语言资料是？
Go 语言的 map 是否是并发安全的？
扩大搜索范围
找到答案不等于结束
为什么要执着于一手资料？
参考
`

回复

ASPIRE

用Go统计文件中单词的个数

缘由：

参考解答：

参考链接：

《 “用Go统计文件中单词的个数” 》有 11 条评论

回复 a-z 取消回复

用Go统计文件中单词的个数

缘由：

参考解答：

参考链接：

《 “用Go统计文件中单词的个数” 》 有 11 条评论

回复 a-z 取消回复

《 “用Go统计文件中单词的个数” 》有 11 条评论