1
0
Fork 0
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

186 lines
7.8 KiB

import cheerio from 'cheerio'
import http from 'http'
import path from 'path'
const fs = require('fs');
/**
 * Creates an empty province record.
 *
 * Declared as a regular function (not an arrow function) so that it can be
 * invoked both as `Province()` and as `new Province()`; an arrow function
 * throws a TypeError when used with `new`. Because an object is returned
 * explicitly, both call forms yield the same plain object.
 *
 * @returns {{code: string, name: string, children: Array}} A fresh record
 *   with empty `code`/`name` and its own empty `children` array.
 */
function Province() {
  return {
    code: '',
    name: '',
    children: [],
  };
}
/**
 * Creates an empty area record (the leaf level of the tree, so it has no
 * `children`).
 *
 * Declared as a regular function (not an arrow function) so that it can be
 * invoked both as `Area()` and as `new Area()`; an arrow function throws a
 * TypeError when used with `new`.
 *
 * @returns {{code: string, name: string}} A fresh record with empty fields.
 */
function Area() {
  return {
    code: '',
    name: '',
  };
}
/**
 * Creates an empty city record.
 *
 * Declared as a regular function (not an arrow function) so that it can be
 * invoked both as `City()` and as `new City()`; an arrow function throws a
 * TypeError when used with `new`. Because an object is returned explicitly,
 * both call forms yield the same plain object.
 *
 * @returns {{code: string, name: string, children: Array}} A fresh record
 *   with empty `code`/`name` and its own empty `children` array.
 */
function City() {
  return {
    code: '',
    name: '',
    children: [],
  };
}
/**
 * Downloads an HTML page listing administrative divisions, turns the
 * code/name cells into a province -> city -> area tree and writes the tree
 * to `provinceList.json` next to this file.
 *
 * The page is expected to expose its data as consecutive (code, name) cell
 * pairs: province and city pairs carry `headerClass`, area pairs carry
 * `cityClass`.
 */
class GetMcaGovData {
  sourceUrl = '';
  headerClass = '';
  cityClass = '';

  /**
   * @param {string} sourceUrl - URL of the page to download (requested with the `http` module).
   * @param {string} headerClass - CSS class of the cells holding province/city codes and names.
   * @param {string} cityClass - CSS class of the cells holding area codes and names.
   */
  constructor(sourceUrl, headerClass, cityClass) {
    this.sourceUrl = sourceUrl;
    this.headerClass = headerClass;
    this.cityClass = cityClass;
  }

  /**
   * Returns the trimmed, non-empty text of every element carrying `className`,
   * in document order.
   *
   * @param {Function} $ - Loaded cheerio instance.
   * @param {string} className - CSS class name without the leading dot.
   * @returns {string[]} Cell texts; blank cells are dropped.
   */
  static collectTexts($, className) {
    return $('.' + className)
      .toArray()
      .map((element) => $(element).text().trim())
      .filter((text) => text !== '');
  }

  /**
   * Builds the province list (with cities attached) from (code, name) pairs.
   *
   * A code ending in '0000' starts a new province; any other code is a city
   * and is attached to the first already-seen province sharing its first two
   * characters. Pairs whose first text contains no digit are ignored, and a
   * city without a matching province is dropped.
   *
   * @param {string[]} headerTexts - Flat list: code, name, code, name, ...
   * @returns {Array} Provinces in the order they appear.
   */
  static buildProvinceList(headerTexts) {
    const listProvince = [];
    // `i < length` (not `<=`): the original loop ran one extra, empty iteration.
    for (let i = 0; i < headerTexts.length; i += 2) {
      const code = headerTexts[i];
      const name = headerTexts[i + 1] ?? '';
      if (!/\d/.test(code)) {
        continue;
      }
      if (code.endsWith('0000')) {
        // Factories are called without `new`, which works for both arrow and regular functions.
        const province = Province();
        province.name = name;
        province.code = code;
        province.children = province.children || [];
        listProvince.push(province);
        continue;
      }
      const city = City();
      city.name = name;
      city.code = code;
      city.children = city.children || [];
      const provincePrefix = code.substring(0, 2);
      const owner = listProvince.find((item) => item.code.startsWith(provincePrefix));
      if (owner) {
        owner.children.push(city);
      }
    }
    return listProvince;
  }

  /**
   * Distributes the area (code, name) pairs over the cities of each province.
   *
   * The area list is consumed front to back: each province, in order, takes
   * the leading run of areas whose code starts with the province's two-char
   * prefix. Within a province an area goes to the city whose code starts with
   * the area's first four characters and ends in '00'. When no such city
   * exists it goes to the province's first city; when the province has no
   * city at all, a placeholder city copying the province's name and code is
   * created to hold its areas.
   *
   * Pairs whose first text contains no digit are skipped; the original loop
   * never consumed them and therefore spun forever.
   *
   * @param {Array} listProvince - Provinces as built by `buildProvinceList`; mutated in place.
   * @param {string[]} areaTexts - Flat list: code, name, code, name, ...
   * @returns {number} How many placeholder cities were created.
   */
  static attachAreas(listProvince, areaTexts) {
    let cursor = 0;
    let placeholderCityCount = 0;
    for (const province of listProvince) {
      const provincePrefix = province.code.substring(0, 2);
      const cityList = province.children;
      while (cursor < areaTexts.length) {
        const code = areaTexts[cursor];
        const name = areaTexts[cursor + 1] ?? '';
        if (!/\d/.test(code)) {
          cursor += 2;
          continue;
        }
        if (!code.startsWith(provincePrefix)) {
          // The next area belongs to a later province.
          break;
        }
        const area = Area();
        area.code = code;
        area.name = name;
        const cityPrefix = code.substring(0, 4);
        const currentCity = cityList.find(
          (cityItem) => cityItem.code.startsWith(cityPrefix) && cityItem.code.endsWith('00'),
        );
        if (currentCity) {
          currentCity.children.push(area);
        } else if (cityList.length === 0) {
          const city = City();
          city.name = province.name;
          city.code = province.code;
          city.children.push(area);
          cityList.push(city);
          placeholderCityCount += 1;
        } else {
          cityList[0].children.push(area);
        }
        cursor += 2;
      }
    }
    return placeholderCityCount;
  }

  /**
   * Counts every province, city and area node in the tree.
   *
   * @param {Array} listProvince - Province tree.
   * @returns {number} Total number of nodes.
   */
  static countNodes(listProvince) {
    let count = 0;
    for (const province of listProvince) {
      count += 1;
      for (const city of province.children) {
        count += 1 + (city.children ? city.children.length : 0);
      }
    }
    return count;
  }

  /**
   * Parses the downloaded page and writes `provinceList.json` when the number
   * of parsed nodes matches the number of (code, name) pairs found in the page.
   *
   * @param {string} html - Full HTML of the source page.
   * @throws {Error} When nothing was parsed or the counts differ.
   */
  parseAndExport(html) {
    const $ = cheerio.load(html);
    const areaTexts = GetMcaGovData.collectTexts($, this.cityClass);
    const headerTexts = GetMcaGovData.collectTexts($, this.headerClass);
    console.log('省市总计数量:' + headerTexts.length / 2);
    console.log('区总计数量:' + areaTexts.length / 2);
    const total = (areaTexts.length + headerTexts.length) / 2;
    console.log('省市区总计数量:' + total);

    const listProvince = GetMcaGovData.buildProvinceList(headerTexts);
    const placeholderCityCount = GetMcaGovData.attachAreas(listProvince, areaTexts);
    // Placeholder cities do not exist in the page, so they are excluded from
    // the comparison (the original subtracted a hard-coded 4).
    const parseTotal = GetMcaGovData.countNodes(listProvince) - placeholderCityCount;
    console.log('解析完成总计数量:' + parseTotal, total);
    console.log('解析数量是否相等:' + (parseTotal === total ? '相等' : '不相等'));

    if (listProvince.length === 0) {
      // An empty page would otherwise compare 0 === 0 and export an empty list.
      throw new Error('no province data parsed, check the url and the css classes !');
    }
    if (parseTotal !== total) {
      throw new Error('解析前后数量不相等,解析失败!');
    }
    fs.writeFile(path.join(__dirname, 'provinceList.json'), JSON.stringify(listProvince), (err) => {
      if (err) {
        console.error('failed to write provinceList.json', err);
        return;
      }
      console.log('导出成功');
    });
  }

  /**
   * Starts the download; parsing and export happen once the response ends.
   *
   * Configuration problems and synchronous `http.get` failures are thrown to
   * the caller. Failures that occur later (bad status, request error, parse
   * mismatch) are thrown from inside the event callbacks and therefore surface
   * as uncaught exceptions, as the count-mismatch error always did.
   *
   * @returns {void}
   * @throws {Error} When the url or a class name is missing, or `http.get` throws.
   */
  loadData = () => {
    if (!this.sourceUrl) {
      throw new Error('not set the url of parser !');
    }
    if (!this.headerClass || !this.cityClass) {
      throw new Error('not set the city or header class of header !');
    }
    let request;
    try {
      request = http.get(this.sourceUrl, (res) => {
        const { statusCode } = res;
        if (statusCode < 200 || statusCode >= 300) {
          // Drain the body so the socket is released before failing.
          res.resume();
          throw new Error(`request failed with status ${statusCode} !`);
        }
        res.setEncoding('utf8');
        let html = '';
        res.on('data', (chunk) => {
          html += chunk;
        });
        res.on('end', () => {
          this.parseAndExport(html);
        });
      });
    } catch (e) {
      // Keep the original failure reachable through `cause`.
      throw new Error('parse with error !', { cause: e });
    }
    request.on('error', (err) => {
      throw new Error('request with error !', { cause: err });
    });
  };
}
// headerClass and cityClass are CSS class names; look them up in the markup
// of the official source page.
const data = new GetMcaGovData('http://www.mca.gov.cn/article/sj/xzqh/2020/2020/2020112010001.html', 'xl7014987', 'xl7114987');
data.loadData();