parent
9e26ef4202
commit
45a9729855
@ -1,3 +1,13 @@
|
|||||||
# marseille-perils-webcrawler
|
# Environnement virtuel pour python
|
||||||
|
python3 -m venv scrapy-env
|
||||||
|
(unix-like) source scrapy-env/bin/activate
|
||||||
|
(windows) tutorial-env\Scripts\activate.bat
|
||||||
|
|
||||||
Un webcrawler basé sur Scrapy pour extraire les arrêtés de péril du site de la ville de Marseille
|
# Installation des dépendances
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
python -m pip install regex
|
||||||
|
python -m pip install scrapy
|
||||||
|
|
||||||
|
# Lancement de scrapy
|
||||||
|
cd src
|
||||||
|
scrapy crawl perils -O perils.csv
|
||||||
|
|||||||
Binary file not shown.
@ -0,0 +1,247 @@
|
|||||||
|
<#
|
||||||
|
.Synopsis
|
||||||
|
Activate a Python virtual environment for the current PowerShell session.
|
||||||
|
|
||||||
|
.Description
|
||||||
|
Pushes the python executable for a virtual environment to the front of the
|
||||||
|
$Env:PATH environment variable and sets the prompt to signify that you are
|
||||||
|
in a Python virtual environment. Makes use of the command line switches as
|
||||||
|
well as the `pyvenv.cfg` file values present in the virtual environment.
|
||||||
|
|
||||||
|
.Parameter VenvDir
|
||||||
|
Path to the directory that contains the virtual environment to activate. The
|
||||||
|
default value for this is the parent of the directory that the Activate.ps1
|
||||||
|
script is located within.
|
||||||
|
|
||||||
|
.Parameter Prompt
|
||||||
|
The prompt prefix to display when this virtual environment is activated. By
|
||||||
|
default, this prompt is the name of the virtual environment folder (VenvDir)
|
||||||
|
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
|
||||||
|
|
||||||
|
.Example
|
||||||
|
Activate.ps1
|
||||||
|
Activates the Python virtual environment that contains the Activate.ps1 script.
|
||||||
|
|
||||||
|
.Example
|
||||||
|
Activate.ps1 -Verbose
|
||||||
|
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||||
|
and shows extra information about the activation as it executes.
|
||||||
|
|
||||||
|
.Example
|
||||||
|
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
|
||||||
|
Activates the Python virtual environment located in the specified location.
|
||||||
|
|
||||||
|
.Example
|
||||||
|
Activate.ps1 -Prompt "MyPython"
|
||||||
|
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||||
|
and prefixes the current prompt with the specified string (surrounded in
|
||||||
|
parentheses) while the virtual environment is active.
|
||||||
|
|
||||||
|
.Notes
|
||||||
|
On Windows, it may be required to enable this Activate.ps1 script by setting the
|
||||||
|
execution policy for the user. You can do this by issuing the following PowerShell
|
||||||
|
command:
|
||||||
|
|
||||||
|
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
||||||
|
|
||||||
|
For more information on Execution Policies:
|
||||||
|
https://go.microsoft.com/fwlink/?LinkID=135170
|
||||||
|
|
||||||
|
#>
|
||||||
|
Param(
|
||||||
|
[Parameter(Mandatory = $false)]
|
||||||
|
[String]
|
||||||
|
$VenvDir,
|
||||||
|
[Parameter(Mandatory = $false)]
|
||||||
|
[String]
|
||||||
|
$Prompt
|
||||||
|
)
|
||||||
|
|
||||||
|
<# Function declarations --------------------------------------------------- #>
|
||||||
|
|
||||||
|
<#
|
||||||
|
.Synopsis
|
||||||
|
Remove all shell session elements added by the Activate script, including the
|
||||||
|
addition of the virtual environment's Python executable from the beginning of
|
||||||
|
the PATH variable.
|
||||||
|
|
||||||
|
.Parameter NonDestructive
|
||||||
|
If present, do not remove this function from the global namespace for the
|
||||||
|
session.
|
||||||
|
|
||||||
|
#>
|
||||||
|
function global:deactivate ([switch]$NonDestructive) {
|
||||||
|
# Revert to original values
|
||||||
|
|
||||||
|
# The prior prompt:
|
||||||
|
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
|
||||||
|
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
|
||||||
|
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
|
||||||
|
}
|
||||||
|
|
||||||
|
# The prior PYTHONHOME:
|
||||||
|
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
|
||||||
|
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
|
||||||
|
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
|
||||||
|
}
|
||||||
|
|
||||||
|
# The prior PATH:
|
||||||
|
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
|
||||||
|
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
|
||||||
|
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
|
||||||
|
}
|
||||||
|
|
||||||
|
# Just remove the VIRTUAL_ENV altogether:
|
||||||
|
if (Test-Path -Path Env:VIRTUAL_ENV) {
|
||||||
|
Remove-Item -Path env:VIRTUAL_ENV
|
||||||
|
}
|
||||||
|
|
||||||
|
# Just remove VIRTUAL_ENV_PROMPT altogether.
|
||||||
|
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
|
||||||
|
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
|
||||||
|
}
|
||||||
|
|
||||||
|
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
|
||||||
|
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
|
||||||
|
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
|
||||||
|
}
|
||||||
|
|
||||||
|
# Leave deactivate function in the global namespace if requested:
|
||||||
|
if (-not $NonDestructive) {
|
||||||
|
Remove-Item -Path function:deactivate
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
<#
|
||||||
|
.Description
|
||||||
|
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
|
||||||
|
given folder, and returns them in a map.
|
||||||
|
|
||||||
|
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
|
||||||
|
two strings separated by `=` (with any amount of whitespace surrounding the =)
|
||||||
|
then it is considered a `key = value` line. The left hand string is the key,
|
||||||
|
the right hand is the value.
|
||||||
|
|
||||||
|
If the value starts with a `'` or a `"` then the first and last character is
|
||||||
|
stripped from the value before being captured.
|
||||||
|
|
||||||
|
.Parameter ConfigDir
|
||||||
|
Path to the directory that contains the `pyvenv.cfg` file.
|
||||||
|
#>
|
||||||
|
function Get-PyVenvConfig(
|
||||||
|
[String]
|
||||||
|
$ConfigDir
|
||||||
|
) {
|
||||||
|
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
|
||||||
|
|
||||||
|
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
|
||||||
|
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
|
||||||
|
|
||||||
|
# An empty map will be returned if no config file is found.
|
||||||
|
$pyvenvConfig = @{ }
|
||||||
|
|
||||||
|
if ($pyvenvConfigPath) {
|
||||||
|
|
||||||
|
Write-Verbose "File exists, parse `key = value` lines"
|
||||||
|
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
|
||||||
|
|
||||||
|
$pyvenvConfigContent | ForEach-Object {
|
||||||
|
$keyval = $PSItem -split "\s*=\s*", 2
|
||||||
|
if ($keyval[0] -and $keyval[1]) {
|
||||||
|
$val = $keyval[1]
|
||||||
|
|
||||||
|
# Remove extraneous quotations around a string value.
|
||||||
|
if ("'""".Contains($val.Substring(0, 1))) {
|
||||||
|
$val = $val.Substring(1, $val.Length - 2)
|
||||||
|
}
|
||||||
|
|
||||||
|
$pyvenvConfig[$keyval[0]] = $val
|
||||||
|
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return $pyvenvConfig
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
<# Begin Activate script --------------------------------------------------- #>
|
||||||
|
|
||||||
|
# Determine the containing directory of this script
|
||||||
|
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||||
|
$VenvExecDir = Get-Item -Path $VenvExecPath
|
||||||
|
|
||||||
|
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
|
||||||
|
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
|
||||||
|
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
|
||||||
|
|
||||||
|
# Set values required in priority: CmdLine, ConfigFile, Default
|
||||||
|
# First, get the location of the virtual environment, it might not be
|
||||||
|
# VenvExecDir if specified on the command line.
|
||||||
|
if ($VenvDir) {
|
||||||
|
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
|
||||||
|
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
|
||||||
|
Write-Verbose "VenvDir=$VenvDir"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Next, read the `pyvenv.cfg` file to determine any required value such
|
||||||
|
# as `prompt`.
|
||||||
|
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
|
||||||
|
|
||||||
|
# Next, set the prompt from the command line, or the config file, or
|
||||||
|
# just use the name of the virtual environment folder.
|
||||||
|
if ($Prompt) {
|
||||||
|
Write-Verbose "Prompt specified as argument, using '$Prompt'"
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
|
||||||
|
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
|
||||||
|
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
|
||||||
|
$Prompt = $pyvenvCfg['prompt'];
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
|
||||||
|
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
|
||||||
|
$Prompt = Split-Path -Path $venvDir -Leaf
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Write-Verbose "Prompt = '$Prompt'"
|
||||||
|
Write-Verbose "VenvDir='$VenvDir'"
|
||||||
|
|
||||||
|
# Deactivate any currently active virtual environment, but leave the
|
||||||
|
# deactivate function in place.
|
||||||
|
deactivate -nondestructive
|
||||||
|
|
||||||
|
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
|
||||||
|
# that there is an activated venv.
|
||||||
|
$env:VIRTUAL_ENV = $VenvDir
|
||||||
|
|
||||||
|
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
|
||||||
|
|
||||||
|
Write-Verbose "Setting prompt to '$Prompt'"
|
||||||
|
|
||||||
|
# Set the prompt to include the env name
|
||||||
|
# Make sure _OLD_VIRTUAL_PROMPT is global
|
||||||
|
function global:_OLD_VIRTUAL_PROMPT { "" }
|
||||||
|
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
|
||||||
|
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
|
||||||
|
|
||||||
|
function global:prompt {
|
||||||
|
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
|
||||||
|
_OLD_VIRTUAL_PROMPT
|
||||||
|
}
|
||||||
|
$env:VIRTUAL_ENV_PROMPT = $Prompt
|
||||||
|
}
|
||||||
|
|
||||||
|
# Clear PYTHONHOME
|
||||||
|
if (Test-Path -Path Env:PYTHONHOME) {
|
||||||
|
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
|
||||||
|
Remove-Item -Path Env:PYTHONHOME
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add the venv to the PATH
|
||||||
|
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
|
||||||
|
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
|
||||||
@ -0,0 +1,69 @@
|
|||||||
|
# This file must be used with "source bin/activate" *from bash*
|
||||||
|
# you cannot run it directly
|
||||||
|
|
||||||
|
deactivate () {
|
||||||
|
# reset old environment variables
|
||||||
|
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
|
||||||
|
PATH="${_OLD_VIRTUAL_PATH:-}"
|
||||||
|
export PATH
|
||||||
|
unset _OLD_VIRTUAL_PATH
|
||||||
|
fi
|
||||||
|
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
|
||||||
|
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
|
||||||
|
export PYTHONHOME
|
||||||
|
unset _OLD_VIRTUAL_PYTHONHOME
|
||||||
|
fi
|
||||||
|
|
||||||
|
# This should detect bash and zsh, which have a hash command that must
|
||||||
|
# be called to get it to forget past commands. Without forgetting
|
||||||
|
# past commands the $PATH changes we made may not be respected
|
||||||
|
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
|
||||||
|
hash -r 2> /dev/null
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
|
||||||
|
PS1="${_OLD_VIRTUAL_PS1:-}"
|
||||||
|
export PS1
|
||||||
|
unset _OLD_VIRTUAL_PS1
|
||||||
|
fi
|
||||||
|
|
||||||
|
unset VIRTUAL_ENV
|
||||||
|
unset VIRTUAL_ENV_PROMPT
|
||||||
|
if [ ! "${1:-}" = "nondestructive" ] ; then
|
||||||
|
# Self destruct!
|
||||||
|
unset -f deactivate
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# unset irrelevant variables
|
||||||
|
deactivate nondestructive
|
||||||
|
|
||||||
|
VIRTUAL_ENV="/home/p/Documents/Mapping/scraping/perils/scrapy-env"
|
||||||
|
export VIRTUAL_ENV
|
||||||
|
|
||||||
|
_OLD_VIRTUAL_PATH="$PATH"
|
||||||
|
PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||||
|
export PATH
|
||||||
|
|
||||||
|
# unset PYTHONHOME if set
|
||||||
|
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
|
||||||
|
# could use `if (set -u; : $PYTHONHOME) ;` in bash
|
||||||
|
if [ -n "${PYTHONHOME:-}" ] ; then
|
||||||
|
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
|
||||||
|
unset PYTHONHOME
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
|
||||||
|
_OLD_VIRTUAL_PS1="${PS1:-}"
|
||||||
|
PS1="(scrapy-env) ${PS1:-}"
|
||||||
|
export PS1
|
||||||
|
VIRTUAL_ENV_PROMPT="(scrapy-env) "
|
||||||
|
export VIRTUAL_ENV_PROMPT
|
||||||
|
fi
|
||||||
|
|
||||||
|
# This should detect bash and zsh, which have a hash command that must
|
||||||
|
# be called to get it to forget past commands. Without forgetting
|
||||||
|
# past commands the $PATH changes we made may not be respected
|
||||||
|
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
|
||||||
|
hash -r 2> /dev/null
|
||||||
|
fi
|
||||||
@ -0,0 +1,26 @@
|
|||||||
|
# This file must be used with "source bin/activate.csh" *from csh*.
|
||||||
|
# You cannot run it directly.
|
||||||
|
# Created by Davide Di Blasi <davidedb@gmail.com>.
|
||||||
|
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
|
||||||
|
|
||||||
|
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
|
||||||
|
|
||||||
|
# Unset irrelevant variables.
|
||||||
|
deactivate nondestructive
|
||||||
|
|
||||||
|
setenv VIRTUAL_ENV "/home/p/Documents/Mapping/scraping/perils/scrapy-env"
|
||||||
|
|
||||||
|
set _OLD_VIRTUAL_PATH="$PATH"
|
||||||
|
setenv PATH "$VIRTUAL_ENV/bin:$PATH"
|
||||||
|
|
||||||
|
|
||||||
|
set _OLD_VIRTUAL_PROMPT="$prompt"
|
||||||
|
|
||||||
|
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
|
||||||
|
set prompt = "(scrapy-env) $prompt"
|
||||||
|
setenv VIRTUAL_ENV_PROMPT "(scrapy-env) "
|
||||||
|
endif
|
||||||
|
|
||||||
|
alias pydoc python -m pydoc
|
||||||
|
|
||||||
|
rehash
|
||||||
@ -0,0 +1,66 @@
|
|||||||
|
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
|
||||||
|
# (https://fishshell.com/); you cannot run it directly.
|
||||||
|
|
||||||
|
function deactivate -d "Exit virtual environment and return to normal shell environment"
|
||||||
|
# reset old environment variables
|
||||||
|
if test -n "$_OLD_VIRTUAL_PATH"
|
||||||
|
set -gx PATH $_OLD_VIRTUAL_PATH
|
||||||
|
set -e _OLD_VIRTUAL_PATH
|
||||||
|
end
|
||||||
|
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
|
||||||
|
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
|
||||||
|
set -e _OLD_VIRTUAL_PYTHONHOME
|
||||||
|
end
|
||||||
|
|
||||||
|
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
|
||||||
|
functions -e fish_prompt
|
||||||
|
set -e _OLD_FISH_PROMPT_OVERRIDE
|
||||||
|
functions -c _old_fish_prompt fish_prompt
|
||||||
|
functions -e _old_fish_prompt
|
||||||
|
end
|
||||||
|
|
||||||
|
set -e VIRTUAL_ENV
|
||||||
|
set -e VIRTUAL_ENV_PROMPT
|
||||||
|
if test "$argv[1]" != "nondestructive"
|
||||||
|
# Self-destruct!
|
||||||
|
functions -e deactivate
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Unset irrelevant variables.
|
||||||
|
deactivate nondestructive
|
||||||
|
|
||||||
|
set -gx VIRTUAL_ENV "/home/p/Documents/Mapping/scraping/perils/scrapy-env"
|
||||||
|
|
||||||
|
set -gx _OLD_VIRTUAL_PATH $PATH
|
||||||
|
set -gx PATH "$VIRTUAL_ENV/bin" $PATH
|
||||||
|
|
||||||
|
# Unset PYTHONHOME if set.
|
||||||
|
if set -q PYTHONHOME
|
||||||
|
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
|
||||||
|
set -e PYTHONHOME
|
||||||
|
end
|
||||||
|
|
||||||
|
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
|
||||||
|
# fish uses a function instead of an env var to generate the prompt.
|
||||||
|
|
||||||
|
# Save the current fish_prompt function as the function _old_fish_prompt.
|
||||||
|
functions -c fish_prompt _old_fish_prompt
|
||||||
|
|
||||||
|
# With the original prompt function renamed, we can override with our own.
|
||||||
|
function fish_prompt
|
||||||
|
# Save the return status of the last command.
|
||||||
|
set -l old_status $status
|
||||||
|
|
||||||
|
# Output the venv prompt; color taken from the blue of the Python logo.
|
||||||
|
printf "%s%s%s" (set_color 4B8BBE) "(scrapy-env) " (set_color normal)
|
||||||
|
|
||||||
|
# Restore the return status of the previous command.
|
||||||
|
echo "exit $old_status" | .
|
||||||
|
# Output the original/"old" prompt.
|
||||||
|
_old_fish_prompt
|
||||||
|
end
|
||||||
|
|
||||||
|
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
|
||||||
|
set -gx VIRTUAL_ENV_PROMPT "(scrapy-env) "
|
||||||
|
end
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from automat._visualize import tool
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(tool())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.conch.scripts.cftp import run
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(run())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.conch.scripts.ckeygen import run
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(run())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.conch.scripts.conch import run
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(run())
|
||||||
@ -0,0 +1,54 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from pprint import pformat
|
||||||
|
|
||||||
|
import jmespath
|
||||||
|
from jmespath import exceptions
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument('expression')
|
||||||
|
parser.add_argument('-f', '--filename',
|
||||||
|
help=('The filename containing the input data. '
|
||||||
|
'If a filename is not given then data is '
|
||||||
|
'read from stdin.'))
|
||||||
|
parser.add_argument('--ast', action='store_true',
|
||||||
|
help=('Pretty print the AST, do not search the data.'))
|
||||||
|
args = parser.parse_args()
|
||||||
|
expression = args.expression
|
||||||
|
if args.ast:
|
||||||
|
# Only print the AST
|
||||||
|
expression = jmespath.compile(args.expression)
|
||||||
|
sys.stdout.write(pformat(expression.parsed))
|
||||||
|
sys.stdout.write('\n')
|
||||||
|
return 0
|
||||||
|
if args.filename:
|
||||||
|
with open(args.filename, 'r') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
else:
|
||||||
|
data = sys.stdin.read()
|
||||||
|
data = json.loads(data)
|
||||||
|
try:
|
||||||
|
sys.stdout.write(json.dumps(
|
||||||
|
jmespath.search(expression, data), indent=4, ensure_ascii=False))
|
||||||
|
sys.stdout.write('\n')
|
||||||
|
except exceptions.ArityError as e:
|
||||||
|
sys.stderr.write("invalid-arity: %s\n" % e)
|
||||||
|
return 1
|
||||||
|
except exceptions.JMESPathTypeError as e:
|
||||||
|
sys.stderr.write("invalid-type: %s\n" % e)
|
||||||
|
return 1
|
||||||
|
except exceptions.UnknownFunctionError as e:
|
||||||
|
sys.stderr.write("unknown-function: %s\n" % e)
|
||||||
|
return 1
|
||||||
|
except exceptions.ParseError as e:
|
||||||
|
sys.stderr.write("syntax-error: %s\n" % e)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.mail.scripts.mailmail import run
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(run())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from charset_normalizer.cli.normalizer import cli_detect
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(cli_detect())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pip._internal.cli.main import main
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(main())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pip._internal.cli.main import main
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(main())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pip._internal.cli.main import main
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(main())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.scripts.htmlizer import run
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(run())
|
||||||
@ -0,0 +1 @@
|
|||||||
|
python3
|
||||||
@ -0,0 +1 @@
|
|||||||
|
/usr/bin/python3
|
||||||
@ -0,0 +1 @@
|
|||||||
|
python3
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from scrapy.cmdline import execute
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(execute())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.conch.scripts.tkconch import run
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(run())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from tldextract.cli import main
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(main())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.scripts.trial import run
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(run())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.application.twist._twist import Twist
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(Twist.main())
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from twisted.scripts.twistd import run
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||||
|
sys.exit(run())
|
||||||
@ -0,0 +1 @@
|
|||||||
|
lib
|
||||||
@ -0,0 +1,3 @@
|
|||||||
|
home = /usr/bin
|
||||||
|
include-system-site-packages = false
|
||||||
|
version = 3.10.8
|
||||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,33 @@
|
|||||||
|
import scrapy
|
||||||
|
# import regex
|
||||||
|
# from scrapy.loader import ItemLoader
|
||||||
|
# from itemloaders.processors import TakeFirst, MapCompose
|
||||||
|
# from w3lib.html import remove_tags
|
||||||
|
|
||||||
|
# # Une fonction qui sépare les adresses en fonction des noms de rue
|
||||||
|
# def splitStreet(value):
|
||||||
|
# # Le regex qui évalue là où il faut séparer la chaîne de caractère
|
||||||
|
# # (présence de "et / - + ainsi que"
|
||||||
|
# # sans que ce séparateur ne soit placé près d'un groupe de numéros de la même rue)
|
||||||
|
# expr = r"((?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\/|((?<!(\d))\s\bet)|(ainsi\sque)|((?<!(\d))\s-\s(?!(bt)|(bis)|(ter)))|(?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\+)"
|
||||||
|
# # Remplacement de chaque séparateur par un pipe ("|")
|
||||||
|
# subst = "|"
|
||||||
|
# repl = regex.sub(expr, subst, value, 0, regex.MULTILINE | regex.IGNORECASE)
|
||||||
|
# # Formattage (supression des espaces insécables et des ":")
|
||||||
|
# filtered = regex.sub('\\xa0|:', '', repl, 0, regex.MULTILINE | regex.IGNORECASE)
|
||||||
|
# # Séparation en liste
|
||||||
|
# splitted = regex.split('\|', filtered, regex.MULTILINE | regex.IGNORECASE)
|
||||||
|
# # Suppression des espaces en début et fin de chaîne
|
||||||
|
# stripped = [x.strip() for x in splitted]
|
||||||
|
# # Suppression des "None" de la liste (failsafe, ne devrait pas être nécessaire)
|
||||||
|
# result = list(filter(None, stripped))
|
||||||
|
# return result
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class PerilsItem(scrapy.Item):
|
||||||
|
adrs = scrapy.Field()
|
||||||
|
dernierA = scrapy.Field()
|
||||||
|
As = scrapy.Field()
|
||||||
|
raw = scrapy.Field()
|
||||||
|
pass
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
# Define your item pipelines here
|
||||||
|
#
|
||||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||||
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
|
||||||
|
|
||||||
|
# useful for handling different item types with a single interface
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
|
class PerilsPipeline:
|
||||||
|
def process_item(self, item, spider):
|
||||||
|
return item
|
||||||
@ -0,0 +1,93 @@
|
|||||||
|
# Scrapy settings for perils project
|
||||||
|
#
|
||||||
|
# For simplicity, this file contains only settings considered important or
|
||||||
|
# commonly used. You can find more settings consulting the documentation:
|
||||||
|
#
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
|
BOT_NAME = 'perils'
|
||||||
|
|
||||||
|
SPIDER_MODULES = ['perils.spiders']
|
||||||
|
NEWSPIDER_MODULE = 'perils.spiders'
|
||||||
|
|
||||||
|
|
||||||
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||||
|
#USER_AGENT = 'perils (+http://www.yourdomain.com)'
|
||||||
|
|
||||||
|
# Obey robots.txt rules
|
||||||
|
ROBOTSTXT_OBEY = True
|
||||||
|
|
||||||
|
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||||
|
#CONCURRENT_REQUESTS = 32
|
||||||
|
|
||||||
|
# Configure a delay for requests for the same website (default: 0)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||||
|
# See also autothrottle settings and docs
|
||||||
|
#DOWNLOAD_DELAY = 3
|
||||||
|
# The download delay setting will honor only one of:
|
||||||
|
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||||
|
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||||
|
|
||||||
|
# Disable cookies (enabled by default)
|
||||||
|
#COOKIES_ENABLED = False
|
||||||
|
|
||||||
|
# Disable Telnet Console (enabled by default)
|
||||||
|
#TELNETCONSOLE_ENABLED = False
|
||||||
|
|
||||||
|
# Override the default request headers:
|
||||||
|
#DEFAULT_REQUEST_HEADERS = {
|
||||||
|
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||||
|
# 'Accept-Language': 'en',
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Enable or disable spider middlewares
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
SPIDER_MIDDLEWARES = {
|
||||||
|
# 'perils.middlewares.PerilsSpiderMiddleware': 543,
|
||||||
|
'perils.splittermidware.SplitAndSort': 543,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Enable or disable downloader middlewares
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
|
#DOWNLOADER_MIDDLEWARES = {
|
||||||
|
# 'perils.middlewares.PerilsDownloaderMiddleware': 543,
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Enable or disable extensions
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||||
|
#EXTENSIONS = {
|
||||||
|
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Configure item pipelines
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
#ITEM_PIPELINES = {
|
||||||
|
# 'perils.pipelines.PerilsPipeline': 300,
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||||
|
#AUTOTHROTTLE_ENABLED = True
|
||||||
|
# The initial download delay
|
||||||
|
#AUTOTHROTTLE_START_DELAY = 5
|
||||||
|
# The maximum download delay to be set in case of high latencies
|
||||||
|
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||||
|
# The average number of requests Scrapy should be sending in parallel to
|
||||||
|
# each remote server
|
||||||
|
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||||
|
# Enable showing throttling stats for every response received:
|
||||||
|
#AUTOTHROTTLE_DEBUG = False
|
||||||
|
|
||||||
|
# Enable and configure HTTP caching (disabled by default)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||||
|
#HTTPCACHE_ENABLED = True
|
||||||
|
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||||
|
#HTTPCACHE_DIR = 'httpcache'
|
||||||
|
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||||
|
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||||
|
|
||||||
|
# Set settings whose default value is deprecated to a future-proof value
|
||||||
|
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
|
||||||
|
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
|
||||||
@ -0,0 +1,4 @@
|
|||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
||||||
@ -0,0 +1,30 @@
|
|||||||
|
import scrapy
|
||||||
|
from perils.items import PerilsItem
|
||||||
|
# from scrapy.loader import ItemLoader
|
||||||
|
|
||||||
|
class ScrapePerils(scrapy.Spider):
|
||||||
|
name = "perils"
|
||||||
|
start_urls = ["https://www.marseille.fr/logement-urbanisme/am%C3%A9lioration-de-lhabitat/arretes-de-peril"]
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
for adresses in response.xpath('//div[@class="card"]//li|//div[@class="card"]//li/p'):
|
||||||
|
item = PerilsItem()
|
||||||
|
# l = ItemLoader(item = PerilsItem(), selector=adresses)
|
||||||
|
|
||||||
|
# l.add_xpath('adrs', './text()')
|
||||||
|
# l.add_xpath('dernierA', './a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()')
|
||||||
|
# if adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get() is None:
|
||||||
|
# l.add_xpath('dernierA', './a[last()]/text()')
|
||||||
|
|
||||||
|
item['adrs'] = adresses.xpath('./text()').get(),
|
||||||
|
item['dernierA'] = adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get(),
|
||||||
|
if adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get() is None:
|
||||||
|
item['dernierA'] = adresses.xpath('./a[last()]/text()').get(),
|
||||||
|
item['As'] = adresses.xpath('./a/text()').getall()
|
||||||
|
item['raw'] = adresses.xpath('.').get()
|
||||||
|
|
||||||
|
yield item
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#response.xpath('//div[@class="card"]//li/text()[1]|//div[@class="card"]//li/p/text()[1]').getall()
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
class ScrapePerils(scrapy.Spider):
|
||||||
|
name = "perils"
|
||||||
|
start_urls = ["https://www.marseille.fr/logement-urbanisme/am%C3%A9lioration-de-lhabitat/arretes-de-peril"]
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
for adresses in response.xpath('//div[@class="card"]//li'):
|
||||||
|
yield {
|
||||||
|
'adresse': adresses.xpath('./text()').get(),
|
||||||
|
'dernier arrêté hors modificatif' : adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get(),
|
||||||
|
'dernier arrêté' : adresses.xpath('./a[last()]/text()').get(),
|
||||||
|
}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = perils.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
#url = http://localhost:6800/
|
||||||
|
project = perils
|
||||||
Loading…
Reference in new issue